# Attempt at modelling using complaints data

#### We begin by cleaning up the data and processing it

In [69]:
# load packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import warnings
# warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [70]:
try:
  df = pd.read_csv('./NYPD_Complaint_Data_Current__Year_To_Date.csv')
  print("Data loaded successfully.")
except FileNotFoundError:
  print("Error: File not found. Please check the file path.")
except pd.errors.EmptyDataError:
  print("Error: The CSV file is empty.")
except pd.errors.ParserError:
  print("Error: There was an issue parsing the CSV file. Please ensure it's a valid CSV.")
except Exception as e:
  print(f"An unexpected error occurred: {e}")

df.head(5)

Data loaded successfully.


Unnamed: 0,CMPLNT_NUM,ADDR_PCT_CD,BORO_NM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,CRM_ATPT_CPTD_CD,HADEVELOPT,HOUSING_PSA,...,TRANSIT_DISTRICT,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon,New Georeferenced Column
0,289459075,47,BRONX,07/02/2024,19:20:00,,(null),COMPLETED,(null),,...,,25-44,BLACK,F,1026480.0,262584.0,40.887314,-73.847272,"(40.8873136344706, -73.8472717577564)",POINT (-73.8472717577564 40.8873136344706)
1,287853116H1,61,BROOKLYN,06/01/2024,02:40:00,,(null),COMPLETED,SHEEPSHEAD BAY,1757.0,...,,18-24,BLACK,M,,,,,,
2,283436472,123,STATEN ISLAND,03/08/2024,21:20:00,03/08/2024,21:21:00,COMPLETED,(null),,...,,25-44,WHITE HISPANIC,F,934640.0,143161.0,40.559481,-74.178539,"(40.559481, -74.178539)",POINT (-74.178539 40.559481)
3,284809655,120,STATEN ISLAND,04/04/2024,19:50:00,04/04/2024,20:02:00,COMPLETED,(null),,...,,UNKNOWN,UNKNOWN,D,949942.0,170296.0,40.634036,-74.123607,"(40.634036, -74.123607)",POINT (-74.123607 40.634036)
4,288019777,123,STATEN ISLAND,06/05/2024,15:00:00,06/05/2024,15:30:00,COMPLETED,(null),,...,,25-44,WHITE,F,924768.0,134938.0,40.536852,-74.213994,"(40.536852, -74.213994)",POINT (-74.213994 40.536852)


In [71]:
# df.describe().T
# df.JURISDICTION_CODE.value_counts()

In [72]:
# Features we might want to keep = ADDR_PCT_CD, BORO_NM, CMPLNT_FR_DT, CMPLNT_FR_TIME, OFNS_DESC, LAW_CAT_DT,STATION_NAME, Latitude, Longitude
# MAYBE - JURISDICTION_CODE

selected_features = [
    'ADDR_PCT_CD',    # Precinct where occoured
    'BORO_NM',        # borough location
    'CMPLNT_FR_DT',   # Date of crime
    'CMPLNT_FR_TM',   # Time of crime
    'OFNS_DESC',      # Crime type description
    'LAW_CAT_CD',     # Law category (Felony/Misdemeanor/Violation)
    'STATION_NAME',   # Station name
    'Latitude',       # Latitude of incident
    'Longitude',      # Longitude of incident
]

df = df[selected_features]
df.head()

Unnamed: 0,ADDR_PCT_CD,BORO_NM,CMPLNT_FR_DT,CMPLNT_FR_TM,OFNS_DESC,LAW_CAT_CD,STATION_NAME,Latitude,Longitude
0,47,BRONX,07/02/2024,19:20:00,SEX CRIMES,FELONY,(null),40.887314,-73.847272
1,61,BROOKLYN,06/01/2024,02:40:00,MURDER & NON-NEGL. MANSLAUGHTER,FELONY,(null),,
2,123,STATEN ISLAND,03/08/2024,21:20:00,HARRASSMENT 2,VIOLATION,(null),40.559481,-74.178539
3,120,STATEN ISLAND,04/04/2024,19:50:00,ROBBERY,FELONY,(null),40.634036,-74.123607
4,123,STATEN ISLAND,06/05/2024,15:00:00,ASSAULT 3 & RELATED OFFENSES,MISDEMEANOR,(null),40.536852,-74.213994


In [73]:
# DROPPING NaN values for all columns
df.dropna(subset=[
    'ADDR_PCT_CD',    # Precinct where occoured
    'BORO_NM',        # borough location
    'CMPLNT_FR_DT',   # Date of crime
    'CMPLNT_FR_TM',   # Time of crime
    'OFNS_DESC',      # Crime type description
    'LAW_CAT_CD',     # Law category (Felony/Misdemeanor/Violation)
    'STATION_NAME',   # Station name
    'Latitude',       # Latitude of incident
    'Longitude'], inplace=True)

df.fillna({'OFNS_DESC': 'Unknown'}, inplace=True)

# Had to do this because null values are string literals in this dataset
df = df[df['BORO_NM'] != '(null)']
df = df[df['Latitude'].notna()]
df = df[df['Longitude'].notna()]

In [74]:
# change date columns to YYYY-MM-DD
df['CMPLNT_FR_DT']= pd.to_datetime(df['CMPLNT_FR_DT'],errors='coerce')
df['CMPLNT_FR_TM'] = pd.to_datetime(df['CMPLNT_FR_TM'],errors='coerce')

# split date to year, month and day
df['year'] = df['CMPLNT_FR_DT'].dt.year
df['month'] = df['CMPLNT_FR_DT'].dt.month
df['day'] = df['CMPLNT_FR_DT'].dt.day
df['hour'] = df['CMPLNT_FR_TM'].dt.hour

# add week day column
df['weekday'] = pd.to_datetime(df['CMPLNT_FR_DT']).dt.day_name()

# We possibly want to drop them after extracting their features
# df.drop(columns=['CMPLNT_FR_DT','CMPLNT_FR_TM'],inplace=True)

  df['CMPLNT_FR_TM'] = pd.to_datetime(df['CMPLNT_FR_TM'],errors='coerce')


In [75]:
# We unfortunately wont be able to use the STATION_NAME parameter since it has mostly null values
df.STATION_NAME.value_counts()
df.drop(columns='STATION_NAME', inplace=True)

In [76]:
# encode crime severity || one hot encoding
df['LAW_CAT_CD'] = df['LAW_CAT_CD'].replace(['felony','misdemeanor','violation'],[2,1,0])

# Unsure if we should do this for the boroughs too, Lets try it out regardless .
# 1 - BROOKLYN, 2 - MANHATTAN, 3 - QUEENS, 4 - BRONX, 5 - STATEN ISLAND
df['BORO_NM'] =  df['BORO_NM'].replace(['BROOKLYN','MANHATTAN', 'QUEENS', 'BRONX', 'STATEN ISLAND'],[1,2,3,4,5])
# df['BORO_NM'].value_counts()

  df['BORO_NM'] =  df['BORO_NM'].replace(['BROOKLYN','MANHATTAN', 'QUEENS', 'BRONX', 'STATEN ISLAND'],[1,2,3,4,5])


In [77]:
df.head(5)

Unnamed: 0,ADDR_PCT_CD,BORO_NM,CMPLNT_FR_DT,CMPLNT_FR_TM,OFNS_DESC,LAW_CAT_CD,Latitude,Longitude,year,month,day,hour,weekday
0,47,4,2024-07-02,2024-12-03 19:20:00,SEX CRIMES,FELONY,40.887314,-73.847272,2024.0,7.0,2.0,19,Tuesday
2,123,5,2024-03-08,2024-12-03 21:20:00,HARRASSMENT 2,VIOLATION,40.559481,-74.178539,2024.0,3.0,8.0,21,Friday
3,120,5,2024-04-04,2024-12-03 19:50:00,ROBBERY,FELONY,40.634036,-74.123607,2024.0,4.0,4.0,19,Thursday
4,123,5,2024-06-05,2024-12-03 15:00:00,ASSAULT 3 & RELATED OFFENSES,MISDEMEANOR,40.536852,-74.213994,2024.0,6.0,5.0,15,Wednesday
5,123,5,2024-08-09,2024-12-03 12:00:00,GRAND LARCENY,FELONY,40.54256,-74.216738,2024.0,8.0,9.0,12,Friday


In [78]:
# SOME Sanity Checking for each columns before we start with modelling

# we know there are 77 precincts in NYC. Looks good
# df.ADDR_PCT_CD.unique()
# df.ADDR_PCT_CD.value_counts()
# print("Precincts:" , df.ADDR_PCT_CD.nunique())

#BORO_NM - 1 - BROOKLYN, 2 - MANHATTAN, 3 - QUEENS, 4 - BRONX, 5 - STATEN ISLAND
# print(df.dtypes)

# OFFENSE DESCRIPTION
# Looks like some null values, lets drop them
df = df[df['OFNS_DESC'] != '(null)']
# df.OFNS_DESC.value_counts()

# LAW_CAT_DT
# df.LAW_CAT_CD.value_counts()

# Latitude - std dv = 0.11 - will need to normalize
# df.Latitude.describe()

#dropping cus we already extracted the right features from here
df.drop(columns=['CMPLNT_FR_DT', 'CMPLNT_FR_TM'], inplace=True)

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['CMPLNT_FR_DT', 'CMPLNT_FR_TM'], inplace=True)


Unnamed: 0,ADDR_PCT_CD,BORO_NM,OFNS_DESC,LAW_CAT_CD,Latitude,Longitude,year,month,day,hour,weekday
0,47,4,SEX CRIMES,FELONY,40.887314,-73.847272,2024.0,7.0,2.0,19,Tuesday
2,123,5,HARRASSMENT 2,VIOLATION,40.559481,-74.178539,2024.0,3.0,8.0,21,Friday
3,120,5,ROBBERY,FELONY,40.634036,-74.123607,2024.0,4.0,4.0,19,Thursday
4,123,5,ASSAULT 3 & RELATED OFFENSES,MISDEMEANOR,40.536852,-74.213994,2024.0,6.0,5.0,15,Wednesday
5,123,5,GRAND LARCENY,FELONY,40.542560,-74.216738,2024.0,8.0,9.0,12,Friday
...,...,...,...,...,...,...,...,...,...,...,...
433276,105,3,PETIT LARCENY,MISDEMEANOR,40.699606,-73.738811,2023.0,11.0,25.0,12,Saturday
433277,105,3,OFF. AGNST PUB ORD SENSBLTY &,MISDEMEANOR,40.719221,-73.742240,2023.0,8.0,26.0,6,Saturday
433278,105,3,ASSAULT 3 & RELATED OFFENSES,MISDEMEANOR,40.718502,-73.735259,2024.0,8.0,19.0,20,Monday
433279,105,3,VEHICLE AND TRAFFIC LAWS,MISDEMEANOR,40.733461,-73.735456,2024.0,5.0,28.0,16,Tuesday


In [79]:
# column_titles = ['date','borough','month','time','Latitude', 'Longitude', 'description',
#                  'OFNS_DESC', 'ADDR_PCT_CD', 'CRIME_CLASS', 'BORO_NM']
# df = df.reindex(columns = column_titles)

In [80]:
df.to_csv('processed_complaints.csv', index=False)

# Modelling using ML algorithms


In [123]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load dataset
df = pd.read_csv('./processed_complaints.csv')

# Should we try clustering the coordinates for imporved guessing?

# Initialize the scaler
scaler = StandardScaler()

# Select latitude and longitude
df[['Latitude', 'Longitude']] = scaler.fit_transform(df[['Latitude', 'Longitude']])

## Problem - Add new features - k means clustered coordinates or hour * weekday column to explore those patterns
## Should we also try using K-means to cluster the locations?

# Encode categorical variables
label_encoder = LabelEncoder()
df['BORO_NM'] = label_encoder.fit_transform(df['BORO_NM'])
df['OFNS_DESC'] = label_encoder.fit_transform(df['OFNS_DESC'])
df['LAW_CAT_CD'] = label_encoder.fit_transform(df['LAW_CAT_CD'])
df['day'] = label_encoder.fit_transform(df['day'])
df['month'] = label_encoder.fit_transform(df['month'])
# df['year'] = label_encoder.fit_transform(df['year'])
df['weekday'] = label_encoder.fit_transform(df['weekday'])

# Define features and target
features = ['ADDR_PCT_CD', 'OFNS_DESC', 'LAW_CAT_CD', 'Latitude', 'Longitude', 'month', 'day', 'hour', 'weekday']
df['crime_score'] = np.random.rand(len(df))  # Placeholder for actual crime score calculation
X = df[features]
y = df['crime_score']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


## Random Forest results


In [124]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predict on the test set
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

# Checking feature importance
feature_importances = model.feature_importances_
for feature, importance in zip(features, feature_importances):
    print(f"{feature}: {importance}")


Mean Absolute Error (MAE): 0.2548688668266181
Mean Squared Error (MSE): 0.08836270212090808
Root Mean Squared Error (RMSE): 0.29725864515755984
R-squared (R2): -0.0635742952906726
ADDR_PCT_CD: 0.038484647119847025
OFNS_DESC: 0.08400448745324084
LAW_CAT_CD: 0.02550099584698841
Latitude: 0.21519557698279237
Longitude: 0.21792629952175097
month: 0.08468965104389588
day: 0.13224413367827595
hour: 0.13353271778760353
weekday: 0.06842149056560504


# XGBoost results


In [126]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

xgb_model = XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)


# Predict on the test set
y_pred = xgb_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

# Checking feature importance
feature_importances = xgb_model.feature_importances_
for feature, importance in zip(features, feature_importances):
    print(f"{feature}: {importance}")


Mean Absolute Error (MAE): 0.24968163025448914
Mean Squared Error (MSE): 0.08322496689229011
Root Mean Squared Error (RMSE): 0.2884873773534816
R-squared (R2): -0.0017341410852200756
ADDR_PCT_CD: 0.10908015072345734
OFNS_DESC: 0.10879456996917725
LAW_CAT_CD: 0.1070837453007698
Latitude: 0.11388332396745682
Longitude: 0.11502178758382797
month: 0.11083237081766129
day: 0.11449217051267624
hour: 0.11304233223199844
weekday: 0.10776948928833008


In [130]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Create a LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Set parameters for LightGBM
params = {
    'objective': 'regression',
    'metric': 'rmse',  # Root Mean Squared Error
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,  # You can adjust this
    'max_depth': -1,   # No limit
    'verbose': -1
}

# Train LightGBM model
lgb_model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, test_data],
    valid_names=['training', 'validation'],  # Naming datasets
    num_boost_round=1000,  # Maximum boosting iterations
)

# Predict on test set
y_pred = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)


mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

# Checking feature importance
# feature_importances = lgb_model.feature_importances_
# for feature, importance in zip(features, feature_importances):
    # print(f"{feature}: {importance}")


Mean Absolute Error (MAE): 0.24992100418444665
Mean Squared Error (MSE): 0.08343729202429
Root Mean Squared Error (RMSE): 0.28885514020749226
R-squared (R2): -0.004289784441737998


AttributeError: 'Booster' object has no attribute 'feature_importances_'

In [131]:
df

Unnamed: 0,ADDR_PCT_CD,BORO_NM,OFNS_DESC,LAW_CAT_CD,Latitude,Longitude,year,month,day,hour,weekday,crime_score
0,47,3,51,0,1.256367,0.426161,2024.0,6,1,19,5,0.686140
1,123,4,24,2,-1.475940,-1.448304,2024.0,2,7,21,0,0.721734
2,120,4,50,0,-0.854565,-1.137473,2024.0,3,3,19,4,0.592966
3,123,4,4,1,-1.664541,-1.648924,2024.0,5,4,15,6,0.656711
4,123,4,22,0,-1.616968,-1.664451,2024.0,7,8,12,0,0.842490
...,...,...,...,...,...,...,...,...,...,...,...,...
432788,105,2,45,1,-0.308074,1.039883,2023.0,10,24,12,2,0.137381
432789,105,2,35,1,-0.144594,1.020480,2023.0,7,25,6,2,0.774802
432790,105,2,4,1,-0.150588,1.059979,2024.0,7,18,20,1,0.684209
432791,105,2,55,1,-0.025909,1.058866,2024.0,4,27,16,5,0.999038


In [132]:
df.crime_score.describe()

Unnamed: 0,crime_score
count,432793.0
mean,0.499905
std,0.288768
min,3e-06
25%,0.250034
50%,0.499834
75%,0.749661
max,1.0


In [None]:
# A function I created for letting users input their values and giving them back a crime prediction score. 
# We will then Dump this model using jololib or pickle to use in our streamlit app 

# def predict_crime_score(addr_pct_cd, boro_nm, ofns_desc, law_cat_cd, latitude, longitude, model, label_encoders):

#     # Create a single-row DataFrame with the input data
#     input_data = pd.DataFrame({
#         'ADDR_PCT_CD': [addr_pct_cd],
#         'BORO_NM': [boro_nm],
#         'OFNS_DESC': [ofns_desc],
#         'LAW_CAT_CD': [law_cat_cd],
#         'Latitude': [latitude],
#         'Longitude': [longitude]
#     })

#     # Apply label encoding to categorical columns
#     for col, encoder in label_encoders.items():
#         if col in input_data.columns:
#             input_data[col] = encoder.transform(input_data[col])

#     # Define the full feature list
#     features = ['ADDR_PCT_CD', 'BORO_NM', 'OFNS_DESC', 'LAW_CAT_CD', 'Latitude', 'Longitude']

#     # Ensure the model has all required features
#     if not all(feature in input_data.columns for feature in features):
#         raise ValueError(f"Missing one or more required features in the input data: {features}")

#     # Extract the features for prediction
#     X_input = input_data[features]

#     # Predict the crime score
#     crime_score = model.predict(X_input)[0]  # Get the single prediction
#     return crime_score

# # Fit LabelEncoders for all categorical columns using the training dataset
# label_encoders = {
#     'BORO_NM': LabelEncoder().fit(df['BORO_NM']),
#     'OFNS_DESC': LabelEncoder().fit(df['OFNS_DESC']),
#     'LAW_CAT_CD': LabelEncoder().fit(df['LAW_CAT_CD'])
# }

# # Example inputs for prediction
# addr_pct_cd = 1
# boro_nm = "4"
# ofns_desc = "1"
# law_cat_cd = "0"
# latitude = 40.7128
# longitude = -74.0060

# # Predict the crime score
# try:
#     crime_score = predict_crime_score(addr_pct_cd, boro_nm, ofns_desc, law_cat_cd, latitude, longitude, model, label_encoders)
#     print(f"Predicted Crime Score: {crime_score}")
# except ValueError as e:
#     print("Error:", e)
