Gather event data from Drive

In [26]:
#get merged event data
import pandas as pd
df_daily = pd.read_csv("df_final.csv", index_col = 0)
df_daily['date'] = pd.to_datetime(df_daily['date'])
df_daily.head()



Unnamed: 0,description,date,Human voice - Shouting,Human voice - Singing,Music non-amplified,Nature elements - Wind,Transport road - Passenger car,Transport road - Siren,Unsupported,latitude,longitude
0,MP 01: Naamsestraat 35 Maxim,2022-03-07,10,0,0,0,4,1,0,50.877161,4.700699
1,MP 01: Naamsestraat 35 Maxim,2022-03-08,39,0,1,0,20,3,0,50.877161,4.700699
2,MP 01: Naamsestraat 35 Maxim,2022-03-09,28,0,0,0,42,0,0,50.877161,4.700699
3,MP 01: Naamsestraat 35 Maxim,2022-03-10,39,0,0,0,27,0,0,50.877161,4.700699
4,MP 01: Naamsestraat 35 Maxim,2022-03-11,70,0,0,0,44,3,0,50.877161,4.700699


Clean up data

## Build a model to classify police activity
First starts with obtaining the trends data, then the cleaned data is used, merged with trends data. Trends is perhaps not the best way to approach this but access to police complaints data hourly was not possible. 

In [27]:
from pytrends.request import TrendReq
import pandas as pd
import numpy as np


def get_trends(keyword):
    pytrends = TrendReq(hl='en-US', tz=360)
    pytrends.build_payload([keyword], cat=0, timeframe='2022-01-01 2022-12-31', geo='BE', gprop='')
    trends = pytrends.interest_over_time()
    return trends

  # get google trends data
trends_df = get_trends('leuven politie')
#To see if there is a relation between human noise, and police search
#Potentially we could use different features and different keyword for search

# Create a new dataframe with a continuous date range to create NaN values 
all_dates_df = pd.DataFrame(index=pd.date_range(start='2022-01-01', end='2022-12-31'))

# Merge the new dataframe with the original trends_df
trends_df = all_dates_df.merge(trends_df, left_index=True, right_index=True, how='left')

#index is date in this data
trends_df = trends_df.reset_index()

if 'index' in trends_df.columns:
    # Strip leading/trailing whitespaces and rename the column
    trends_df.rename(columns={'index': 'date'.strip()}, inplace=True)
else:
    print("The column 'index' does not exist in the DataFrame.")
    
trends_df.head()

Unnamed: 0,date,leuven politie,isPartial
0,2022-01-01,,
1,2022-01-02,26.0,False
2,2022-01-03,,
3,2022-01-04,,
4,2022-01-05,,


In [28]:
# merge dataframes on timestamp
merged_df = pd.merge(df_daily, trends_df, on='date')
merged_df.head()

Unnamed: 0,description,date,Human voice - Shouting,Human voice - Singing,Music non-amplified,Nature elements - Wind,Transport road - Passenger car,Transport road - Siren,Unsupported,latitude,longitude,leuven politie,isPartial
0,MP 01: Naamsestraat 35 Maxim,2022-03-07,10,0,0,0,4,1,0,50.877161,4.700699,,
1,MP 02: Naamsestraat 57 Xior,2022-03-07,0,0,0,0,1,0,0,50.876491,4.700692,,
2,MP 03: Naamsestraat 62 Taste,2022-03-07,7,0,0,0,13,2,0,50.875851,4.700192,,
3,MP 05: Calvariekapel KU Leuven,2022-03-07,5,0,0,0,26,2,0,50.874487,4.69989,,
4,MP 06: Parkstraat 2 La Filosovia,2022-03-07,0,0,0,0,24,1,0,50.874091,4.700018,,


In [29]:
#add weekend and weekday
merged_df['day_of_week'] = merged_df['date'].dt.dayofweek
merged_df['hour_of_day'] = merged_df['date'].dt.hour
merged_df['is_weekend'] = merged_df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
import holidays

be_holidays = holidays.Belgium()

#add holiday data
merged_df['is_holiday'] = merged_df['date'].apply(lambda x: 1 if x in be_holidays else 0)

# create a new feature that is the sum of the noise events
merged_df['human_noise'] = merged_df[['Human voice - Shouting', 'Human voice - Singing']].sum(axis=1)


merged_df.head()

Unnamed: 0,description,date,Human voice - Shouting,Human voice - Singing,Music non-amplified,Nature elements - Wind,Transport road - Passenger car,Transport road - Siren,Unsupported,latitude,longitude,leuven politie,isPartial,day_of_week,hour_of_day,is_weekend,is_holiday,human_noise
0,MP 01: Naamsestraat 35 Maxim,2022-03-07,10,0,0,0,4,1,0,50.877161,4.700699,,,0,0,0,0,10
1,MP 02: Naamsestraat 57 Xior,2022-03-07,0,0,0,0,1,0,0,50.876491,4.700692,,,0,0,0,0,0
2,MP 03: Naamsestraat 62 Taste,2022-03-07,7,0,0,0,13,2,0,50.875851,4.700192,,,0,0,0,0,7
3,MP 05: Calvariekapel KU Leuven,2022-03-07,5,0,0,0,26,2,0,50.874487,4.69989,,,0,0,0,0,5
4,MP 06: Parkstraat 2 La Filosovia,2022-03-07,0,0,0,0,24,1,0,50.874091,4.700018,,,0,0,0,0,0


In [36]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
#from google.colab import files
import pandas as pd
import numpy as np
import pickle

# select features and target
features = merged_df[['human_noise', 'day_of_week', 'is_weekend', 'is_holiday']]
target = merged_df['leuven politie']


# discretize the target variable into two categories: low, high
discretizer = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='quantile')
imputer = SimpleImputer(strategy='mean')  
target = imputer.fit_transform(target.values.reshape(-1, 1))
#target is now numpy
target = discretizer.fit_transform(target.reshape(-1, 1))
# split data into training and testing sets
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2)

#pipeline with imputer
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=1000))
])

pipeline.fit(features_train, target_train.ravel())
predictions = pipeline.predict(features_test)

accuracy = accuracy_score(target_test, predictions)
report = classification_report(target_test, predictions, target_names=['low','high'])

print(f'Accuracy: {accuracy}')
print(f'Classification Report: \n{report}')

roc_auc = roc_auc_score(target_test, predictions)
print(f'ROC AUC: {roc_auc}') #how good it is at classifying

# Save the trained model as a pickle string.
saved_model = pickle.dumps(pipeline)

# Save the model to disk
filename = 'classifier_trends_daily.pkl'
pickle.dump(pipeline, open(filename, 'wb'))

print("Model saved as " + filename)
#files.download('classifier_trends_hour.pkl')

Accuracy: 0.9278074866310161
Classification Report: 
              precision    recall  f1-score   support

         low       0.51      0.79      0.62        28
        high       0.98      0.94      0.96       346

    accuracy                           0.93       374
   macro avg       0.75      0.86      0.79       374
weighted avg       0.95      0.93      0.93       374

ROC AUC: 0.8625103220478944
Model saved as classifier_trends_daily.pkl
