Gather event data from Drive

In [10]:
#get merged event data
import pandas as pd
df_daily = pd.read_csv("df_final.csv", index_col = 0)
df_daily.head()

Unnamed: 0,description,date,Human voice - Shouting,Human voice - Singing,Music non-amplified,Nature elements - Wind,Transport road - Passenger car,Transport road - Siren,Unsupported,latitude,longitude
0,MP 01: Naamsestraat 35 Maxim,2022-03-07,10,0,0,0,4,1,0,50.877161,4.700699
1,MP 01: Naamsestraat 35 Maxim,2022-03-08,39,0,1,0,20,3,0,50.877161,4.700699
2,MP 01: Naamsestraat 35 Maxim,2022-03-09,28,0,0,0,42,0,0,50.877161,4.700699
3,MP 01: Naamsestraat 35 Maxim,2022-03-10,39,0,0,0,27,0,0,50.877161,4.700699
4,MP 01: Naamsestraat 35 Maxim,2022-03-11,70,0,0,0,44,3,0,50.877161,4.700699


Clean up data

In [8]:
import numpy as np
from datetime import timedelta

def add_end(ts):
  upper = pd.DataFrame(ts.iloc[0:1,:].replace([0,1], np.nan))   
  upper['begin_date'] = pd.to_datetime('2023-01-01 00:00:00')
  return pd.concat([ts, upper])

def missing_or_noevent(ts, cutoff = 2, cutoff_Vrijthof = 7):
  '''
  This function fills some of the nans in the noise event variables with zeroes, based on the specified cutoff values. 
  The argument 'cutoff_Vrijthof' is used to specify the maximum length of a time period with missing values (in days!)
  before it is considered truly missing for MP08bis, 'cutoff' does the same for all other MPs.
  It returns two DataFrames: the adapted input DataFrame, and a DataFrame with all the missing time periods. 
  The latter is not filtered by the specified cutoffs. 
  'Unsupported' category is assumed to consist of unclassifiable events and is thus treated as an event, not missing.
  '''

  # Get time series sampling frequency
  ts['timestamp'] = pd.to_datetime(ts['timestamp'])
  ts = ts.sort_values(['location_csv','timestamp']).reset_index(drop=True)
  freq = pd.to_timedelta(ts.loc[1, 'timestamp'] - ts.loc[0, 'timestamp'])

  # Construct a df with the missing time periods
  missing_time = pd.DataFrame()
  df = ts.dropna(subset = 'human_noise')
  missing_time[['location_csv', 'begin_date']] = df[['location_csv', 'timestamp']]
  missing_time = missing_time.groupby('location_csv').apply(add_end).reset_index(drop=True)
  missing_time = missing_time.sort_values(["location_csv","begin_date"]).reset_index(drop=True)
  missing_time['end_date'] = missing_time['begin_date'].shift(-1)
  missing_time['begin_date'] = missing_time['begin_date'] + freq
  missing_time['timedelta'] = missing_time.groupby("location_csv")["begin_date"].diff().shift(-1)
  missing_time = missing_time.dropna()
  missing_time = missing_time[missing_time['timedelta'] > freq]
  missing_time['timedelta'] = missing_time['timedelta'] - freq

  # Filter by cutoff values
  true_na_vh = missing_time.loc[(missing_time['timedelta'] > timedelta(days = cutoff_Vrijthof)) & (missing_time['location_csv'] == '280324_mp08bis---vrijthof.csv')]
  true_na_other = missing_time.loc[(missing_time['timedelta'] > timedelta(days = cutoff)) & (missing_time['location_csv'] != '280324_mp08bis---vrijthof.csv')]


  col_to_fill = ts.columns[ts.columns.str.contains('noise')].values.tolist()

  ts_vh = ts.loc[ts['location_csv'] == '280324_mp08bis---vrijthof.csv' ].copy()
  ts_other = ts.loc[ts['location_csv'] != '280324_mp08bis---vrijthof.csv'].copy()

  
  # Add column true_na yes/no (less intervals to check for true nans than false nans)
  # If timestamp not in true_na, replace any nans with 0s

    #Vrijthof
  ts_vh['true_na'] = ts_vh['timestamp'].apply(lambda t: any((true_na_vh["begin_date"] <= t) & (true_na_vh["end_date"] > t)))
  ts_vh.loc[ts_vh['true_na'] == 0, col_to_fill] = ts_vh.loc[ts_vh['true_na'] == 0, col_to_fill].fillna(0)

    #other MPs
  other = ts_other['location_csv'].drop_duplicates().tolist()
  for MP in other:
    ts_other.loc[ts_other['location_csv'] == MP,'true_na'] = ts_other.loc[ts_other['location_csv'] == MP, 'timestamp'] \
                          .apply(lambda t: any((true_na_other['location_csv'] == MP) &(true_na_other["begin_date"] <= t) & (true_na_other["end_date"] > t)))
  ts_other.loc[ts_other['true_na'] == 0, col_to_fill] = ts_other.loc[ts_other['true_na'] == 0, col_to_fill].fillna(0)

  df = pd.concat([ts_vh, ts_other]).drop('true_na', axis = 1)  

  return df, missing_time

In [9]:
#Clean up
df, missing = missing_or_noevent(df_hourly)
df.head()
df.to_csv('bs.csv')

KeyError: 'timestamp'

## Build a model to classify police activity
First starts with obtaining the trends data, then the cleaned data is used, merged with trends data. Trends is perhaps not the best way to approach this but access to police complaints data hourly was not possible. 

In [11]:
from pytrends.request import TrendReq
import pandas as pd
import numpy as np


def get_trends(keyword):
    pytrends = TrendReq(hl='en-US', tz=360)
    pytrends.build_payload([keyword], cat=0, timeframe='2022-01-01 2022-12-31', geo='BE', gprop='')
    trends = pytrends.interest_over_time()
    return trends

  # get google trends data
trends_df = get_trends('leuven politie')
#To see if there is a relation between human noise, and police search
#Potentially we could use different features and different keyword for search

# Create a new dataframe with a continuous date range to create NaN values 
all_dates_df = pd.DataFrame(index=pd.date_range(start='2022-01-01', end='2022-12-31'))

# Merge the new dataframe with the original trends_df
trends_df = all_dates_df.merge(trends_df, left_index=True, right_index=True, how='left')

#index is date in this data
trends_df = trends_df.reset_index()

if 'index' in trends_df.columns:
    # Strip leading/trailing whitespaces and rename the column
    trends_df.rename(columns={'index': 'timestamp'.strip()}, inplace=True)
else:
    print("The column 'index' does not exist in the DataFrame.")
    
trends_df.head()

Unnamed: 0,timestamp,leuven politie,isPartial
0,2022-01-01,,
1,2022-01-02,26.0,False
2,2022-01-03,,
3,2022-01-04,,
4,2022-01-05,,


In [12]:
# resample to daily frequency, summing up the noise events
#in order to make use of it with google trends data
df_daily = df.set_index('timestamp').resample('D').sum().reset_index()

# merge dataframes on timestamp
merged_df = pd.merge(df_daily, trends_df, on='timestamp')
merged_df.head()

  df_daily = df.set_index('timestamp').resample('D').sum().reset_index()


Unnamed: 0,timestamp,human_noise,noise_event_human_voice_-_shouting,noise_event_human_voice_-_singing,noise_event_music_non-amplified,noise_event_nature_elements_-_wind,noise_event_transport_road_-_passenger_car,noise_event_transport_road_-_siren,noise_event_unsupported,#object_id,...,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,LC_RAD60,LC_TEMP_QCL0,LC_TEMP_QCL1,LC_TEMP_QCL2,LC_TEMP_QCL3,leuven politie,isPartial
0,2022-01-07,2.0,2.0,0.0,0.0,0.0,4.0,0.0,0.0,2043528,...,0.0295,32.333333,0.058333,1.0,27.443333,27.443333,27.163333,26.718334,,
1,2022-01-08,2.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,6130584,...,0.039833,507.166667,10.84,95.0,97.465,97.465,96.625,97.593354,,
2,2022-01-09,2.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,6130584,...,0.028667,617.0,20.698333,128.333333,117.963333,117.963333,117.123333,117.262638,48.0,False
3,2022-01-10,4.0,4.0,0.0,0.0,0.0,5.0,0.0,0.0,6130584,...,0.014,-49.166667,0.023333,137.5,14.731667,14.683333,13.823,13.493534,,
4,2022-01-11,3.0,3.0,0.0,0.0,0.0,6.0,0.0,0.0,6130584,...,0.0,6.0,0.001667,165.833333,57.116667,57.116667,56.276667,56.902138,,


In [13]:
#add weekend and weekday
merged_df['day_of_week'] = merged_df['timestamp'].dt.dayofweek
merged_df['hour_of_day'] = merged_df['timestamp'].dt.hour
merged_df['is_weekend'] = merged_df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
import holidays

be_holidays = holidays.Belgium()

#add holiday data
merged_df['is_holiday'] = merged_df['timestamp'].apply(lambda x: 1 if x in be_holidays else 0)


In [24]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
#from google.colab import files
import pandas as pd
import numpy as np
import pickle


# create a new feature that is the sum of the noise events
merged_df['human_noise'] = merged_df[['noise_event_human_voice_-_shouting', 'noise_event_human_voice_-_singing', 'noise_event_music_non-amplified']].sum(axis=1)

# select features and target
features = merged_df[['human_noise', 'day_of_week', 'is_weekend', 'is_holiday']]
target = merged_df['leuven politie']

# discretize the target variable into two categories: low, high
discretizer = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='quantile')
imputer = SimpleImputer(strategy='mean')  
target = imputer.fit_transform(target.values.reshape(-1, 1))
#target is now numpy
target = discretizer.fit_transform(target.reshape(-1, 1))
# split data into training and testing sets
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2)

#pipeline with imputer
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=1000))
])

pipeline.fit(features_train, target_train.ravel())
predictions = pipeline.predict(features_test)

accuracy = accuracy_score(target_test, predictions)
report = classification_report(target_test, predictions, target_names=['low','high'])

print(f'Accuracy: {accuracy}')
print(f'Classification Report: \n{report}')

roc_auc = roc_auc_score(target_test, predictions)
print(f'ROC AUC: {roc_auc}') #how good it is at classifying

# Save the trained model as a pickle string.
saved_model = pickle.dumps(pipeline)

# Save the model to disk
filename = 'classifier_trends_daily.pkl'
pickle.dump(pipeline, open(filename, 'wb'))

print("Model saved as " + filename)
#files.download('classifier_trends_hour.pkl')

Accuracy: 0.9444444444444444
Classification Report: 
              precision    recall  f1-score   support

         low       0.83      0.62      0.71         8
        high       0.95      0.98      0.97        64

    accuracy                           0.94        72
   macro avg       0.89      0.80      0.84        72
weighted avg       0.94      0.94      0.94        72

ROC AUC: 0.8046875
Model saved as classifier_trends_daily.pkl
