Gather event data from Drive

In [37]:
#get merged event-weather df with hourly observations per location
import pandas as pd
df_hourly = pd.read_csv("merge_export41_meteoLC102_hourly.csv", index_col = 0)
df_hourly.head()

Unnamed: 0,location_csv,timestamp,human_noise,noise_event_human_voice_-_shouting,noise_event_human_voice_-_singing,noise_event_music_non-amplified,noise_event_nature_elements_-_wind,noise_event_transport_road_-_passenger_car,noise_event_transport_road_-_siren,noise_event_unsupported,...,LC_RAD,LC_RAININ,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,LC_RAD60,LC_TEMP_QCL0,LC_TEMP_QCL1,LC_TEMP_QCL2,LC_TEMP_QCL3
0,255439_mp-01-naamsestraat-35-maxim.csv,2022-03-07 16:00:00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,13.833333,0.0,0.0,-161.5,2.215,20.666667,6.671667,6.671667,6.381667,6.2079
1,255441_mp-03-naamsestraat-62-taste.csv,2022-03-07 16:00:00,,,,,,,,,...,13.833333,0.0,0.0,-161.5,2.215,20.666667,6.671667,6.671667,6.381667,6.2079
2,255442_mp-05-calvariekapel-ku-leuven.csv,2022-03-07 16:00:00,,,,,,,,,...,13.833333,0.0,0.0,-161.5,2.215,20.666667,6.671667,6.671667,6.381667,6.2079
3,255443_mp-06-parkstraat-2-la-filosovia.csv,2022-03-07 16:00:00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,13.833333,0.0,0.0,-161.5,2.215,20.666667,6.671667,6.671667,6.381667,6.2079
4,255444_mp-07-naamsestraat-81.csv,2022-03-07 16:00:00,,,,,,,,,...,13.833333,0.0,0.0,-161.5,2.215,20.666667,6.671667,6.671667,6.381667,6.2079


Clean up data

In [38]:
import numpy as np
from datetime import timedelta

def add_end(ts):
  upper = pd.DataFrame(ts.iloc[0:1,:].replace([0,1], np.nan))   
  upper['begin_date'] = pd.to_datetime('2023-01-01 00:00:00')
  return pd.concat([ts, upper])

def missing_or_noevent(ts, cutoff = 2, cutoff_Vrijthof = 7):
  '''
  This function fills some of the nans in the noise event variables with zeroes, based on the specified cutoff values. 
  The argument 'cutoff_Vrijthof' is used to specify the maximum length of a time period with missing values (in days!)
  before it is considered truly missing for MP08bis, 'cutoff' does the same for all other MPs.
  It returns two DataFrames: the adapted input DataFrame, and a DataFrame with all the missing time periods. 
  The latter is not filtered by the specified cutoffs. 
  'Unsupported' category is assumed to consist of unclassifiable events and is thus treated as an event, not missing.
  '''

  # Get time series sampling frequency
  ts['timestamp'] = pd.to_datetime(ts['timestamp'])
  ts = ts.sort_values(['location_csv','timestamp']).reset_index(drop=True)
  freq = pd.to_timedelta(ts.loc[1, 'timestamp'] - ts.loc[0, 'timestamp'])

  # Construct a df with the missing time periods
  missing_time = pd.DataFrame()
  df = ts.dropna(subset = 'human_noise')
  missing_time[['location_csv', 'begin_date']] = df[['location_csv', 'timestamp']]
  missing_time = missing_time.groupby('location_csv').apply(add_end).reset_index(drop=True)
  missing_time = missing_time.sort_values(["location_csv","begin_date"]).reset_index(drop=True)
  missing_time['end_date'] = missing_time['begin_date'].shift(-1)
  missing_time['begin_date'] = missing_time['begin_date'] + freq
  missing_time['timedelta'] = missing_time.groupby("location_csv")["begin_date"].diff().shift(-1)
  missing_time = missing_time.dropna()
  missing_time = missing_time[missing_time['timedelta'] > freq]
  missing_time['timedelta'] = missing_time['timedelta'] - freq

  # Filter by cutoff values
  true_na_vh = missing_time.loc[(missing_time['timedelta'] > timedelta(days = cutoff_Vrijthof)) & (missing_time['location_csv'] == '280324_mp08bis---vrijthof.csv')]
  true_na_other = missing_time.loc[(missing_time['timedelta'] > timedelta(days = cutoff)) & (missing_time['location_csv'] != '280324_mp08bis---vrijthof.csv')]


  col_to_fill = ts.columns[ts.columns.str.contains('noise')].values.tolist()

  ts_vh = ts.loc[ts['location_csv'] == '280324_mp08bis---vrijthof.csv' ].copy()
  ts_other = ts.loc[ts['location_csv'] != '280324_mp08bis---vrijthof.csv'].copy()

  
  # Add column true_na yes/no (less intervals to check for true nans than false nans)
  # If timestamp not in true_na, replace any nans with 0s

    #Vrijthof
  ts_vh['true_na'] = ts_vh['timestamp'].apply(lambda t: any((true_na_vh["begin_date"] <= t) & (true_na_vh["end_date"] > t)))
  ts_vh.loc[ts_vh['true_na'] == 0, col_to_fill] = ts_vh.loc[ts_vh['true_na'] == 0, col_to_fill].fillna(0)

    #other MPs
  other = ts_other['location_csv'].drop_duplicates().tolist()
  for MP in other:
    ts_other.loc[ts_other['location_csv'] == MP,'true_na'] = ts_other.loc[ts_other['location_csv'] == MP, 'timestamp'] \
                          .apply(lambda t: any((true_na_other['location_csv'] == MP) &(true_na_other["begin_date"] <= t) & (true_na_other["end_date"] > t)))
  ts_other.loc[ts_other['true_na'] == 0, col_to_fill] = ts_other.loc[ts_other['true_na'] == 0, col_to_fill].fillna(0)

  df = pd.concat([ts_vh, ts_other]).drop('true_na', axis = 1)  

  return df, missing_time

In [39]:
#Clean up
df, missing = missing_or_noevent(df_hourly)
df.head()

Unnamed: 0,location_csv,timestamp,human_noise,noise_event_human_voice_-_shouting,noise_event_human_voice_-_singing,noise_event_music_non-amplified,noise_event_nature_elements_-_wind,noise_event_transport_road_-_passenger_car,noise_event_transport_road_-_siren,noise_event_unsupported,...,LC_RAD,LC_RAININ,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,LC_RAD60,LC_TEMP_QCL0,LC_TEMP_QCL1,LC_TEMP_QCL2,LC_TEMP_QCL3
45765,280324_mp08bis---vrijthof.csv,2022-04-20 12:00:00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,277.833333,0.0,0.0,-160.833333,2.585,263.666667,16.583333,16.583333,16.293333,16.074125
45766,280324_mp08bis---vrijthof.csv,2022-04-20 13:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,291.166667,0.0,0.0,-161.166667,1.928333,299.166667,17.268333,17.268333,16.978333,16.646643
45767,280324_mp08bis---vrijthof.csv,2022-04-20 14:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,77.833333,0.0,0.0,-161.5,1.946667,211.166667,17.648333,17.648333,17.358333,16.948017
45768,280324_mp08bis---vrijthof.csv,2022-04-20 15:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,63.666667,0.0,0.0,-158.333333,2.19,70.0,17.763333,17.763333,17.473333,17.322063
45769,280324_mp08bis---vrijthof.csv,2022-04-20 16:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,55.666667,0.0,0.0,-161.666667,1.881667,62.833333,17.876667,17.876667,17.586667,17.449694


## Build a model to classify police activity
First starts with obtaining the trends data, then the cleaned data is used, merged with trends data. Trends is perhaps not the best way to approach this but access to police complaints data hourly was not possible. The hourly trends data is obtained by distributing the daily data randomly with normal distribution

In [40]:
import pytrends
from pytrends.request import TrendReq
import pandas as pd
import numpy as np
import scipy.stats as stats

def get_trends(keyword):
    pytrends = TrendReq(hl='en-US', tz=360)
    pytrends.build_payload([keyword], cat=0, timeframe='2022-01-01 2022-12-31', geo='BE', gprop='')
    trends = pytrends.interest_over_time()
    return trends

def split(n, m):
    if np.isnan(n) or n == 0:
        return np.repeat(0, m)
    mu = n / m
    sigma = mu / 3
    lst = stats.norm.rvs(mu, sigma, m)
    return np.round(lst * n / np.sum(lst))

def resample_to_hourly_with_normal_distribution(df, column):
    hourly_data = []
    for idx, row in df.iterrows():
        if np.isnan(row[column]):
            hourly_values = np.repeat(np.nan, 24)
        else:
            hourly_values = split(row[column], 24)
        timestamps = pd.date_range(idx, periods=24, freq='H')
        hourly_data.extend(zip(timestamps, hourly_values))
    hourly_df = pd.DataFrame(hourly_data, columns=['timestamp', column])
    hourly_df.set_index('timestamp', inplace=True)
    return hourly_df

# get google trends data
trends_df = get_trends('leuven politie')

# Create a new dataframe with a continuous date range to create NaN values 
all_dates_df = pd.DataFrame(index=pd.date_range(start='2022-01-01', end='2022-12-31'))

# Merge the new dataframe with the original trends_df
trends_df = all_dates_df.merge(trends_df, left_index=True, right_index=True, how='left')

# Reset index
trends_df = trends_df.reset_index().rename(columns={'index': 'timestamp'.strip()})
trends_df.set_index('timestamp', inplace=True)

# Resample data to hourly with normal distribution
trends_df_hourly_normal = resample_to_hourly_with_normal_distribution(trends_df, 'leuven politie')
#trends_df_hourly_normal.to_csv('ree.csv')


In [41]:
# merge dataframes on timestamp
merged_df = pd.merge(df_hourly, trends_df_hourly_normal, on='timestamp')
merged_df.head()

Unnamed: 0,location_csv,timestamp,human_noise,noise_event_human_voice_-_shouting,noise_event_human_voice_-_singing,noise_event_music_non-amplified,noise_event_nature_elements_-_wind,noise_event_transport_road_-_passenger_car,noise_event_transport_road_-_siren,noise_event_unsupported,...,LC_RAININ,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,LC_RAD60,LC_TEMP_QCL0,LC_TEMP_QCL1,LC_TEMP_QCL2,LC_TEMP_QCL3,leuven politie
0,255439_mp-01-naamsestraat-35-maxim.csv,2022-03-07 16:00:00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,-161.5,2.215,20.666667,6.671667,6.671667,6.381667,6.2079,
1,255441_mp-03-naamsestraat-62-taste.csv,2022-03-07 16:00:00,,,,,,,,,...,0.0,0.0,-161.5,2.215,20.666667,6.671667,6.671667,6.381667,6.2079,
2,255442_mp-05-calvariekapel-ku-leuven.csv,2022-03-07 16:00:00,,,,,,,,,...,0.0,0.0,-161.5,2.215,20.666667,6.671667,6.671667,6.381667,6.2079,
3,255443_mp-06-parkstraat-2-la-filosovia.csv,2022-03-07 16:00:00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,-161.5,2.215,20.666667,6.671667,6.671667,6.381667,6.2079,
4,255444_mp-07-naamsestraat-81.csv,2022-03-07 16:00:00,,,,,,,,,...,0.0,0.0,-161.5,2.215,20.666667,6.671667,6.671667,6.381667,6.2079,


In [42]:
#add weekend and weekday
merged_df['day_of_week'] = merged_df['timestamp'].dt.dayofweek
merged_df['hour_of_day'] = merged_df['timestamp'].dt.hour
merged_df['is_weekend'] = merged_df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
import holidays

be_holidays = holidays.Belgium()

#add holiday data
merged_df['is_holiday'] = merged_df['timestamp'].apply(lambda x: 1 if x in be_holidays else 0)


In [43]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import pickle


# Create a new feature that is the sum of the noise events
merged_df['human_noise'] = merged_df[['noise_event_human_voice_-_shouting', 'noise_event_human_voice_-_singing', 'noise_event_music_non-amplified']].sum(axis=1)

# Select features and target
features = merged_df[['human_noise', 'day_of_week', 'is_weekend', 'is_holiday', 'hour_of_day']]
target = merged_df['leuven politie']

# Discretize the target variable into two categories: low, high
discretizer = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='quantile')
imputer = SimpleImputer(strategy='mean')
target = imputer.fit_transform(target.values.reshape(-1, 1))
# Target is now numpy
target = discretizer.fit_transform(target.reshape(-1, 1))

# Split data into training and testing sets
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2)

# Logistic Regression pipeline
lr_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

# Define hyperparameter space to search
param_grid = {
    'classifier__C': np.logspace(-4, 4, 20), # regularization parameter
    'classifier__penalty': ['l1', 'l2'] # penalty can be l1 or l2
}

# GridSearchCV instance
grid_search = GridSearchCV(lr_pipeline, param_grid, cv=5, scoring='roc_auc')

# Fit GridSearchCV
grid_search.fit(features_train, target_train.ravel())

# Print the best parameters and the corresponding ROC AUC score
print("Best parameters: ", grid_search.best_params_)
print("Best ROC AUC score: ", grid_search.best_score_)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions with the best model
predictions = best_model.predict(features_test)

# Compute the accuracy, classification report and ROC AUC
accuracy = accuracy_score(target_test, predictions)
report = classification_report(target_test, predictions, target_names=['low', 'high'])
roc_auc = roc_auc_score(target_test, predictions)

print(f'Accuracy: {accuracy}')
print(f'Classification Report: \n{report}')
print(f'ROC AUC: {roc_auc}')

# Save the model
filename = 'classifier_trend_hourly.pkl'
pickle.dump(best_model, open(filename, 'wb'))
print("Model saved as " + filename)


100 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\uygar\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\uygar\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\uygar\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  

Best parameters:  {'classifier__C': 29.763514416313132, 'classifier__penalty': 'l2'}
Best ROC AUC score:  0.9693419846848041
Accuracy: 0.9366991039599191
Classification Report: 
              precision    recall  f1-score   support

         low       0.57      0.61      0.59       774
        high       0.97      0.96      0.97      9605

    accuracy                           0.94     10379
   macro avg       0.77      0.79      0.78     10379
weighted avg       0.94      0.94      0.94     10379

ROC AUC: 0.7870235409798138
Model saved as classifier_trend_hourly.pkl
