In [29]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [49]:
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Load the parking-infraction data. The file's first column has an empty
# header (read naively it appears as a junk 'Unnamed: 0' column), which is
# what a saved DataFrame index looks like, so read it as the index.
parking_df = pd.read_csv('../data/parking_df.csv', index_col=0)

In [8]:
# Preview the first five rows to sanity-check the load.
parking_df.head(n=5)

Unnamed: 0,datetime_of_infraction,time_of_infraction,year,month,day,infraction_code,infraction_description,set_fine_amount,location2,province,latitude,longitude,permit_time_restrictions,fee_related,time_related,fire_route,accessible_related,commercial_related,obstruction_related,cycle_related
0,2016-12-30 16:37:00,16:37:00,2016,12,30,403.0,STOP-SIGNED HIGHWAY-RUSH HOUR,150,"1546 BLOOR ST W, TORONTO, ON, CANADA",ON,43.656337,-79.453142,0,0,0,0,0,0,0,0
1,2016-12-30 16:37:00,16:37:00,2016,12,30,403.0,STOP-SIGNED HIGHWAY-RUSH HOUR,150,"5418 YONGE ST, TORONTO, ON, CANADA",ON,43.775587,-79.414671,0,0,0,0,0,0,0,0
2,2016-12-30 16:37:00,16:37:00,2016,12,30,403.0,STOP-SIGNED HIGHWAY-RUSH HOUR,150,"777 QUEEN ST W, TORONTO, ON, CANADA",ON,43.646259,-79.40808,0,0,0,0,0,0,0,0
3,2016-12-30 16:37:00,16:37:00,2016,12,30,403.0,STOP-SIGNED HIGHWAY-RUSH HOUR,150,"747 QUEEN ST E, TORONTO, ON, CANADA",ON,43.659131,-79.34808,0,0,0,0,0,0,0,0
4,2016-12-30 16:37:00,16:37:00,2016,12,30,403.0,STOP-SIGNED HIGHWAY-RUSH HOUR,150,"3042 DUNDAS ST W, TORONTO, ON, CANADA",ON,43.665651,-79.470785,0,0,0,0,0,0,0,0


In [9]:
# Keep only the rows whose latitude is non-zero.
# NOTE(review): 0.0 presumably marks rows without usable coordinates - confirm.
parking_coord = parking_df.loc[parking_df['latitude'].ne(0.0)]

In [48]:
# Narrow the frame to the two coordinate features plus the eight binary
# infraction-category flags used as labels.
df_numerical_copy = parking_coord.loc[
    :,
    [
        'latitude',
        'longitude',
        'permit_time_restrictions',
        'fee_related',
        'time_related',
        'fire_route',
        'accessible_related',
        'commercial_related',
        'obstruction_related',
        'cycle_related',
    ],
]

# Features: location only. Labels: every remaining column.
X = df_numerical_copy.loc[:, ['latitude', 'longitude']]
y = df_numerical_copy.drop(columns=['latitude', 'longitude'])

# 70/30 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [26]:
# Show which label columns remain in y after latitude/longitude were dropped.
y.columns

Index(['permit_time_restrictions', 'fee_related', 'time_related', 'fire_route',
       'accessible_related', 'commercial_related', 'obstruction_related',
       'cycle_related'],
      dtype='object')

In [24]:
# Fit one independent logistic regression per label column (features are
# latitude/longitude only) and record its accuracy on the held-out split.
models = {}

for column in y.columns:
    # `fit` returns the estimator itself, so construction and fitting chain.
    model = LogisticRegression().fit(X_train, y_train[column])

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test[column], y_pred)

    models[column] = dict(model=model, accuracy=accuracy)

# Report the per-label accuracies in the order the labels were fitted.
for infraction, model_info in models.items():
    print(f"Infraction: {infraction}, Accuracy: {model_info['accuracy']}")


Infraction: permit_time_restrictions, Accuracy: 0.7629070129829907
Infraction: fee_related, Accuracy: 0.7810603582275614
Infraction: time_related, Accuracy: 0.8876304445033769
Infraction: fire_route, Accuracy: 0.9780119589855962
Infraction: accessible_related, Accuracy: 0.9881319940611577
Infraction: commercial_related, Accuracy: 0.9838036852568434
Infraction: obstruction_related, Accuracy: 0.9914033902504651
Infraction: cycle_related, Accuracy: 0.9972202811077838


Reference: https://calmcode.io/course/scikit-meta/multi-output#:~:text=Using%20the%20MultiOutputClassifier&text=Instead%20of%20making%20two%20pipelines,this%20by%20using%20a%20MultiOutputClassifier.

In [70]:
# Re-create the 70/30 hold-out split. The arguments and seed match the
# earlier split, so the partitions are the same as long as X and y are unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [66]:
# Multi-label model: MultiOutputClassifier fits one seeded random forest per
# label column of y_train.
classifier = MultiOutputClassifier(RandomForestClassifier(random_state=42))

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Pass the label column names so each report row shows its infraction
# category instead of an anonymous index 0..7.
print(classification_report(y_test, y_pred, target_names=list(y_test.columns)))

              precision    recall  f1-score   support

           0       0.77      0.63      0.69    122482
           1       0.69      0.58      0.63    113104
           2       0.77      0.67      0.72     58050
           3       0.73      0.15      0.25     11359
           4       0.65      0.53      0.58      6131
           5       0.69      0.47      0.56      8367
           6       0.74      0.03      0.06      4441
           7       0.62      0.07      0.12      1436

   micro avg       0.73      0.59      0.65    325370
   macro avg       0.71      0.39      0.45    325370
weighted avg       0.73      0.59      0.64    325370
 samples avg       0.30      0.30      0.30    325370



In [None]:
# Same multi-label model as before, with the base forest kept in its own
# variable: MultiOutputClassifier fits one copy of it per label column.
forest = RandomForestClassifier(random_state=42)
classifier = MultiOutputClassifier(forest)

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Pass the label column names so each report row shows its infraction
# category instead of an anonymous index 0..7.
print(classification_report(y_test, y_pred, target_names=list(y_test.columns)))

TODO: use one consistent variable name for the random-forest estimator (`rf` or `clf`) across cells.

In [83]:
# Wrap the multi-output random forest in a one-step pipeline so that the
# forest's hyper-parameters are addressable as `clf__estimator__<param>`.
rf = RandomForestClassifier(random_state=42)
pipe = Pipeline([('clf', MultiOutputClassifier(estimator=rf))])

# Candidate hyper-parameter values for the later search:
#   n_estimators - number of trees in the forest
#   max_depth    - maximum depth of the trees
param_grid = {
    'clf__estimator__n_estimators': [100, 300],
    'clf__estimator__max_depth': [2, 15, 30],
}

# Baseline: fit with the default hyper-parameters, then display the score
# on the held-out split as the cell output.
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.6879978474600222

In [84]:
# Exhaustive 3-fold cross-validated search over every combination in
# `param_grid` (2 x 3 = 6 candidates), ranked by accuracy.
#
# GridSearchCV is used rather than RandomizedSearchCV because the randomized
# search's default n_iter (10) exceeds the 6 available combinations, so it
# would evaluate the full grid anyway, in an unseeded order.
# `refit=True` retrains the best candidate on the whole training split; the
# previous `refit='precision_micro'` named a metric that is never computed
# when `scoring` is the single string 'accuracy'.
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    refit=True,
    cv=3,
    verbose=2,
)

In [None]:
# Run the cross-validated search on the training split. `fit` returns the
# search object itself, so `fittedgrid` is another name for `search`.
search.fit(X_train, y_train)
fittedgrid = search

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] END clf__estimator__max_depth=2, clf__estimator__n_estimators=100; total time= 2.7min
[CV] END clf__estimator__max_depth=2, clf__estimator__n_estimators=100; total time= 2.7min
[CV] END clf__estimator__max_depth=2, clf__estimator__n_estimators=100; total time= 2.7min
[CV] END clf__estimator__max_depth=2, clf__estimator__n_estimators=300; total time= 7.8min
[CV] END clf__estimator__max_depth=2, clf__estimator__n_estimators=300; total time= 7.8min
[CV] END clf__estimator__max_depth=2, clf__estimator__n_estimators=300; total time= 7.9min
[CV] END clf__estimator__max_depth=15, clf__estimator__n_estimators=100; total time= 7.9min
[CV] END clf__estimator__max_depth=15, clf__estimator__n_estimators=100; total time= 7.9min
[CV] END clf__estimator__max_depth=15, clf__estimator__n_estimators=100; total time= 7.9min
[CV] END clf__estimator__max_depth=15, clf__estimator__n_estimators=300; total time=23.7min
[CV] END clf__estimator__m

In [None]:
# Score the tuned model on the held-out split. Features and labels must come
# from the same partition; pairing X_train with y_test mixes a 70% feature
# set with 30% labels and cannot be scored.
fittedgrid.score(X_test, y_test)

In [52]:
# Display the best pipeline found by the search. `fittedgrid` is assigned by
# the `search.fit(...)` cell, which must have run in the current kernel
# session first; otherwise this raises NameError.
fittedgrid.best_estimator_

NameError: name 'fittedgrid' is not defined

In [None]:
# Display the winning hyper-parameter combination from the search. Like the
# cell above, this needs `fittedgrid` from the `search.fit(...)` cell.
fittedgrid.best_params_