In [11]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from feature_creation import data_copy

# Train/Build model for basic logistic regression

In [2]:
# Encode the target as 0/1 (1 = 'high_bike_demand', 0 = anything else).
# The dtype guard keeps this cell idempotent: running the mapping a second time
# would compare the already-encoded integers to the label string and silently
# set every row to 0.
if not pd.api.types.is_numeric_dtype(data_copy['increase_stock']):
    data_copy['increase_stock'] = (
        data_copy['increase_stock'].eq('high_bike_demand').astype('int')
    )
data_copy.describe()

Unnamed: 0,holiday,weekday,summertime,temp,dew,humidity,precip,snowdepth,windspeed,cloudcover,visibility,increase_stock,daytime,rushhour,weather_score,weather_score_daytime
count,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0
mean,0.033125,0.71,0.64375,15.210313,7.75075,63.927844,0.122042,0.042713,13.0825,64.322375,15.344125,0.18,0.54375,0.205625,0.18,0.127378
std,0.179019,0.453904,0.47904,9.264785,10.026459,19.079419,0.9206,0.421198,7.756652,32.748869,2.323737,0.384308,0.498238,0.404284,0.184322,0.189459
min,0.0,0.0,0.0,-9.1,-18.4,15.85,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,-0.239787,-0.160192
25%,0.0,0.0,0.0,7.7,-0.8,47.845,0.0,0.0,7.5,28.8,16.0,0.0,0.0,0.0,0.049945,0.0
50%,0.0,1.0,1.0,15.5,8.3,65.175,0.0,0.0,12.3,79.3,16.0,0.0,1.0,0.0,0.154343,0.0
75%,0.0,1.0,1.0,23.2,16.8,79.955,0.0,0.0,17.6,92.8,16.0,0.0,1.0,0.0,0.278779,0.225751
max,1.0,1.0,1.0,35.6,24.3,99.89,25.871,6.71,43.8,100.0,16.0,1.0,1.0,1.0,0.956538,0.956538


In [12]:
# Show column dtypes and non-null counts for the feature-engineered frame.
data_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   hour_of_day            1600 non-null   category
 1   day_of_week            1600 non-null   category
 2   month                  1600 non-null   category
 3   holiday                1600 non-null   int64   
 4   weekday                1600 non-null   int64   
 5   summertime             1600 non-null   int64   
 6   temp                   1600 non-null   float64 
 7   dew                    1600 non-null   float64 
 8   humidity               1600 non-null   float64 
 9   precip                 1600 non-null   float64 
 10  snowdepth              1600 non-null   float64 
 11  windspeed              1600 non-null   float64 
 12  cloudcover             1600 non-null   float64 
 13  visibility             1600 non-null   float64 
 14  increase_stock         1600 non-null   i

In [13]:
# Worker count for the parallel cross-validation runs.
# os.cpu_count() returns None when the count cannot be determined, so fall
# back to a single worker instead of carrying None around.
num_cores = os.cpu_count() or 1
print(f'Number of cores: {num_cores}')

Number of cores: 8


In [14]:
# Split off the target; every other column is a model input.
Y = data_copy['increase_stock']
X = data_copy.drop(columns='increase_stock')

# One-hot encode the category-dtype columns; all remaining columns are passed
# through to the classifier unchanged.
categorical_cols = X.select_dtypes(include='category').columns.tolist()
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(), categorical_cols)],
    remainder='passthrough',
)

# max_iter is set high, presumably so the solver converges on the unscaled
# passthrough features - confirm.
logreg_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=10000)),
])


In [15]:
# 100 shuffled folds with a fixed seed; the per-fold fits run in parallel
# across num_cores workers.
kf = KFold(n_splits=100, shuffle=True, random_state=123)
cross_val_results = cross_val_score(
    logreg_model,
    X,
    Y,
    scoring='accuracy',
    cv=kf,
    n_jobs=num_cores,
)

# Per-fold accuracies, then their average
print(f'Cross-validation results: {cross_val_results}')
print(f'Mean accuracy: {cross_val_results.mean()}')

Cross-validation results: [0.75   0.875  0.875  0.875  1.     0.875  0.875  0.875  0.875  0.9375
 0.9375 1.     0.875  0.9375 0.875  0.8125 0.75   0.9375 0.6875 0.9375
 1.     0.8125 0.9375 0.875  0.9375 0.875  0.75   0.875  0.875  0.875
 0.875  0.9375 0.9375 0.8125 0.9375 0.875  0.9375 0.875  0.9375 1.
 0.9375 0.8125 1.     0.9375 0.9375 0.875  0.9375 0.9375 0.9375 0.75
 0.8125 0.875  0.8125 0.875  0.875  0.8125 0.875  1.     0.9375 0.8125
 0.9375 0.9375 0.8125 0.8125 1.     0.875  0.875  0.875  0.875  0.875
 1.     0.9375 0.9375 1.     0.9375 1.     0.875  0.9375 0.875  0.9375
 0.9375 0.8125 0.875  1.     0.75   0.9375 0.875  0.875  0.9375 0.875
 0.8125 0.9375 1.     0.8125 1.     0.9375 1.     1.     0.9375 0.875 ]
Mean accuracy: 0.8975


In [16]:
# Reference value noted from an earlier run: mean accuracy 0.8971232020012507 with k=N
# (presumably one fold per row, i.e. leave-one-out - confirm).
# Summary statistics of the per-fold accuracies from the run above.
fold_accuracy = pd.DataFrame(cross_val_results)
fold_accuracy.describe()

Unnamed: 0,0
count,100.0
mean,0.8975
std,0.069108
min,0.6875
25%,0.875
50%,0.875
75%,0.9375
max,1.0


In [17]:
# Refit on the full data set, then list each encoded feature name next to its
# fitted coefficient.
logreg_model.fit(X, Y)
processed_names = logreg_model[:-1].get_feature_names_out()
logreg_coefs = logreg_model.named_steps["classifier"].coef_[0]
print(f'col len: {len(processed_names)} coef len: {len(logreg_coefs)}')
for feature_name, coefficient in zip(processed_names, logreg_coefs):
    print(f'{feature_name} = {coefficient}')

col len: 58 coef len: 58
cat__hour_of_day_0 = -0.53558370674982
cat__hour_of_day_1 = -0.4911590850877324
cat__hour_of_day_2 = -0.30778551013834243
cat__hour_of_day_3 = -0.3347904312469982
cat__hour_of_day_4 = -0.40936158881745977
cat__hour_of_day_5 = -0.3007032151054789
cat__hour_of_day_6 = -0.29139162198099894
cat__hour_of_day_7 = -1.0392324723300728
cat__hour_of_day_8 = 0.8904578576516211
cat__hour_of_day_9 = 0.700642315495557
cat__hour_of_day_10 = -0.18671599245940854
cat__hour_of_day_11 = 0.3209915386015567
cat__hour_of_day_12 = 0.6304520937990155
cat__hour_of_day_13 = -0.04350274150062647
cat__hour_of_day_14 = 0.035635864685797214
cat__hour_of_day_15 = -0.2514492264653028
cat__hour_of_day_16 = -0.043814291446107476
cat__hour_of_day_17 = 0.6726629813534225
cat__hour_of_day_18 = 1.1002552933580485
cat__hour_of_day_19 = -0.0002926155776033071
cat__hour_of_day_20 = 1.0526098731114288
cat__hour_of_day_21 = -0.6472396231208094
cat__hour_of_day_22 = 0.03468574640483154
cat__hour_of_day_2

In [19]:
# Out-of-fold prediction for every row, used for the per-class report.
# An integer cv with a classifier gives unshuffled stratified folds, so this
# split differs from the shuffled `kf` used for the accuracy scores.
# The 160 independent fits are spread over num_cores workers, matching the
# cross_val_score runs; the predictions themselves are unchanged.
y_hats = cross_val_predict(logreg_model, X=X, y=Y, cv=160, n_jobs=num_cores)

In [21]:
# Per-class precision / recall / F1 computed from the cross-validated predictions.
print(classification_report(Y,y_hats))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      1312
           1       0.76      0.64      0.69       288

    accuracy                           0.90      1600
   macro avg       0.84      0.80      0.82      1600
weighted avg       0.89      0.90      0.89      1600


### Now try the original features to check whether our engineered features helped or hurt

In [9]:
# Load the untouched training data for the baseline comparison.
# - the target, month and day_of_week become categoricals; the target keeps its
#   original labels rather than being mapped to 0/1
# - 'snow' is dropped because it carries no information
# NOTE(review): hour_of_day is left as an integer here - confirm that is the
# intended treatment for this comparison.
data_og = (
    pd.read_csv('../data/training_data.csv')
    .astype({
        'increase_stock': 'category',
        'month': 'category',
        'day_of_week': 'category',
    })
    .drop(columns='snow')
)
data_og.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   hour_of_day     1600 non-null   int64   
 1   day_of_week     1600 non-null   category
 2   month           1600 non-null   category
 3   holiday         1600 non-null   int64   
 4   weekday         1600 non-null   int64   
 5   summertime      1600 non-null   int64   
 6   temp            1600 non-null   float64 
 7   dew             1600 non-null   float64 
 8   humidity        1600 non-null   float64 
 9   precip          1600 non-null   float64 
 10  snowdepth       1600 non-null   float64 
 11  windspeed       1600 non-null   float64 
 12  cloudcover      1600 non-null   float64 
 13  visibility      1600 non-null   float64 
 14  increase_stock  1600 non-null   category
dtypes: category(3), float64(8), int64(4)
memory usage: 155.7 KB


In [10]:
# Build the one-hot + logistic-regression pipeline for the original features
# (cross-validation itself happens in the next cell).
Y_og = data_og['increase_stock']
X_og = data_og.drop(columns='increase_stock')

# Only category-dtype columns are one-hot encoded; the rest pass through as-is.
categorical_cols_og = X_og.select_dtypes(include='category').columns.tolist()
preprocessor_og = ColumnTransformer(
    [('cat', OneHotEncoder(), categorical_cols_og)],
    remainder='passthrough',
)

log_model = Pipeline(steps=[
    ('preprocessor', preprocessor_og),
    ('classifier', LogisticRegression(max_iter=10000)),
])


In [11]:
# Score the original-feature pipeline with the shared `kf` splitter, in parallel.
# Note: this rebinds cross_val_results, replacing any earlier scores held under that name.
cross_val_results = cross_val_score(
    log_model,
    X_og,
    Y_og,
    scoring='accuracy',
    cv=kf,
    n_jobs=num_cores,
)

# Per-fold accuracies, then their average
print(f'Cross-validation results: {cross_val_results}')
print(f'Mean accuracy: {cross_val_results.mean()}')

Cross-validation results: [0.80625 0.83125 0.84375 0.89375 0.88125 0.8375  0.85625 0.8625  0.8625
 0.8875 ]
Mean accuracy: 0.85625


In [12]:
# Fit the original-feature pipeline on the full data so its coefficients can be inspected.
log_model.fit(X_og,Y_og)
# log_model.named_steps['preprocessor'].transform(X_og) #peek at preprocessed data (the pipeline step is named 'preprocessor')
# log_model[:-1].get_feature_names_out() #show encoded vars

In [13]:
# Encoded feature names paired with the fitted coefficients of the
# original-feature model; the length check confirms they line up one-to-one.
feature_names_og = log_model[:-1].get_feature_names_out()
coefs_og = log_model.named_steps["classifier"].coef_[0]
print(f'col len: {len(feature_names_og)} coef len: {len(coefs_og)}')
list(zip(feature_names_og, coefs_og))

col len: 31 coef len: 31


[('cat__day_of_week_0', 0.10258851871504138),
 ('cat__day_of_week_1', 0.15531978092981763),
 ('cat__day_of_week_2', 0.1674210710274114),
 ('cat__day_of_week_3', 0.16767210686511383),
 ('cat__day_of_week_4', -0.018626549811301095),
 ('cat__day_of_week_5', -0.406152372389761),
 ('cat__day_of_week_6', -0.1177811540626271),
 ('cat__month_1', -0.23257416330425848),
 ('cat__month_2', -0.1892881507525128),
 ('cat__month_3', -0.14502933567674375),
 ('cat__month_4', -0.28784260853061167),
 ('cat__month_5', 0.8711757943915299),
 ('cat__month_6', 0.4602345011491685),
 ('cat__month_7', 1.2680920786208332),
 ('cat__month_8', 0.8529830735679239),
 ('cat__month_9', -0.41197295617223373),
 ('cat__month_10', -1.0796627357926871),
 ('cat__month_11', -0.7763330976627653),
 ('cat__month_12', -0.27934099856252914),
 ('remainder__hour_of_day', -0.04729726749177807),
 ('remainder__holiday', -0.1792738611523768),
 ('remainder__weekday', 0.5743749277263772),
 ('remainder__summertime', -0.4613026955738188),
 ('