In [1]:
import pandas as pd
import numpy as np
from os import cpu_count
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier 

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
from utils import load_data

# Load the project dataset into `data`, then preview how the target column
# encodes: the (codes, uniques) tuple is displayed as the cell output.
data = load_data()
data['increase_stock'].factorize()

(array([0, 0, 0, ..., 0, 1, 0]), Int64Index([0, 1], dtype='int64'))

# initialize_model_pipeline handles all the preprocessing
## (visualize the pipeline by leaving the variable as the last line of the cell)

In [3]:
from utils import initialize_model_pipeline

# Hand a seeded logistic regression to the shared pipeline builder.
# max_iter is raised well above scikit-learn's default to give the solver room to converge.
model = initialize_model_pipeline(
    LogisticRegression(max_iter=10000, random_state=123)
)
model  # bare last expression so the notebook renders the pipeline

## Call fit just like you would any other model

In [4]:
# Build the feature frame once so that fit and score receive identical inputs;
# the target column is kept out of X for both calls (previously only fit dropped it).
features = data.drop('increase_stock', axis=1)
model.fit(features, data['increase_stock'])

# NOTE: this is the score on the same rows the model was just fitted on,
# not a held-out estimate.
model.score(X=features, y=data['increase_stock'])

0.89875

### Since everything is in the pipeline, you can even pass in the original data unchanged and it will be processed the same way every time (for the X variables)
#### Note: this also means you don't get to choose features anymore

In [5]:
# Refit using the untouched frame as X (target column still present), then
# predict on that same frame; the predicted labels are the cell output.
model.fit(data, data['increase_stock'])
model.predict(X=data)

array([0, 0, 0, ..., 0, 1, 0])

In [6]:
# Slice off the pipeline's last step and ask the remaining steps for the names
# of the columns they produce.
# NOTE(review): the cat__/gen__/bin__/weather__ prefixes presumably come from the
# transformers configured in utils — confirm there.
model[:-1].get_feature_names_out()

array(['cat__hour_of_day_0', 'cat__hour_of_day_1', 'cat__hour_of_day_2',
       'cat__hour_of_day_3', 'cat__hour_of_day_4', 'cat__hour_of_day_5',
       'cat__hour_of_day_6', 'cat__hour_of_day_7', 'cat__hour_of_day_8',
       'cat__hour_of_day_9', 'cat__hour_of_day_10', 'cat__hour_of_day_11',
       'cat__hour_of_day_12', 'cat__hour_of_day_13',
       'cat__hour_of_day_14', 'cat__hour_of_day_15',
       'cat__hour_of_day_16', 'cat__hour_of_day_17',
       'cat__hour_of_day_18', 'cat__hour_of_day_19',
       'cat__hour_of_day_20', 'cat__hour_of_day_21',
       'cat__hour_of_day_22', 'cat__hour_of_day_23', 'cat__month_1',
       'cat__month_2', 'cat__month_3', 'cat__month_4', 'cat__month_5',
       'cat__month_6', 'cat__month_7', 'cat__month_8', 'cat__month_9',
       'cat__month_10', 'cat__month_11', 'cat__month_12', 'gen__daytime',
       'gen__rushhour', 'bin__weekday', 'bin__summertime',
       'weather__score'], dtype=object)

## Super easy. Now just change the model as you please

In [7]:
# Baseline: a KNN classifier with default hyperparameters inside the shared pipeline
model_knn = initialize_model_pipeline(KNeighborsClassifier())

# Train on the full frame
model_knn.fit(X=data, y=data['increase_stock'])

# Report precision/recall per class; predictions are made on the same rows used for fitting
print('KNN Score:')
print(
    classification_report(
        y_true=data['increase_stock'],
        y_pred=model_knn.predict(data),
    )
)

KNN Score:
              precision    recall  f1-score   support

           0       0.93      0.96      0.95      1312
           1       0.80      0.65      0.72       288

    accuracy                           0.91      1600
   macro avg       0.86      0.81      0.83      1600
weighted avg       0.90      0.91      0.90      1600



In [8]:
# Start from a fresh pipeline so the search begins with default KNN settings
model_knn = initialize_model_pipeline(KNeighborsClassifier())

# Hyperparameter grid; the 'model__' prefix routes each value to the pipeline's estimator step.
# 'minkowski' is intentionally not listed: with the default p=2 it is the same distance
# as 'euclidean', so including it only repeats every euclidean fit without adding candidates.
param_grid = {
    'model__n_neighbors': [3, 5, 7, 9],  # default=5
    'model__weights': ['uniform', 'distance'],  # default='uniform'
    'model__metric': ['euclidean', 'manhattan'],  # default='minkowski' (p=2, i.e. euclidean)
}

# Grid search for hyperparameter tuning.
# cv=5 is the library default, spelled out here; scoring is left at its default (None),
# which uses the estimator's own score method.
grid_search = GridSearchCV(model_knn, param_grid, cv=5)
grid_search.fit(data, data['increase_stock'])

# Best parameters
best_params = grid_search.best_params_
print("Best parameters found:", best_params)

# GridSearchCV refits the winning configuration on the full data (refit=True by default),
# so reuse that fitted pipeline instead of calling set_params + fit a second time.
model_knn = grid_search.best_estimator_

# NOTE: the report below is computed on the training rows, so it is optimistic;
# grid_search.best_score_ holds the cross-validated score of the chosen configuration.
print('KNN Score:')
print(classification_report(data['increase_stock'], model_knn.predict(data)))

Best parameters found: {'model__metric': 'euclidean', 'model__n_neighbors': 9, 'model__weights': 'uniform'}
KNN Score:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      1312
           1       0.81      0.62      0.70       288

    accuracy                           0.91      1600
   macro avg       0.87      0.79      0.82      1600
weighted avg       0.90      0.91      0.90      1600



### Want to run k-fold cross-validation? Just use the `cross_validate_model` function from utils

In [9]:
from utils import cross_validate_model

# Cross-validate the tuned KNN pipeline with the helper's default number of folds;
# the returned array of scores is the cell output.
cross_validate_model(
    model_knn,
    data,
    data['increase_stock'],
)

array([0.875   , 0.896875, 0.871875, 0.890625, 0.90625 ])

#### Want to run it with more folds?

In [None]:
# One split per row (n_splits equals the number of rows), i.e. a leave-one-out style run.
# NOTE(review): this presumably refits the pipeline once per row, so expect it to be slow —
# confirm that cross_validate_model accepts this many splits.
leave_out_one = cross_validate_model(
    model_knn,
    data,
    data['increase_stock'],
    n_splits=len(data),
)

In [None]:
# Average the scores returned by the leave-one-out run to get a single summary number.
np.mean(leave_out_one)

#### Want to bootstrap your coefficients?
##### Use the bootstrap_model function to get the full range of each coefficient (average them to bag)