In [11]:
import pandas as pd
import numpy as np
from os import cpu_count
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier 

In [2]:
from src.utils import load_data

# Load the project dataset once; `data` is reused by the cells below.
data = load_data()

# Peek at the target column: factorize returns (integer codes, unique labels).
data['increase_stock'].factorize()

(array([0, 0, 0, ..., 0, 1, 0], dtype=int64), Index([0, 1], dtype='int64'))

# initialize_model_pipeline handles all the preprocessing
## (visualize the pipeline by leaving the variable as the last line of the cell)

In [3]:
from src.utils import initialize_model_pipeline

# Hand a seeded logistic regression to the project's pipeline factory.
model = initialize_model_pipeline(
    LogisticRegression(max_iter=10_000, random_state=123)
)

# Bare last expression so the notebook renders the resulting object.
model

## Call fit just like you would any other model

In [4]:
# Fit on the frame without the target column; the target is passed separately.
model.fit(data.drop(columns='increase_stock'), data['increase_stock'])

# Score on the same rows used for fitting, so this is an in-sample figure.
# NOTE(review): X here still contains 'increase_stock'; presumably the pipeline
# ignores that column -- confirm in src.utils.
model.score(X=data, y=data['increase_stock'])

0.89875

### Since everything is in the pipeline, you can even pass in just the original data, and it will be processed the same way every time (for the X variables)
#### Note: this also means you don't get to choose features anymore

In [7]:
# Refit using the unmodified frame as X (target column included).
model.fit(data, data['increase_stock'])

# Predict on the same frame; the bare expression displays the label array.
model.predict(data)

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

## Super easy. Now just change the model as you please

In [19]:
from sklearn.metrics import classification_report

# Same pipeline factory, different estimators. The forest is seeded (same seed
# as the logistic regression cell) so reruns build the same trees, and
# n_jobs=-1 asks scikit-learn for all cores without relying on cpu_count(),
# which may return None.
model_reg = initialize_model_pipeline(LinearRegression())
model_tree = initialize_model_pipeline(
    RandomForestClassifier(n_jobs=-1, random_state=123)
)

model_reg.fit(data, data['increase_stock'])
model_tree.fit(data, data['increase_stock'])

# Linear regression output is unbounded, so clip the rounded predictions to
# the valid labels {0, 1}; otherwise stray values such as -1 or 2 would show
# up as extra classes in the report.
reg_labels = np.clip(np.round(model_reg.predict(data)), 0, 1).astype(int)

# Both reports are computed on the rows used for fitting, so they are
# in-sample scores and will look optimistic.
print('Lin. Reg. Score:')
print(classification_report(data['increase_stock'], reg_labels))
print('Trees Score:')
print(classification_report(data['increase_stock'], model_tree.predict(data)))

Lin. Reg. Score:
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      1312
           1       0.76      0.51      0.61       288

    accuracy                           0.88      1600
   macro avg       0.83      0.74      0.77      1600
weighted avg       0.87      0.88      0.87      1600

Trees Score:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1312
           1       1.00      1.00      1.00       288

    accuracy                           1.00      1600
   macro avg       1.00      1.00      1.00      1600
weighted avg       1.00      1.00      1.00      1600


### Want to run K-fold cross-validation? Just use the utils function

In [20]:
from src.utils import cross_validate_model

# Cross-validate the forest pipeline with the helper's default number of
# folds; the bare expression displays the returned scores.
cross_validate_model(
    model_tree,
    data,
    data['increase_stock'],
)

array([0.8625  , 0.884375, 0.840625, 0.91875 , 0.878125])

#### Want to run it with more folds?

In [22]:
# One split per row of `data`, i.e. leave-one-out cross-validation.
# NOTE(review): presumably this refits the pipeline once per row, so expect
# this cell to be slow -- confirm against cross_validate_model.
leave_out_one = cross_validate_model(
    model_tree,
    data,
    data['increase_stock'],
    n_splits=data.shape[0],
)

In [23]:
# Average of the per-split scores from the leave-one-out run.
leave_out_one.mean()

0.87

#### Want to bootstrap your coefficients?
##### Use the bootstrap_model function to get the ranges of your coefficients (average them to bag)