In [24]:
import pandas as pd
import numpy as np
from src.utils import initialize_model_pipeline, load_data, cross_validate_model

#### Naive Model

#### Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default hyperparameters, wrapped by
# the project helper `initialize_model_pipeline` (imported from src.utils).
log_model = initialize_model_pipeline(
    LogisticRegression(),
)

#### Discriminant analysis

In [27]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
# Linear and quadratic discriminant analysis, both with default settings and
# both wrapped by the same project helper as the other candidates.
lda_model = initialize_model_pipeline(
    LinearDiscriminantAnalysis(),
)
qda_pipeline = initialize_model_pipeline(
    QuadraticDiscriminantAnalysis(),
)

#### The k-Nearest Neighbours Method

In [28]:
from sklearn.neighbors import KNeighborsClassifier
# k-NN classifier: 14 neighbours, Manhattan distance, every neighbour's vote
# counted equally.
knn_model = initialize_model_pipeline(
    KNeighborsClassifier(
        n_neighbors=14,
        metric='manhattan',
        weights='uniform',
    )
)

#### Tree-based methods 

In [26]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 27 entropy-split trees, each limited to depth 7 and
# requiring at least 5 samples to split a node.
forest_model = initialize_model_pipeline(
    RandomForestClassifier(
        n_estimators=27,
        max_depth=7,
        min_samples_split=5,
        criterion='entropy',
        random_state=123,  # fixed seed so repeated runs build the same forest
    )
)

#### Boosting

## Choose the model with the best cross-validated accuracy

In [29]:
from os import cpu_count
# Cross-validate every candidate pipeline and collect its mean accuracy.
data = load_data()

# Each label is stored next to its pipeline so the two cannot drift out of
# order (previously two parallel lists were ordered differently, which
# attached scores to the wrong model names).
# TODO: add naive first and boosting last
labelled_models = [
    ('Logistic Regression', log_model),
    ('Random Forest', forest_model),
    ('LDA', lda_model),
    ('QDA', qda_pipeline),
    ('KNN', knn_model),
]
# Kept as separate names for any later cell that still refers to them.
labels = [label for label, _ in labelled_models]
models = [model for _, model in labelled_models]

results = []
for i, (label, model) in enumerate(labelled_models):
    print(f'running model {i}/{len(models)}')
    # n_splits=len(data) requests one split per row (presumably leave-one-out;
    # confirm against cross_validate_model).
    # NOTE(review): the whole frame, including 'increase_stock', is passed as
    # the second argument - confirm the helper/pipeline excludes the target
    # column from the features.
    cvs = cross_validate_model(model, data, data['increase_stock'], n_splits=len(data), cpu_count=cpu_count())
    accuracy = np.mean(cvs)
    results.append({'Model': label, 'Accuracy': accuracy})
    print(f'Acc: {accuracy}')

running model 0/5
Acc: 0.890625
running model 1/5
Acc: 0.875625
running model 2/5
Acc: 0.8725
running model 3/5




Acc: 0.576875
running model 4/5
Acc: 0.8725


In [30]:
# Display the collected model/accuracy records as a table.
pd.DataFrame(results) # TODO: get rid of index

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.890625
1,Random Forest,0.875625
2,LDA,0.8725
3,QDA,0.576875
4,KNN,0.8725
