# Machine Learning Classifiers
***

#### Table of Contents:
- [Library Imports](#Library-Imports)
- [Preprocessing the Data](#Preprocessing-the-Data)
- [Models](#Models)
    - [Logistic Regression](#Logistic-Regression)
    - [Decision Tree](#Decision-Tree)
    - [XGBoost](#XGBoost)
- [Looping Through the Data](#Looping-Through-the-Data)
- [Results](#Results)


### Library Imports
***

In [2]:
import requests
import io
import pandas as pd
import numpy as np
import pandas_ta as ta
import matplotlib.pyplot as plt

%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, accuracy_score, recall_score, precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from os import listdir
from os.path import isfile, join

import warnings
warnings.filterwarnings("ignore")

### Preprocessing the Data
***

In [3]:
# This function removes highly correlated variables within the dataset. The function looks at the 
# Pearson Correlation Coefficient between every pair of variables in the dataset and returns all 
# of those between .8 and .999. While working with data from different companies, I found that by
# removing the first 90% of correlated variables within the DataFrame, in almost all cases, there
# were no longer correlations within the dataset.

def remove_corrs(df):
    """Return a copy of ``df`` without most of its highly correlated columns.

    The pairwise coefficients from ``df.corr()`` are flattened into one row per
    column pair. Pairs whose coefficient lies strictly between .8 and .999 are
    ordered from strongest to weakest and every second row is kept. The
    first-column names of the kept rows are ranked by how often they occur and
    the leading 90% (rounded) of that ranking is dropped.
    """
    pairs = df.corr().stack().reset_index()
    pairs.columns = ['1', '2', 'R2']

    # The upper bound excludes each column's correlation with itself.
    in_band = (pairs['R2'] > .8) & (pairs['R2'] < .999)
    ranked = pairs[in_band].sort_values('R2', ascending=False).reset_index(drop=True)

    # The correlation matrix is symmetric, so each pair appears twice; taking
    # every other row is meant to keep one of the two (assumes the mirrored
    # rows end up next to each other after sorting).
    one_per_pair = ranked.iloc[::2]

    candidates = one_per_pair['1'].value_counts().index
    n_drop = round(len(candidates) * .9)

    return df.drop(candidates[:n_drop], axis=1)

In [7]:
# This function preprocesses my data so it is ready for distance-based classifiers.

def preprocess_data(df, target_var = 'close'):
    """Turn a raw company DataFrame into scaled, chronological train/test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the feature columns and ``target_var``. Its index must
        be parseable by ``pd.to_datetime``. Rows are presumably already in
        chronological order (the split below is positional) -- verify against
        the input files.
    target_var : str, default 'close'
        Name of the column used as the dependent variable.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized, one-day-lagged features with highly correlated columns
        removed. The first 80% of rows form the training set, the rest the
        test set.
    y_train, y_test : pandas.Series
        ``target_var`` values for the same rows as the matching feature frame.

    Raises
    ------
    ValueError
        If there are too few usable rows to form both a training and a test
        set.

    Notes
    -----
    NOTE(review): the models in this notebook are classifiers scored on
    accuracy, so ``target_var`` should hold class labels; confirm that the
    default ``'close'`` column is not a raw continuous price.
    """
    # Work on a copy so the caller's DataFrame keeps its original index.
    df = df.copy()
    df.index = pd.to_datetime(df.index)

    # Shift every independent variable back one day so that a row only holds
    # information that was available before the day being predicted (no data
    # leakage).
    shifted = df.shift(1)

    # Keep only rows without missing values. The same positional mask is
    # applied to the target so features and target stay aligned even when
    # rows other than the first one contain NaNs.
    keep = shifted.notna().all(axis=1).to_numpy()
    X = shifted[keep]
    y = df[target_var][keep]

    # Drop highly correlated feature columns (see remove_corrs).
    X_data = remove_corrs(X)

    # Chronological split: first 80% of the rows for training, the remainder
    # for testing. Slicing from train_num avoids `iloc[-0:]`, which would
    # return every row when the test size rounds to zero.
    train_num = round(len(X) * .8)
    test_num = len(X) - train_num
    print(train_num, test_num)

    if train_num == 0 or test_num == 0:
        raise ValueError(
            f'Not enough usable rows ({len(X)}) to build both a training and a test set.'
        )

    X_train = X_data.iloc[:train_num]
    X_test = X_data.iloc[train_num:]
    y_train = y.iloc[:train_num]
    y_test = y.iloc[train_num:]

    # Scale the features so that variables with larger absolute values don't
    # dominate. The scaler is fitted on the training set only and then
    # re-used on the test set.
    ss = StandardScaler()
    X_train_scaled = pd.DataFrame(ss.fit_transform(X_train),
                                  columns=X_train.columns, index=X_train.index)
    X_test_scaled = pd.DataFrame(ss.transform(X_test),
                                 columns=X_test.columns, index=X_test.index)

    # Return the scaled frames (previously they were computed and discarded).
    return X_train_scaled, X_test_scaled, y_train, y_test

## Models
***

### Logistic Regression

In [None]:
def logreg(X_train, X_test, y_train, y_test, cv=5):
    """Grid-search a logistic regression and report its accuracy.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test features.
    y_train, y_test : array-like
        Training and test labels.
    cv : int, default 5
        Number of folds used both inside the grid search and for the outer
        cross-validation score.

    Returns
    -------
    list
        ``['logistic regression', train_score, cv_results, test_score]``.
    """
    # Set GridSearchCV hyperparameters to compare & select
    grid = {
        'penalty': ['l1', 'l2', 'elasticnet'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    }

    # Base model that GridSearchCV clones for every parameter combination
    grid_logreg = LogisticRegression(random_state=42)

    # Instantiate & fit GridSearchCV with accuracy scoring. Not every
    # penalty/solver pair is supported; error_score=0 scores a failing
    # combination as 0 instead of aborting the search.
    gs = GridSearchCV(estimator=grid_logreg, param_grid=grid, cv=cv,
                      scoring='accuracy', n_jobs=-1, error_score=0)
    gs.fit(X_train, y_train)

    # Cross-validate the whole search: the grid search is re-run inside each
    # of the cv outer folds.
    cv_results = cross_val_score(gs, X_train, y_train, cv=cv).mean()

    # Accuracy of the refitted best model on the train and test sets
    train_score = gs.score(X_train, y_train)
    test_score = gs.score(X_test, y_test)

    print(f'Train Mean Accuracy: {train_score}')
    print(f'Mean Cross-Val Accuracy: {cv_results}')
    print(f'Test Mean Accuracy: {test_score}')

    # Plot Confusion Matrix
    plot_confusion_matrix(gs, X_train, y_train)
    plot_confusion_matrix(gs, X_test, y_test)

    # Label the row with this model's own name (it previously said
    # 'random forest').
    results = ['logistic regression', train_score, cv_results, test_score]

    return results

### Decision Tree

In [6]:
def dtree(X_train, X_test, y_train, y_test, cv=5):
    """Grid-search a decision tree classifier and report its accuracy.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test features.
    y_train, y_test : array-like
        Training and test labels.
    cv : int, default 5
        Number of folds used both inside the grid search and for the outer
        cross-validation score.

    Returns
    -------
    list
        ``['decision tree', train_score, cv_results, test_score]``.
    """
    # Set GridSearchCV hyperparameters to compare & select
    grid = {
        'max_depth': [3, 10, 15],
        'min_samples_split': [2, 8, 10, 15],
        'criterion': ['gini', 'entropy'],
    }

    # Base model for the search. It is not fitted here because GridSearchCV
    # fits its own clones; the fixed seed makes runs repeatable, matching the
    # other models in this notebook.
    grid_dt = DecisionTreeClassifier(random_state=42)

    # Instantiate & fit GridSearchCV with accuracy scoring
    gs = GridSearchCV(estimator=grid_dt, param_grid=grid, cv=cv,
                      scoring='accuracy', n_jobs=-1)
    gs.fit(X_train, y_train)

    # Cross-validate the whole search: the grid search is re-run inside each
    # of the cv outer folds.
    cv_results = cross_val_score(gs, X_train, y_train, cv=cv).mean()

    # Accuracy of the refitted best model on the train and test sets
    train_score = gs.score(X_train, y_train)
    test_score = gs.score(X_test, y_test)

    print(f'Train Mean Accuracy: {train_score}')
    print(f'Mean Cross-Val Accuracy: {cv_results}')
    print(f'Test Mean Accuracy: {test_score}')

    # Plot Confusion Matrix
    plot_confusion_matrix(gs, X_train, y_train)
    plot_confusion_matrix(gs, X_test, y_test)

    results = ['decision tree', train_score, cv_results, test_score]

    return results

### XGBoost

In [8]:
def xgboost(X_train, X_test, y_train, y_test, cv=5):
    """Grid-search an XGBoost classifier and report its accuracy.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test features.
    y_train, y_test : array-like
        Training and test labels.
    cv : int, default 5
        Number of folds used both inside the grid search and for the outer
        cross-validation score.

    Returns
    -------
    list
        ``['xgboost', train_score, cv_results, test_score]``.
    """
    # Set GridSearchCV hyperparameters to compare & select. Only the learning
    # rate and the number of estimators are actually varied.
    grid = {
        'learning_rate': [.01, .05, .1, .5, 1],
        'max_depth': [4],
        'min_child_weight': [3],
        'subsample': [1],
        'n_estimators': [100, 500],
    }

    # Base model that GridSearchCV clones for every parameter combination
    xgb = XGBClassifier(verbosity=0, random_state=42)

    # Instantiate & fit GridSearchCV with accuracy scoring
    gs = GridSearchCV(estimator=xgb, param_grid=grid, cv=cv,
                      scoring='accuracy', n_jobs=-1)
    gs.fit(X_train, y_train)

    # Cross-validate the whole search: the grid search is re-run inside each
    # of the cv outer folds.
    cv_results = cross_val_score(gs, X_train, y_train, cv=cv).mean()

    # Accuracy of the refitted best model on the train and test sets
    train_score = gs.score(X_train, y_train)
    test_score = gs.score(X_test, y_test)

    print(f'Train Mean Accuracy: {train_score}')
    print(f'Mean Cross-Val Score: {cv_results}')
    print(f'Test Mean Accuracy: {test_score}')

    # Plot Confusion Matrix
    plot_confusion_matrix(gs, X_train, y_train)
    plot_confusion_matrix(gs, X_test, y_test)

    results = ['xgboost', train_score, cv_results, test_score]

    return results

### Looping Through the Data
***
This is the code that was used to loop through the data and get the final csv files with the classification results. Due to time and processing power constraints, the code was run using [Google Colab](https://colab.research.google.com/?utm_source=scs-index).

In [10]:
# Get the CSV files in the FormattedData folder. Keeping only real files with
# a .csv extension skips entries such as the .ipynb_checkpoints directory
# without relying on the (arbitrary) order in which listdir returns them.
data_dir = 'FormattedData'
files = sorted(f for f in listdir(data_dir)
               if isfile(join(data_dir, f)) and f.endswith('.csv'))

# Create a list of companies that can be analyzed. Only the final extension is
# stripped, so names that contain a dot still map back to their file name.
companies = [f.rsplit('.', 1)[0] for f in files]

# Create a list of random companies to analyze from within the formatted
# companies: five of them, or all of them when fewer than five are available.
test_companies = np.random.choice(companies, min(5, len(companies)), replace = False)

In [6]:
# Loop through the companies and run the different algorithms. Build a list of
# DataFrames that can then be used to compare results across different
# companies and algorithms.

results = []

for c in test_companies:
    csv_for_df = f'FormattedData/{c}.csv'
    df = pd.read_csv(csv_for_df, index_col = 0)
    X_train, X_test, y_train, y_test = preprocess_data(df)

    # NOTE(review): this loop used to call random_forest() as well, but no
    # such function is defined in this notebook, so the call raised a
    # NameError. Add it back here once the function exists.
    logreg_results = logreg(X_train, X_test, y_train, y_test)
    dtree_results = dtree(X_train, X_test, y_train, y_test)
    xgboost_results = xgboost(X_train, X_test, y_train, y_test)

    # One row per model for this company
    c_results = pd.DataFrame([logreg_results, dtree_results, xgboost_results],
            columns = ['Model Type', 'Train Accuracy', 'Cross Val Accuracy','Test Accuracy'])
    c_results['company'] = c.split('_')[0]
    results.append(c_results)

NameError: name 'test_companies' is not defined

### Results