In [1]:
import sklearn 
import numpy as np 
import pandas as pd
import plotly.express as px

In [2]:
import sklearn 
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("FEATURES_FINAL.csv")
X_df = df.iloc[:, 3:]
y_df = df.iloc[:, 2]

In [7]:
# Normalize Data 
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
X_df = std.fit_transform(X_df)

In [9]:
# Create Validation Set
X_train, X_val, y_train, y_val = train_test_split(X_df, y_df, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [10]:
def model_eval(algorithm):
    '''
        function that takes in a classifier model and returns metrics based on prediciton of validation data 
        parameters:
                    algorithm is classifier with paramaters (e.g, sklearn.naive_bayes.MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None))
        returns: 
                    accuracy, macro F1, and weighted F1 scores
    ''' 
    
    # Fit the Model
    alg = algorithm 
    alg.fit(X_train, y_train)
    
    # Predict y_val from X_val
    y_val_pred = alg.predict(X_val)
    
    # Calculate accuracy, macro F1, and weighted F1 metrics
    acc = sklearn.metrics.accuracy_score(y_val, y_val_pred, normalize=True, sample_weight=None)
    mf1 = sklearn.metrics.f1_score(y_val, y_val_pred, average='macro') 
    wf1 = sklearn.metrics.f1_score(y_val, y_val_pred, average='weighted') 
    
    return acc, mf1, wf1

# Initial Models

In [11]:
# Random Forest:
from sklearn import ensemble 
rf = sklearn.ensemble.RandomForestClassifier() # make 10, bc takes long?
rf_acc, rf_macro, rf_wei = model_eval(rf) 
print("\n Random Forest ", rf_acc, rf_macro, rf_wei)

# K Nearest Neighbors 
from sklearn import neighbors 
knn = sklearn.neighbors.KNeighborsClassifier() 
knn_acc, knn_macro, knn_wei = model_eval(knn) 
print("\n K Nearest Neighbors", knn_acc, knn_macro, knn_wei)

# Decision Tree 
from sklearn import tree 
dt = sklearn.tree.DecisionTreeClassifier() 
dt_acc, dt_macro, dt_wei = model_eval(dt) 
print("\n Decision Tree", dt_acc, dt_macro, dt_wei)


 Random Forest  0.7058823529411765 0.38271604938271603 0.681917211328976

 K Nearest Neighbors 0.7058823529411765 0.27586206896551724 0.6328600405679513

 Decision Tree 0.4117647058823529 0.40404040404040403 0.45632798573975053


# TUNING HYPERPARAMETERS 

In [12]:
# Random Forest -- Tuning Hyperparameters 
from sklearn import ensemble
rf_new = sklearn.ensemble.RandomForestClassifier(n_estimators = 5)
    
# Parameters to Manipulate  
    # Number of Trees
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 5)]
    # Number of features to consider at every split
max_features = ['auto']
    # Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
    # Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
bootstrap = [True, False]

# Create parameter distributions
rf_param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Random Forest -- Random Search
from sklearn.model_selection import RandomizedSearchCV
rf_rand = RandomizedSearchCV(estimator = rf_new, param_distributions = rf_param, cv = 3, verbose = 2, n_jobs = 4)
rf_rand.fit(X_train, y_train)

# Random Forest -- Random Search Results 
print(rf_rand.best_params_)
print(rf_rand.best_estimator_)

# Random Forest -- New Metrics 
rf_new_acc, rf_new_macro, rf_new_wei = model_eval(rf_rand.best_estimator_)
print("\n Hypertuned Random Forest ", rf_new_acc, rf_new_macro, rf_new_wei)

# Create "Final" RF with Best Estimator 
rf_final = rf_rand.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
{'n_estimators': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10, 'bootstrap': True}
RandomForestClassifier(max_depth=10, n_estimators=10)

 Hypertuned Random Forest  0.47058823529411764 0.26956521739130435 0.50076726342711


In [13]:
# K Nearest Neighbors 
from sklearn import neighbors 
knn_new = sklearn.neighbors.KNeighborsClassifier()

# Create parameter distributions 
knn_param = {'leaf_size': list(range(1,50)), 
             'n_neighbors': list(range(1,30)), 
             'p': [1,2]}

# KNN -- Random Search 
knn_rand = RandomizedSearchCV(estimator = knn_new, param_distributions = knn_param, cv = 5)
knn_rand.fit(X_train, y_train)

# KNN -- Random Search Results 
print(knn_rand.best_params_)
print(knn_rand.best_estimator_)

# KNN -- New Metrics 
knn_new_acc, knn_new_macro, knn_new_wei = model_eval(knn_rand.best_estimator_)
print("\n Hypertuned KNN ", knn_new_acc, knn_new_macro, knn_new_wei)

# Create "Final" KNN with Best Estimator 
knn_final = knn_rand.best_estimator_

Traceback (most recent call last):
  File "C:\Users\prsah\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\prsah\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 397, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "C:\Users\prsah\anaconda3\lib\site-packages\sklearn\base.py", line 500, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "C:\Users\prsah\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py", line 197, in predict
    neigh_dist, neigh_ind = self.kneighbors(X)
  File "C:\Users\prsah\anaconda3\lib\site-packages\sklearn\neighbors\_base.py", line 680, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples,  but n_samples = 23, n_neighbors = 28

Traceback (most recent call last):
  File "C:\Users\prsah\anaconda3\lib\site-packages\sklearn\model_selection\_valid

{'p': 2, 'n_neighbors': 19, 'leaf_size': 13}
KNeighborsClassifier(leaf_size=13, n_neighbors=19)

 Hypertuned KNN  0.7647058823529411 0.28888888888888886 0.6627450980392157


In [14]:
# Decision Tree 
from sklearn import tree 
dt_new = sklearn.tree.DecisionTreeClassifier()

# Create parameter distributions 
dt_param = {"max_depth": [3, None],
            "min_samples_leaf": list(range(1,9)),
            "criterion": ["gini", "entropy"]}

# DT -- Random Search 
dt_rand = RandomizedSearchCV(estimator = dt_new, param_distributions = dt_param)
dt_rand.fit(X_train, y_train)

# DT -- Random Search Results 
print(dt_rand.best_params_)
print(dt_rand.best_estimator_)

# DT -- New Metrics 
dt_new_acc, dt_new_macro, dt_new_wei = model_eval(dt_rand.best_estimator_)
print("\n Hypertuned DT ",dt_new_acc, dt_new_macro, dt_new_wei)

# Create "Final" DT with Best Estimator 
dt_final = dt_rand.best_estimator_

{'min_samples_leaf': 3, 'max_depth': 3, 'criterion': 'gini'}
DecisionTreeClassifier(max_depth=3, min_samples_leaf=3)

 Hypertuned DT  0.4117647058823529 0.44949494949494956 0.45365418894830656




In [15]:
print("\n Algorithm Name     ", "Accuracy", "          Macro F1", "          Weighted F1")

rf_fin_acc, rf_fin_macro, rf_fin_wei = model_eval(rf_final)
knn_fin_acc, knn_fin_macro, knn_fin_wei = model_eval(knn_final)
dt_fin_acc, dt_fin_macro, dt_fin_wei = model_eval(dt_final)

print("\n Hypertuned Random Forest ", rf_fin_acc, rf_fin_macro, rf_fin_wei)
print("\n Hypertuned KNN ", knn_fin_acc, knn_fin_macro, knn_fin_wei)
print("\n Hypertuned DT ",dt_fin_acc, dt_fin_macro, dt_fin_wei)


 Algorithm Name      Accuracy           Macro F1           Weighted F1

 Hypertuned Random Forest  0.7058823529411765 0.3772893772893773 0.6974789915966387

 Hypertuned KNN  0.7647058823529411 0.28888888888888886 0.6627450980392157

 Hypertuned DT  0.4117647058823529 0.44949494949494956 0.45365418894830656


In [16]:
# Actual Predictions?: 
print(X_test)
final_predictions = dt_final.predict(X_test)
print(final_predictions)

[[-0.52966672  0.91766294 -0.1435222  -0.85599914 -0.83473955 -0.82736328
  -0.84741546 -0.76807383]
 [-0.13381059  0.91766294 -0.14290671 -0.21072151 -0.51604375 -0.10144859
  -0.58300899 -0.63644819]
 [-0.64837366 -1.3764944  -0.1433753  -0.65465197 -0.72794412  0.52197848
  -0.78314708 -0.71776529]
 [ 0.02066147 -0.22941573 -0.14272447 -0.47910316 -0.17032191 -0.38482452
  -0.12310818  0.31657899]]
[1 2 2 1]
