In [1]:
import os
import pandas as pd
import numpy as np
from sklearn . linear_model import LogisticRegression
from sklearn . preprocessing import StandardScaler 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from tabulate import tabulate
from sklearn . neighbors import KNeighborsClassifier

##set filenames
input_dir = os.getcwd()
root_dir = os.getcwd()

labels_file ='BAC_weekly_return_volatility_labels'
details_file = 'BAC_weekly_return_volatility_detailed'
labels_path_file = os.path.join(input_dir, labels_file+'.csv')
details_path_file = os.path.join(input_dir,details_file+'.csv')


try:
   labels_df = pd.read_csv(labels_path_file)
   details_df = pd.read_csv(details_path_file)
except Exception as e:
  print(e)
  print('Failed to read input files', labels_file)


## split dataframe into train and test
stock2019_df = labels_df[labels_df['Year'].isin([2019])]
stock2020_df = labels_df[labels_df['Year'].isin([2020])]

## Define features
col_names = ['mean_return','volatility']

# Create test and training datasets
stock_train = stock2019_df[col_names]
stock_test = stock2020_df[col_names]

X_train = stock2019_df[col_names]
Y_train = stock2019_df[['label']]
X_test = stock2020_df[col_names]
Y_test = stock2020_df[['label']]

scaler = StandardScaler().fit(X_train)
x_train = scaler.transform(X_train)

all_accuracy_dict = {}

## calculate accuracy of logistic classifier
log_reg_clf = LogisticRegression ()
log_reg_clf.fit (X_train,np.ravel(Y_train))
log_pred = log_reg_clf.predict(X_test)
all_accuracy_dict['Log'] = round(accuracy_score(Y_test,log_pred)*100,2) 

## Calculate accuracy of knn classifier
scalar = StandardScaler()
scalar.fit(X_train)

knn_classifier = KNeighborsClassifier (n_neighbors =3)
knn_classifier.fit ( X_train,np.ravel(Y_train ))
knn_pred = knn_classifier.predict (X_test )
all_accuracy_dict['KNN'] = round(accuracy_score(Y_test,knn_pred)*100,2)

all_accuracy_df = pd.DataFrame(all_accuracy_dict,index=['All Features'])

## calculate accuracy by dropping features
def calc_acc_dropped_features(log_all_accuracy,knn_all_accuracy):
    log_dict = {}
    knn_dict = {}
    for col in col_names:
        dropped_feature =  col_names.copy()
        dropped_feature.remove(col)
        x_train_dropped =  X_train[dropped_feature].values
        x_test_dropped =  X_test[dropped_feature].values
        scaler = StandardScaler().fit(x_train_dropped)
        x_train_dropped = scaler.transform(x_train_dropped)
        scaler = StandardScaler().fit(x_test_dropped)
        x_test_dropped = scaler.transform(x_test_dropped)
        # run logistic on dropped feature dataset 
        log_reg = LogisticRegression()
        log_reg.fit (x_train_dropped,np.ravel(Y_train))
        log_pred = log_reg.predict(x_test_dropped)
        log_dict[col] = log_all_accuracy - round(accuracy_score(Y_test,log_pred)*100,2)
        
        # run knn on dropped feature dataset
        knn_classifier = KNeighborsClassifier (n_neighbors =3)
        knn_classifier.fit ( x_train_dropped,np.ravel(Y_train ))
        knn_pred = knn_classifier.predict (x_test_dropped )
        knn_dict[col] = knn_all_accuracy -round(accuracy_score(Y_test,knn_pred)*100,2)
  
    dropped_knn_df = pd.DataFrame(knn_dict,index = ['KNN']).transpose()
    dropped_log_df = pd.DataFrame(log_dict,index = ['Log']).transpose()
    drop_df = dropped_knn_df.join(dropped_log_df,how="inner")
    frames = [all_accuracy_df,drop_df]
    drop_df = pd.concat(frames)
    return drop_df
   
dropped_df = calc_acc_dropped_features(all_accuracy_df.loc['All Features','Log'], all_accuracy_df.loc['All Features','KNN'])


## print equation
print('\nQuestion 1\n')
print("Feature contribution to accuracy")
print(tabulate(dropped_df,headers = "keys",tablefmt = "fancy_grid"))





Question 1

Feature contribution to accuracy
╒══════════════╤═══════╤════════╕
│              │   Log │    KNN │
╞══════════════╪═══════╪════════╡
│ All Features │ 96.23 │ 100    │
├──────────────┼───────┼────────┤
│ mean_return  │ 43.4  │  49.06 │
├──────────────┼───────┼────────┤
│ volatility   │  7.55 │   7.55 │
╘══════════════╧═══════╧════════╛
