***COGS 118A FINAL PROJECT:***

In [70]:
#Imports
import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo 
import scipy.io as sio
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import warnings; warnings.simplefilter('ignore')

In [88]:
  
# fetch heart disease dataset
heart_disease = fetch_ucirepo(id=45) 
  
# data (as pandas dataframes) 
hd_X = heart_disease.data.features
hd_Y = heart_disease.data.targets 
  
#According to the repository, the target 0 indicates no presence of heart disease, while any integer 1,2,3, or 4
#indicates the presence of heart disease. So, we will simplify this into a binary classification with positive and
#negative classes by changing all nonzero targets to +1, and all zero targets to -1.
hd_Y = hd_Y.replace(to_replace=0, value=-1)
hd_Y = hd_Y.replace(to_replace=[1,2,3,4], value=1)
#Our dataset also has missing values. Let's see how many.
num_rows_nan = hd_X.isna().sum()
#print(num_rows_nan)
#Since there are very few nan values, we will just drop those data samples.
nan_index = hd_X.loc[~hd_X.index.isin(hd_X.dropna().index)].index
hd_X = pd.get_dummies(hd_X)
hd_X = hd_X.dropna()
hd_Y = hd_Y.drop(index=nan_index)
#print(hd_X)

#Here, we scale all of our data to be on a normal Gaussian distribution, so they have equal mean and variance. This will ensure
#That each variable in our model has equal weight since it is from the same distribution, thus letting our algorithms perform better.
scaler = StandardScaler().set_output(transform="pandas")
hd_X = scaler.fit_transform(X=hd_X)
print(hd_X.shape)

(297, 13)


In [90]:
    
# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
# data (as pandas dataframes) 
bc_X = breast_cancer_wisconsin_diagnostic.data.features 
bc_Y = breast_cancer_wisconsin_diagnostic.data.targets 

num_rows_nan = bc_X.isna().sum()

#Since there are very few nan values, we will just drop those data samples.
nan_index = bc_X.loc[~bc_X.index.isin(bc_X.dropna().index)].index
bc_X = bc_X.dropna()
bc_Y = bc_Y.drop(index=nan_index)

scaler = StandardScaler().set_output(transform="pandas")
bc_X = scaler.fit_transform(X=bc_X)
print(bc_X.shape)

(569, 30)


In [89]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
ad_X = adult.data.features 
ad_Y = adult.data.targets 
  
# metadata 
#print(adult.metadata) 
  
# variable information 
#print(adult.variables) 
ad_Y = ad_Y.replace(to_replace=['<=50K', '<=50K.'], value=-1)
ad_Y = ad_Y.replace(to_replace=['>50K.', '>50K'], value=1)

num_rows_nan = ad_X.isna().sum()

#Since there are very few nan values, we will just drop those data samples.
nan_index = ad_X.loc[~ad_X.index.isin(ad_X.dropna().index)].index
ad_X=pd.DataFrame(ad_X)
ad_X = ad_X.dropna()
ad_X = pd.get_dummies(ad_X)
ad_Y = ad_Y.drop(index=nan_index)

#Here, we scale all of our data to be on a normal Gaussian distribution, so they have equal mean and variance. This will ensure
#That each variable in our model has equal weight since it is from the same distribution, thus letting our algorithms perform better.
scaler = StandardScaler().set_output(transform="pandas")
ad_X = scaler.fit_transform(X=ad_X)
print(ad_X.shape)


(47621, 108)


In [59]:
#Now, let's get our training and test datasets
from sklearn.model_selection import train_test_split

X = ad_X
Y = ad_Y
test_split = 0.2

In [60]:
hd_X_train, hd_X_test, hd_Y_train, hd_Y_test = train_test_split(X, Y, test_size=test_split)

#sklearn's Logistic Regression
from sklearn.linear_model import LogisticRegression

logistic_classifier = LogisticRegression()
param_grid = [{'C': [1e-8,1e-7,1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1,1e2,1e3,1e4]}]
grid_search = GridSearchCV(estimator=logistic_classifier, param_grid=param_grid, cv=3)
grid_search.fit(X=hd_X_train, y=hd_Y_train)
print("Best parameters", grid_search.best_params_)
Y_pred = grid_search.predict(hd_X_test)
print("Validation Accuracy:", max(grid_search.cv_results_['mean_test_score']))
print("Training Accuracy:", grid_search.score(hd_X_train, hd_Y_train))
print("Test Accuracy:", grid_search.score(hd_X_test, hd_Y_test))

Best parameters {'C': 100.0}
Validation Accuracy: 0.8504306469100337
Training Accuracy: 0.8517954640907182
Test Accuracy: 0.8532283464566929


In [61]:
hd_X_train, hd_X_test, hd_Y_train, hd_Y_test = train_test_split(X, Y, test_size=test_split)

#sklearn's Artificial Neural Network (ANN)
from sklearn.model_selection import cross_validate 
from sklearn.neural_network import MLPClassifier

param_grid = [{'hidden_layer_sizes': [(1,1),(2,2),(4,4),(8,8),(16,16)]}]

MLP_classifier = MLPClassifier(random_state=1, solver='sgd')
grid_search = GridSearchCV(estimator=MLPClassifier(), param_grid=param_grid, cv=3,
                           scoring='accuracy')
grid_search.fit(X=hd_X_train, y=hd_Y_train)
print("Best parameters", grid_search.best_params_)
Y_pred = grid_search.predict(hd_X_test)
print("Validation Accuracy:", max(grid_search.cv_results_['mean_test_score']))
print("Training Accuracy:", grid_search.score(hd_X_train, hd_Y_train))
print("Test Accuracy:", grid_search.score(hd_X_test, hd_Y_test))


Best parameters {'hidden_layer_sizes': (4, 4)}
Validation Accuracy: 0.8504567695993234
Training Accuracy: 0.8548404031919362
Test Accuracy: 0.8548031496062992


In [62]:
hd_X_train, hd_X_test, hd_Y_train, hd_Y_test = train_test_split(X, Y, test_size=test_split)

#sklearn's KNN classifier
from sklearn.neighbors import KNeighborsClassifier

param_grid = [{'n_neighbors': np.linspace(start=1,stop=X.shape[0], num=26, dtype=int)}]
print(param_grid)
knn_classifier = KNeighborsClassifier()
grid_search = GridSearchCV(estimator=knn_classifier, param_grid=param_grid, cv=3)
grid_search.fit(X=hd_X_train, y=hd_Y_train)
print("Best parameters", grid_search.best_params_)
Y_pred = grid_search.predict(hd_X_test.values)
print("Validation Accuracy:", max(grid_search.cv_results_['mean_test_score']))
print("Training Accuracy:", grid_search.score(X=hd_X_train.values, y=hd_Y_train.values))
print("Test Accuracy:", grid_search.score(hd_X_test.values, hd_Y_test.values))


[{'n_neighbors': array([    1,  1905,  3810,  5715,  7620,  9525, 11429, 13334, 15239,
       17144, 19049, 20953, 22858, 24763, 26668, 28573, 30477, 32382,
       34287, 36192, 38097, 40001, 41906, 43811, 45716, 47621])}]
Best parameters {'n_neighbors': 1}
Validation Accuracy: nan
Training Accuracy: 0.9999212515749685
Test Accuracy: 0.7865616797900262


In [63]:
hd_X_train, hd_X_test, hd_Y_train, hd_Y_test = train_test_split(X, Y, test_size=test_split)

#sklearn's Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier()
param_grid = [{'max_depth': np.arange(1,20)}]
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3)
grid_search.fit(X=hd_X_train, y=hd_Y_train)
print("Best parameters", grid_search.best_params_)
Y_pred = grid_search.predict(hd_X_test)
print("Validation Accuracy:", max(grid_search.cv_results_['mean_test_score']))
print("Training Accuracy:", grid_search.score(hd_X_train, hd_Y_train))
print("Test Accuracy:", grid_search.score(hd_X_test, hd_Y_test))


Best parameters {'max_depth': 17}
Validation Accuracy: 0.863555182540834
Training Accuracy: 0.8913271734565309
Test Accuracy: 0.853753280839895


In [64]:
hd_X_train, hd_X_test, hd_Y_train, hd_Y_test = train_test_split(X, Y, test_size=test_split)

#sklearn's Decision Tree
tree_classifier = tree.DecisionTreeClassifier(criterion='entropy')
D_list = np.arange(1,20)
param_grid = {'max_depth': D_list}
grid_search = GridSearchCV(estimator=tree_classifier,param_grid=param_grid,cv=3)
grid_search.fit(X=hd_X_train, y=hd_Y_train)
print("Best parameters", grid_search.best_params_)
Y_pred = grid_search.predict(hd_X_test)
print("DT Validation Accuracy:", max(grid_search.cv_results_['mean_test_score']))
print("DT Training Accuracy:", grid_search.score(hd_X_train, hd_Y_train))
print("TEST ACCURACY:", grid_search.score(hd_X_test, hd_Y_test))


Best parameters {'max_depth': 9}
DT Validation Accuracy: 0.8540004156560791
DT Training Accuracy: 0.8605365392692146
TEST ACCURACY: 0.8558530183727034


In [82]:
def find_accuracies(estimators, param_grids, test_splits, X, Y):
    for test_split in test_splits:
        train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=test_split)
        print("***TEST PROPORTION:***", test_split)
        print()
        i = 0
        for estimator, param_grid in zip(estimators, param_grids):
            if i==0:
                print("DECISION TREE")
            elif i==1:
                print("RANDOM FOREST")
            elif(i==2):
                print("LOGISTIC REGRESSION")
            elif(i==3):
                print("ANN")
            elif(i==4):
                print("KNN")
            grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=3)
            grid_search.fit(X=train_X.values, y=train_Y.values)
            print("TRAINING ACCURACY:", grid_search.score(train_X.values, train_Y.values))
            print("VALIDATION ACCURACY:", max(grid_search.cv_results_['mean_test_score']))
            print("TEST ACCURACY:", grid_search.score(test_X.values, test_Y.values))
            print("BEST PARAMETERS:", grid_search.best_params_)
            print()
            i+=1

In [87]:
estimators = [tree.DecisionTreeClassifier(), RandomForestClassifier(), LogisticRegression(), MLPClassifier(),KNeighborsClassifier()]
test_splits = [0.2, 0.5, 0.8]
param_grids = []
param_grids.append({'max_depth': np.arange(1,20)})
param_grids.append({'max_depth': np.arange(1,20)})
param_grids.append({'C': [1e-8,1e-7,1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1,1e2,1e3,1e4]})
param_grids.append({'hidden_layer_sizes': [(1,1),(2,2),(4,4),(8,8),(16,16)]})
param_grids.append({'n_neighbors': np.linspace(start=1,stop=X.shape[0], num=26, dtype=int)})

find_accuracies(estimators=estimators, param_grids=param_grids, test_splits=test_splits, X=bc_X, Y=bc_Y)

***TEST PROPORTION:*** 0.2

DECISION TREE
TRAINING ACCURACY: 0.9758241758241758
VALIDATION ACCURACY: 0.9297083769025211
TEST ACCURACY: 0.9385964912280702
BEST PARAMETERS: {'max_depth': 3}

RANDOM FOREST
TRAINING ACCURACY: 1.0
VALIDATION ACCURACY: 0.9560822586266992
TEST ACCURACY: 0.9473684210526315
BEST PARAMETERS: {'max_depth': 17}

LOGISTIC REGRESSION
TRAINING ACCURACY: 0.9868131868131869
VALIDATION ACCURACY: 0.9802341117694899
TEST ACCURACY: 0.9736842105263158
BEST PARAMETERS: {'C': 1}

ANN
TRAINING ACCURACY: 0.9934065934065934
VALIDATION ACCURACY: 0.9758481468572092
TEST ACCURACY: 0.9736842105263158
BEST PARAMETERS: {'hidden_layer_sizes': (16, 16)}

KNN
TRAINING ACCURACY: 1.0
VALIDATION ACCURACY: 0.9604537004763566
TEST ACCURACY: 0.9298245614035088
BEST PARAMETERS: {'n_neighbors': 1}

***TEST PROPORTION:*** 0.5

DECISION TREE
TRAINING ACCURACY: 1.0
VALIDATION ACCURACY: 0.9436356849570734
TEST ACCURACY: 0.9192982456140351
BEST PARAMETERS: {'max_depth': 13}

RANDOM FOREST
TRAINING AC

In [None]:
estimators = [tree.DecisionTreeClassifier(), RandomForestClassifier(), LogisticRegression(), MLPClassifier(),KNeighborsClassifier()]
test_splits = [0.2, 0.5, 0.8]
param_grids = []
param_grids.append({'max_depth': np.arange(1,20)})
param_grids.append({'max_depth': np.arange(1,20)})
param_grids.append({'C': [1e-8,1e-7,1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1,1e2,1e3,1e4]})
param_grids.append({'hidden_layer_sizes': [(1,1),(2,2),(4,4),(8,8),(16,16)]})
param_grids.append({'n_neighbors': np.linspace(start=1,stop=X.shape[0], num=26, dtype=int)})

find_accuracies(estimators=estimators, param_grids=param_grids, test_splits=test_splits, X=hd_X, Y=hd_Y)

In [None]:
estimators = [tree.DecisionTreeClassifier(), RandomForestClassifier(), LogisticRegression(), MLPClassifier(),KNeighborsClassifier()]
test_splits = [0.2, 0.5, 0.8]
param_grids = []
param_grids.append({'max_depth': np.arange(1,20)})
param_grids.append({'max_depth': np.arange(1,20)})
param_grids.append({'C': [1e-8,1e-7,1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1,1e2,1e3,1e4]})
param_grids.append({'hidden_layer_sizes': [(1,1),(2,2),(4,4),(8,8),(16,16)]})
param_grids.append({'n_neighbors': np.linspace(start=1,stop=X.shape[0], num=26, dtype=int)})

find_accuracies(estimators=estimators, param_grids=param_grids, test_splits=test_splits, X=bc_X, Y=bc_Y)