## Import the necessary libraries 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

## Load and Cleanup Datasets

In [4]:
breast_cancer_data = pd.read_csv("COGS118A_FINAL/wdbc.data", header=None)

In [5]:
columns = [
    "ID", "Diagnosis", "mean_radius", "mean_texture", "mean_perimeter", "mean_area", "mean_smoothness", 
    "mean_compactness", "mean_concavity", "mean_concave_points", "mean_symmetry", "mean_fractal_dimension", 
    "se_radius", "se_texture", "se_perimeter", "se_area", "se_smoothness", 
    "se_compactness", "se_concavity", "se_concave_points", "se_symmetry", "se_fractal_dimension", 
    "worst_radius", "worst_texture", "worst_perimeter", "worst_area", "worst_smoothness", 
    "worst_compactness", "worst_concavity", "worst_concave_points", "worst_symmetry", "worst_fractal_dimension"
]

In [6]:
breast_cancer_data.columns = columns

In [7]:
encoder = LabelEncoder()
breast_cancer_data['Diagnosis'] = encoder.fit_transform(breast_cancer_data['Diagnosis'])

In [8]:
breast_cancer_data.head()

Unnamed: 0,ID,Diagnosis,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [9]:
parkinsons_data = pd.read_csv("COGS118A_FINAL/parkinsons.data")

In [10]:
parkinsons_data.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [11]:
heart_data = pd.read_csv("COGS118A_FINAL/heart.dat", sep=" ", header=None)

In [12]:
columns = [
    "age", "sex", "chest_pain_type", "resting_blood_pressure", "serum_cholestoral",
    "fasting_blood_sugar", "resting_electrocardiographic", "max_heart_rate",
    "angina", "oldpeak", "slope", "major_vessels", "thal", "heart_disease"
]

In [13]:
heart_data.columns = columns

In [14]:
heart_data.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_electrocardiographic,max_heart_rate,angina,oldpeak,slope,major_vessels,thal,heart_disease
0,70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,2
1,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,1
2,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,2
3,64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,1
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,1


In [15]:
cancer_surgery = pd.read_csv("COGS118A_FINAL/haberman.data", header=None)

In [16]:
columns2 = ["age", "operation_year", "pos_auxillary_nodes", "survival_status"]

In [17]:
cancer_surgery.columns = columns2

In [18]:
cancer_surgery.head()

Unnamed: 0,age,operation_year,pos_auxillary_nodes,survival_status
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


### The first dataset I will be working with is Breast Cancer Data

Below I will begin by partitioning the data and then I will train it to three classifiers Logistic Regression, Random Forest, and SVM.

In [21]:
X = breast_cancer_data.drop(columns=["ID", "Diagnosis"])

In [22]:
y = breast_cancer_data["Diagnosis"]

In [23]:
X_train_20, X_test_80, y_train_20, y_test_80 = train_test_split(X, y, test_size=0.80, random_state=11) #20/80 split
X_train_50, X_test_50, y_train_50, y_test_50 = train_test_split(X, y, test_size=0.50, random_state=11) #50/50 split
X_train_80, X_test_20, y_train_80, y_test_20 = train_test_split(X, y, test_size=0.20, random_state=11) #80/20 split

Here I will scale my data before applying the Logistic Regression Model

In [25]:
#Scale Data
scalar = StandardScaler()
X_train_20_s = scalar.fit_transform(X_train_20)
X_test_80_s = scalar.transform(X_test_80)

X_train_50_s = scalar.fit_transform(X_train_50)
X_test_50_s = scalar.transform(X_test_50)

X_train_80_s = scalar.fit_transform(X_train_80)
X_test_20_s = scalar.transform(X_test_20)

In [26]:
def logistic_regression(X_train, y_train, X_test, y_test):

    #sklearn logistic regression
    lr = LogisticRegression() 
 
    param_grid = {
     "penalty" : ["l2"],
     "solver" : ["lbfgs"],
     "C": [1, 10, 100],
     "max_iter": [100, 1000, 2000, 5000,10000]
    }

    # cross validation 5-folds
    grid_search = GridSearchCV(lr, param_grid, cv=5, scoring="accuracy", return_train_score=True) 
    best_grid_search = grid_search.fit(X_train, y_train)
    best_lr = best_grid_search.best_estimator_
    
    train_accuracy = best_lr.score(X_train, y_train)
    cross_validation = best_grid_search.best_score_
    y_pred = best_lr.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
                    
    return train_accuracy, test_accuracy, cross_validation

In [27]:
train_accuracy1_lr1, test_accuracy1_lr1, cross_validation1_lr1 = logistic_regression(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_lr1, test_accuracy2_lr1, cross_validation2_lr1 = logistic_regression(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_lr1, test_accuracy3_lr1, cross_validation3_lr1 = logistic_regression(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_lr1, test_accuracy4_lr1, cross_validation4_lr1 = logistic_regression(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_lr1, test_accuracy5_lr1, cross_validation5_lr1 = logistic_regression(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_lr1, test_accuracy6_lr1, cross_validation6_lr1 = logistic_regression(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_lr1, test_accuracy7_lr1, cross_validation7_lr1 = logistic_regression(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_lr1, test_accuracy8_lr1, cross_validation8_lr1 = logistic_regression(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_lr1, test_accuracy9_lr1, cross_validation9_lr1 = logistic_regression(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_lr1 = (train_accuracy1_lr1 + train_accuracy2_lr1 + train_accuracy3_lr1 + train_accuracy4_lr1 + train_accuracy5_lr1 + train_accuracy6_lr1 + train_accuracy7_lr1 + train_accuracy8_lr1 +train_accuracy9_lr1) / 9

cv_average_lr1 = (cross_validation1_lr1 + cross_validation2_lr1 + cross_validation3_lr1 + cross_validation4_lr1 + cross_validation5_lr1 + cross_validation6_lr1 + cross_validation7_lr1 + cross_validation8_lr1 + cross_validation9_lr1) / 9

test_average_lr1 = (test_accuracy1_lr1 + test_accuracy2_lr1 + test_accuracy3_lr1 + test_accuracy4_lr1 + test_accuracy5_lr1 + test_accuracy6_lr1 + test_accuracy7_lr1 + test_accuracy8_lr1 + test_accuracy9_lr1) / 9

results = pd.DataFrame({
    "Parition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_lr1, train_accuracy2_lr1, train_accuracy3_lr1, train_accuracy4_lr1, train_accuracy5_lr1, train_accuracy6_lr1, train_accuracy7_lr1, train_accuracy8_lr1, train_accuracy9_lr1],
    "Test Accuracy": [test_accuracy1_lr1, test_accuracy2_lr1, test_accuracy3_lr1, test_accuracy4_lr1, test_accuracy5_lr1, test_accuracy6_lr1, test_accuracy7_lr1, test_accuracy8_lr1, test_accuracy9_lr1],
    "Cross Validation Accuracy": [cross_validation1_lr1, cross_validation2_lr1, cross_validation3_lr1, cross_validation4_lr1, cross_validation5_lr1, cross_validation6_lr1, cross_validation7_lr1, cross_validation8_lr1, cross_validation9_lr1]
})

averages_lr1 = pd.DataFrame({
    "Train Accuracy Average": [train_average_lr1], 
    "Cross Validation Accuracy Average": [cv_average_lr1],
    "Test Accuracy Average": [test_average_lr1]
})

print(results)
print(averages_lr1)

  Parition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0    20/80        0.991150       0.962719                   0.973518
1    50/50        0.991150       0.962719                   0.973518
2    80/20        0.991150       0.962719                   0.973518
3    20/80        0.996479       0.961404                   0.968358
4    50/50        0.996479       0.961404                   0.968358
5    80/20        0.996479       0.961404                   0.968358
6    20/80        0.986813       0.982456                   0.975824
7    50/50        0.986813       0.982456                   0.975824
8    80/20        0.986813       0.982456                   0.975824
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                0.991481                           0.972567   

   Test Accuracy Average  
0                0.96886  


I will now apply the Random Forest Classifier 

In [29]:
def random_forest(X_train, y_train, X_test, y_test):
    
    rf = RandomForestClassifier()

    param_grid = {"n_estimators": [50, 100, 200],
              "max_depth" : [1, 2, 3, 4, 5], 
              "min_samples_split" : [2, 4, 6, 10]
             }
    
    grid_search = GridSearchCV(rf, param_grid, cv=5, scoring="accuracy")
    best_grid_search = grid_search.fit(X_train, y_train)
    best_rf = grid_search.best_estimator_
    
    train_accuracy = best_rf.score(X_train, y_train)
    y_pred = best_rf.predict(X_test)
    
    test_accuracy = accuracy_score(y_test, y_pred)
    cross_validation = best_grid_search.best_score_

    return train_accuracy, test_accuracy, cross_validation

In [30]:
train_accuracy1_rf1, test_accuracy1_rf1, cross_validation1_rf1 = random_forest(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_rf1, test_accuracy2_rf1, cross_validation2_rf1 = random_forest(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_rf1, test_accuracy3_rf1, cross_validation3_rf1 = random_forest(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_rf1, test_accuracy4_rf1, cross_validation4_rf1 = random_forest(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_rf1, test_accuracy5_rf1, cross_validation5_rf1 = random_forest(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_rf1, test_accuracy6_rf1, cross_validation6_rf1 = random_forest(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_rf1, test_accuracy7_rf1, cross_validation7_rf1 = random_forest(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_rf1, test_accuracy8_rf1, cross_validation8_rf1 = random_forest(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_rf1, test_accuracy9_rf1, cross_validation9_rf1 = random_forest(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_rf1 = (train_accuracy1_rf1 + train_accuracy2_rf1 + train_accuracy3_rf1 + train_accuracy4_rf1 + train_accuracy5_rf1 + train_accuracy6_rf1 + train_accuracy7_rf1 + train_accuracy8_rf1 + train_accuracy9_rf1) / 9

cv_average_rf1 = (cross_validation1_rf1 + cross_validation2_rf1 + cross_validation3_rf1 + cross_validation4_rf1 + cross_validation5_rf1 + cross_validation6_rf1 + cross_validation7_rf1 + cross_validation8_rf1 + cross_validation9_rf1) / 9

test_average_rf1 = (test_accuracy1_rf1 + test_accuracy2_rf1 + test_accuracy3_rf1 + test_accuracy4_rf1 + test_accuracy5_rf1 + test_accuracy6_rf1 + test_accuracy7_rf1 + test_accuracy8_rf1 + test_accuracy9_rf1) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_rf1, train_accuracy2_rf1, train_accuracy3_rf1, train_accuracy4_rf1, train_accuracy5_rf1, train_accuracy6_rf1, train_accuracy7_rf1, train_accuracy8_rf1, train_accuracy9_rf1],
    "Test Accuracy": [test_accuracy1_rf1, test_accuracy2_rf1, test_accuracy3_rf1, test_accuracy4_rf1, test_accuracy5_rf1, test_accuracy6_rf1, test_accuracy7_rf1, test_accuracy8_rf1, test_accuracy9_rf1],
    "Cross Validation Accuracy": [cross_validation1_rf1, cross_validation2_rf1, cross_validation3_rf1, cross_validation4_rf1, cross_validation5_rf1, cross_validation6_rf1, cross_validation7_rf1, cross_validation8_rf1, cross_validation9_rf1]
})

averages_rf1 = pd.DataFrame({
    "Train Accuracy Average": [train_average_rf1], 
    "Cross Validation Accuracy Average": [cv_average_rf1],
    "Test Accuracy Average": [test_average_rf1]
})

print(results)
print(averages_rf1)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80        1.000000       0.938596                   0.920158
1     50/50        1.000000       0.942982                   0.928854
2     80/20        0.991150       0.929825                   0.919763
3     20/80        0.992958       0.964912                   0.954261
4     50/50        0.996479       0.957895                   0.950752
5     80/20        0.992958       0.954386                   0.954261
6     20/80        0.991209       0.973684                   0.958242
7     50/50        0.991209       0.982456                   0.960440
8     80/20        0.991209       0.982456                   0.960440
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                 0.99413                           0.945241   

   Test Accuracy Average  
0               0.958577  


Now, I will perform cross-validation using Support Vector Machine 

In [32]:
def svm(X_train,y_train, X_test, y_test):
    
    svc = SVC()

    param_grid = {
    "C" : [1, 10, 100, 1000],
    "gamma" : [0.001,0.005,0.01,0.05,0.1,0.5,1,2], 
    "kernel" : ["linear", "poly", "rbf"], 
    "degree" : [2]
}

    grid_search = GridSearchCV(svc, param_grid, cv=5, scoring="accuracy")
    best_grid_search = grid_search.fit(X_train, y_train)
    best_svc = grid_search.best_estimator_

    train_accuracy = best_svc.score(X_train, y_train)
    y_pred = best_svc.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    cross_validation = best_grid_search.best_score_

    return train_accuracy, test_accuracy, cross_validation

In [33]:
train_accuracy1_svm1, test_accuracy1_svm1, cross_validation1_svm1 = svm(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_svm1, test_accuracy2_svm1, cross_validation2_svm1 = svm(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_svm1, test_accuracy3_svm1, cross_validation3_svm1 = svm(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_svm1, test_accuracy4_svm1, cross_validation4_svm1 = svm(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_svm1, test_accuracy5_svm1, cross_validation5_svm1 = svm(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_svm1, test_accuracy6_svm1, cross_validation6_svm1 = svm(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_svm1, test_accuracy7_svm1, cross_validation7_svm1 = svm(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_svm1, test_accuracy8_svm1, cross_validation8_svm1 = svm(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_svm1, test_accuracy9_svm1, cross_validation9_svm1 = svm(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_svm1 = (train_accuracy1_svm1 + train_accuracy2_svm1 + train_accuracy3_svm1 + train_accuracy4_svm1 + train_accuracy5_svm1 + train_accuracy6_svm1 + train_accuracy7_svm1 + train_accuracy8_svm1 + train_accuracy9_svm1) / 9

cv_average_svm1 = (cross_validation1_svm1 + cross_validation2_svm1 + cross_validation3_svm1 + cross_validation4_svm1 + cross_validation5_svm1 + cross_validation6_svm1 + cross_validation7_svm1 + cross_validation8_svm1 + cross_validation9_svm1) / 9

test_average_svm1 = (test_accuracy1_svm1 + test_accuracy2_svm1 + test_accuracy3_svm1 + test_accuracy4_svm1 + test_accuracy5_svm1 + test_accuracy6_svm1 + test_accuracy7_svm1 + test_accuracy8_svm1 + test_accuracy9_svm1) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_svm1, train_accuracy2_svm1, train_accuracy3_svm1, train_accuracy4_svm1, train_accuracy5_svm1, train_accuracy6_svm1, train_accuracy7_svm1, train_accuracy8_svm1, train_accuracy9_svm1],
    "Test Accuracy": [test_accuracy1_svm1, test_accuracy2_svm1, test_accuracy3_svm1, test_accuracy4_svm1, test_accuracy5_svm1, test_accuracy6_svm1, test_accuracy7_svm1, test_accuracy8_svm1, test_accuracy9_svm1],
    "Cross Validation Accuracy": [cross_validation1_svm1, cross_validation2_svm1, cross_validation3_svm1, cross_validation4_svm1, cross_validation5_svm1, cross_validation6_svm1, cross_validation7_svm1, cross_validation8_svm1, cross_validation9_svm1]
})

averages_svm1 = pd.DataFrame({
    "Train Accuracy Average": [train_average_svm1], 
    "Cross Validation Accuracy Average": [cv_average_svm1],
    "Test Accuracy Average": [test_average_svm1]
})

print(results)
print(averages_svm1)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80        1.000000       0.962719                   0.982213
1     50/50        1.000000       0.962719                   0.982213
2     80/20        1.000000       0.962719                   0.982213
3     20/80        1.000000       0.961404                   0.975439
4     50/50        1.000000       0.961404                   0.975439
5     80/20        1.000000       0.961404                   0.975439
6     20/80        0.986813       0.964912                   0.980220
7     50/50        0.986813       0.964912                   0.980220
8     80/20        0.986813       0.964912                   0.980220
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                0.995604                           0.979291   

   Test Accuracy Average  
0               0.963012  


In [34]:
def decision_tree(X_train,y_train, X_test, y_test):
    
    dt = DecisionTreeClassifier()

    param_grid = {
        "max_depth" : [1, 2, 4]
}

    grid_search = GridSearchCV(dt, param_grid, cv=5, scoring="accuracy")
    best_grid_search = grid_search.fit(X_train, y_train)
    best_dt = grid_search.best_estimator_

    train_accuracy = best_dt.score(X_train, y_train)
    y_pred = best_dt.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    cross_validation = best_grid_search.best_score_

    return train_accuracy, test_accuracy, cross_validation

In [35]:
train_accuracy1_dt1, test_accuracy1_dt1, cross_validation1_dt1 = decision_tree(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_dt1, test_accuracy2_dt1, cross_validation2_dt1 = decision_tree(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_dt1, test_accuracy3_dt1, cross_validation3_dt1 = decision_tree(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_dt1, test_accuracy4_dt1, cross_validation4_dt1 = decision_tree(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_dt1, test_accuracy5_dt1, cross_validation5_dt1 = decision_tree(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_dt1, test_accuracy6_dt1, cross_validation6_dt1 = decision_tree(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_dt1, test_accuracy7_dt1, cross_validation7_dt1 = decision_tree(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_dt1, test_accuracy8_dt1, cross_validation8_dt1 = decision_tree(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_dt1, test_accuracy9_dt1, cross_validation9_dt1 = decision_tree(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_dt1 = (train_accuracy1_dt1 + train_accuracy2_dt1 + train_accuracy3_dt1 + train_accuracy4_dt1 + train_accuracy5_dt1 + train_accuracy6_dt1 + train_accuracy7_dt1 + train_accuracy8_dt1 + train_accuracy9_dt1) / 9

cv_average_dt1 = (cross_validation1_dt1 + cross_validation2_dt1 + cross_validation3_dt1 + cross_validation4_dt1 + cross_validation5_dt1 + cross_validation6_dt1 + cross_validation7_dt1 + cross_validation8_dt1 + cross_validation9_dt1) / 9

test_average_dt1 = (test_accuracy1_dt1 + test_accuracy2_dt1 + test_accuracy3_dt1 + test_accuracy4_dt1 + test_accuracy5_dt1 + test_accuracy6_dt1 + test_accuracy7_dt1 + test_accuracy8_dt1 + test_accuracy9_dt1) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_dt1, train_accuracy2_dt1, train_accuracy3_dt1, train_accuracy4_dt1, train_accuracy5_dt1, train_accuracy6_dt1, train_accuracy7_dt1, train_accuracy8_dt1, train_accuracy9_dt1],
    "Test Accuracy": [test_accuracy1_dt1, test_accuracy2_dt1, test_accuracy3_dt1, test_accuracy4_dt1, test_accuracy5_dt1, test_accuracy6_dt1, test_accuracy7_dt1, test_accuracy8_dt1, test_accuracy9_dt1],
    "Cross Validation Accuracy": [cross_validation1_dt1, cross_validation2_dt1, cross_validation3_dt1, cross_validation4_dt1, cross_validation5_dt1, cross_validation6_dt1, cross_validation7_dt1, cross_validation8_dt1, cross_validation9_dt1]
})

averages_dt1 = pd.DataFrame({
    "Train Accuracy Average": [train_average_dt1], 
    "Cross Validation Accuracy Average": [cv_average_dt1],
    "Test Accuracy Average": [test_average_dt1]
})

print(results)
print(averages_dt1)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80        1.000000       0.923246                   0.866798
1     50/50        1.000000       0.921053                   0.866403
2     80/20        1.000000       0.918860                   0.849012
3     20/80        0.926056       0.908772                   0.908459
4     50/50        0.926056       0.908772                   0.908459
5     80/20        0.926056       0.908772                   0.908459
6     20/80        0.980220       0.956140                   0.927473
7     50/50        0.980220       0.956140                   0.927473
8     80/20        0.980220       0.956140                   0.920879
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                0.968759                           0.898157   

   Test Accuracy Average  
0               0.928655  


### Neural Nets: Multi-Layer Perceptron

In [37]:
def multi_layer_perceptron(X_train, y_train, X_test, y_test):
    
    mlp = MLPClassifier()

    param_grid = {
        "hidden_layer_sizes" : [(100, 100, 100), (150, 150, 150), (200, 200, 200)]
        }
    
    grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring="accuracy")
    best_grid_search = grid_search.fit(X_train, y_train)
    best_mlp = grid_search.best_estimator_
    
    train_accuracy = best_mlp.score(X_train, y_train)
    y_pred = best_mlp.predict(X_test)
    
    test_accuracy = accuracy_score(y_test, y_pred)
    cross_validation = best_grid_search.best_score_

    return train_accuracy, test_accuracy, cross_validation

In [38]:
train_accuracy1_mlp1, test_accuracy1_mlp1, cross_validation1_mlp1 = multi_layer_perceptron(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_mlp1, test_accuracy2_mlp1, cross_validation2_mlp1 = multi_layer_perceptron(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_mlp1, test_accuracy3_mlp1, cross_validation3_mlp1 = multi_layer_perceptron(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_mlp1, test_accuracy4_mlp1, cross_validation4_mlp1 = multi_layer_perceptron(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_mlp1, test_accuracy5_mlp1, cross_validation5_mlp1 = multi_layer_perceptron(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_mlp1, test_accuracy6_mlp1, cross_validation6_mlp1 = multi_layer_perceptron(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_mlp1, test_accuracy7_mlp1, cross_validation7_mlp1 = multi_layer_perceptron(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_mlp1, test_accuracy8_mlp1, cross_validation8_mlp1 = multi_layer_perceptron(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_mlp1, test_accuracy9_mlp1, cross_validation9_mlp1 = multi_layer_perceptron(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_mlp1 = (train_accuracy1_mlp1 + train_accuracy2_mlp1 + train_accuracy3_mlp1 + train_accuracy4_mlp1 + train_accuracy5_mlp1 + train_accuracy6_mlp1 + train_accuracy7_mlp1 + train_accuracy8_mlp1 + train_accuracy9_mlp1) / 9

cv_average_mlp1 = (cross_validation1_mlp1 + cross_validation2_mlp1 + cross_validation3_mlp1 + cross_validation4_mlp1 + cross_validation5_mlp1 + cross_validation6_mlp1 + cross_validation7_mlp1 + cross_validation8_mlp1 + cross_validation9_mlp1) / 9

test_average_mlp1 = (test_accuracy1_mlp1 + test_accuracy2_mlp1 + test_accuracy3_mlp1 + test_accuracy4_mlp1 + test_accuracy5_mlp1 + test_accuracy6_mlp1 + test_accuracy7_mlp1 + test_accuracy8_mlp1 + test_accuracy9_mlp1) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_mlp1, train_accuracy2_mlp1, train_accuracy3_mlp1, train_accuracy4_mlp1, train_accuracy5_mlp1, train_accuracy6_mlp1, train_accuracy7_mlp1, train_accuracy8_mlp1, train_accuracy9_mlp1],
    "Test Accuracy": [test_accuracy1_mlp1, test_accuracy2_mlp1, test_accuracy3_mlp1, test_accuracy4_mlp1, test_accuracy5_mlp1, test_accuracy6_mlp1, test_accuracy7_mlp1, test_accuracy8_mlp1, test_accuracy9_mlp1],
    "Cross Validation Accuracy": [cross_validation1_mlp1, cross_validation2_mlp1, cross_validation3_mlp1, cross_validation4_mlp1, cross_validation5_mlp1, cross_validation6_mlp1, cross_validation7_mlp1, cross_validation8_mlp1, cross_validation9_mlp1]
})

averages_mlp1 = pd.DataFrame({
    "Train Accuracy Average": [train_average_mlp1], 
    "Cross Validation Accuracy Average": [cv_average_mlp1],
    "Test Accuracy Average": [test_average_mlp1]
})

print(results)
print(averages_mlp1)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80             1.0       0.960526                   0.982609
1     50/50             1.0       0.962719                   0.973518
2     80/20             1.0       0.958333                   0.964427
3     20/80             1.0       0.964912                   0.978947
4     50/50             1.0       0.961404                   0.978885
5     80/20             1.0       0.968421                   0.982456
6     20/80             1.0       0.956140                   0.971429
7     50/50             1.0       0.964912                   0.975824
8     80/20             1.0       0.973684                   0.971429
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                     1.0                           0.975503   

   Test Accuracy Average  
0                0.96345  


### Now I will use the Parkinsons Dataset

In [40]:
parkinsons_data.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [41]:
X = parkinsons_data.drop(columns=["name", "status"])
y = parkinsons_data["status"]

In [42]:
X_train_20, X_test_80, y_train_20, y_test_80 = train_test_split(X, y, test_size=0.80, random_state=11) #20/80 split
X_train_50, X_test_50, y_train_50, y_test_50 = train_test_split(X, y, test_size=0.50, random_state=11) #50/50 split
X_train_80, X_test_20, y_train_80, y_test_20 = train_test_split(X, y, test_size=0.20, random_state=11) #80/20 split

In [43]:
#Scale Data
scalar = StandardScaler()
X_train_20_s = scalar.fit_transform(X_train_20)
X_test_80_s = scalar.transform(X_test_80)

X_train_50_s = scalar.fit_transform(X_train_50)
X_test_50_s = scalar.transform(X_test_50)

X_train_80_s = scalar.fit_transform(X_train_80)
X_test_20_s = scalar.transform(X_test_20)

### Logistic Regression

In [45]:
train_accuracy1_lr2, test_accuracy1_lr2, cross_validation1_lr2 = logistic_regression(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_lr2, test_accuracy2_lr2, cross_validation2_lr2 = logistic_regression(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_lr2, test_accuracy3_lr2, cross_validation3_lr2 = logistic_regression(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_lr2, test_accuracy4_lr2, cross_validation4_lr2 = logistic_regression(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_lr2, test_accuracy5_lr2, cross_validation5_lr2 = logistic_regression(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_lr2, test_accuracy6_lr2, cross_validation6_lr2 = logistic_regression(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_lr2, test_accuracy7_lr2, cross_validation7_lr2 = logistic_regression(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_lr2, test_accuracy8_lr2, cross_validation8_lr2 = logistic_regression(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_lr2, test_accuracy9_lr2, cross_validation9_lr2 = logistic_regression(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_lr2 = (train_accuracy1_lr2 + train_accuracy2_lr2 + train_accuracy3_lr2 + train_accuracy4_lr2 + train_accuracy5_lr2 + train_accuracy6_lr2 + train_accuracy7_lr2 + train_accuracy8_lr2 + train_accuracy9_lr2) / 9

cv_average_lr2 = (cross_validation1_lr2 + cross_validation2_lr2 + cross_validation3_lr2 + cross_validation4_lr2 + cross_validation5_lr2 + cross_validation6_lr2 + cross_validation7_lr2 + cross_validation8_lr2 + cross_validation9_lr2) / 9

test_average_lr2 = (test_accuracy1_lr2 + test_accuracy2_lr2 + test_accuracy3_lr2 + test_accuracy4_lr2 + test_accuracy5_lr2 + test_accuracy6_lr2 + test_accuracy7_lr2 + test_accuracy8_lr2 + test_accuracy9_lr2) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_lr2, train_accuracy2_lr2, train_accuracy3_lr2, train_accuracy4_lr2, train_accuracy5_lr2, train_accuracy6_lr2, train_accuracy7_lr2, train_accuracy8_lr2, train_accuracy9_lr2],
    "Test Accuracy": [test_accuracy1_lr2, test_accuracy2_lr2, test_accuracy3_lr2, test_accuracy4_lr2, test_accuracy5_lr2, test_accuracy6_lr2, test_accuracy7_lr2, test_accuracy8_lr2, test_accuracy9_lr2],
    "Cross Validation Accuracy": [cross_validation1_lr2, cross_validation2_lr2, cross_validation3_lr2, cross_validation4_lr2, cross_validation5_lr2, cross_validation6_lr2, cross_validation7_lr2, cross_validation8_lr2, cross_validation9_lr2]
})

averages_lr2 = pd.DataFrame({
    "Train Accuracy Average": [train_average_lr2], 
    "Cross Validation Accuracy Average": [cv_average_lr2],
    "Test Accuracy Average": [test_average_lr2]
})

print(results)
print(averages_lr2)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80        0.948718       0.846154                   0.871429
1     50/50        0.948718       0.846154                   0.871429
2     80/20        0.948718       0.846154                   0.871429
3     20/80        0.876289       0.826531                   0.845263
4     50/50        0.876289       0.826531                   0.845263
5     80/20        0.876289       0.826531                   0.845263
6     20/80        0.897436       0.769231                   0.826411
7     50/50        0.897436       0.769231                   0.826411
8     80/20        0.897436       0.769231                   0.826411
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                0.907481                           0.847701   

   Test Accuracy Average  
0               0.813972  


### Random Forests

In [47]:
train_accuracy1_rf2, test_accuracy1_rf2, cross_validation1_rf2 = random_forest(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_rf2, test_accuracy2_rf2, cross_validation2_rf2 = random_forest(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_rf2, test_accuracy3_rf2, cross_validation3_rf2 = random_forest(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_rf2, test_accuracy4_rf2, cross_validation4_rf2 = random_forest(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_rf2, test_accuracy5_rf2, cross_validation5_rf2 = random_forest(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_rf2, test_accuracy6_rf2, cross_validation6_rf2 = random_forest(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_rf2, test_accuracy7_rf2, cross_validation7_rf2 = random_forest(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_rf2, test_accuracy8_rf2, cross_validation8_rf2 = random_forest(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_rf2, test_accuracy9_rf2, cross_validation9_rf2 = random_forest(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_rf2 = (train_accuracy1_rf2 + train_accuracy2_rf2 + train_accuracy3_rf2 + train_accuracy4_rf2 + train_accuracy5_rf2 + train_accuracy6_rf2 + train_accuracy7_rf2 + train_accuracy8_rf2 + train_accuracy9_rf2) / 9

cv_average_rf2 = (cross_validation1_rf2 + cross_validation2_rf2 + cross_validation3_rf2 + cross_validation4_rf2 + cross_validation5_rf2 + cross_validation6_rf2 + cross_validation7_rf2 + cross_validation8_rf2 + cross_validation9_rf2) / 9

test_average_rf2 = (test_accuracy1_rf2 + test_accuracy2_rf2 + test_accuracy3_rf2 + test_accuracy4_rf2 + test_accuracy5_rf2 + test_accuracy6_rf2 + test_accuracy7_rf2 + test_accuracy8_rf2 + test_accuracy9_rf2) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_rf2, train_accuracy2_rf2, train_accuracy3_rf2, train_accuracy4_rf2, train_accuracy5_rf2, train_accuracy6_rf2, train_accuracy7_rf2, train_accuracy8_rf2, train_accuracy9_rf2],
    "Test Accuracy": [test_accuracy1_rf2, test_accuracy2_rf2, test_accuracy3_rf2, test_accuracy4_rf2, test_accuracy5_rf2, test_accuracy6_rf2, test_accuracy7_rf2, test_accuracy8_rf2, test_accuracy9_rf2],
    "Cross Validation Accuracy": [cross_validation1_rf2, cross_validation2_rf2, cross_validation3_rf2, cross_validation4_rf2, cross_validation5_rf2, cross_validation6_rf2, cross_validation7_rf2, cross_validation8_rf2, cross_validation9_rf2]
})

averages_rf2 = pd.DataFrame({
    "Train Accuracy Average": [train_average_rf2], 
    "Cross Validation Accuracy Average": [cv_average_rf2],
    "Test Accuracy Average": [test_average_rf2]
})

print(results)
print(averages_rf2)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80        0.871795       0.846154                   0.871429
1     50/50        0.897436       0.852564                   0.871429
2     80/20        0.974359       0.858974                   0.871429
3     20/80        0.927835       0.867347                   0.865263
4     50/50        0.917526       0.867347                   0.875263
5     80/20        0.979381       0.918367                   0.865263
6     20/80        0.974359       0.897436                   0.903629
7     50/50        1.000000       0.948718                   0.909879
8     80/20        1.000000       0.897436                   0.916532
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                0.949188                           0.883346   

   Test Accuracy Average  
0               0.883816  


### SVM

In [49]:
train_accuracy1_svm2, test_accuracy1_svm2, cross_validation1_svm2 = svm(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_svm2, test_accuracy2_svm2, cross_validation2_svm2 = svm(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_svm2, test_accuracy3_svm2, cross_validation3_svm2 = svm(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_svm2, test_accuracy4_svm2, cross_validation4_svm2 = svm(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_svm2, test_accuracy5_svm2, cross_validation5_svm2 = svm(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_svm2, test_accuracy6_svm2, cross_validation6_svm2 = svm(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_svm2, test_accuracy7_svm2, cross_validation7_svm2 = svm(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_svm2, test_accuracy8_svm2, cross_validation8_svm2 = svm(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_svm2, test_accuracy9_svm2, cross_validation9_svm2 = svm(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_svm2 = (train_accuracy1_svm2 + train_accuracy2_svm2 + train_accuracy3_svm2 + train_accuracy4_svm2 + train_accuracy5_svm2 + train_accuracy6_svm2 + train_accuracy7_svm2 + train_accuracy8_svm2 + train_accuracy9_svm2) / 9

cv_average_svm2 = (cross_validation1_svm2 + cross_validation2_svm2 + cross_validation3_svm2 + cross_validation4_svm2 + cross_validation5_svm2 + cross_validation6_svm2 + cross_validation7_svm2 + cross_validation8_svm2 + cross_validation9_svm2) / 9

test_average_svm2 = (test_accuracy1_svm2 + test_accuracy2_svm2 + test_accuracy3_svm2 + test_accuracy4_svm2 + test_accuracy5_svm2 + test_accuracy6_svm2 + test_accuracy7_svm2 + test_accuracy8_svm2 + test_accuracy9_svm2) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_svm2, train_accuracy2_svm2, train_accuracy3_svm2, train_accuracy4_svm2, train_accuracy5_svm2, train_accuracy6_svm2, train_accuracy7_svm2, train_accuracy8_svm2, train_accuracy9_svm2],
    "Test Accuracy": [test_accuracy1_svm2, test_accuracy2_svm2, test_accuracy3_svm2, test_accuracy4_svm2, test_accuracy5_svm2, test_accuracy6_svm2, test_accuracy7_svm2, test_accuracy8_svm2, test_accuracy9_svm2],
    "Cross Validation Accuracy": [cross_validation1_svm2, cross_validation2_svm2, cross_validation3_svm2, cross_validation4_svm2, cross_validation5_svm2, cross_validation6_svm2, cross_validation7_svm2, cross_validation8_svm2, cross_validation9_svm2]
})

averages_svm2 = pd.DataFrame({
    "Train Accuracy Average": [train_average_svm2], 
    "Cross Validation Accuracy Average": [cv_average_svm2],
    "Test Accuracy Average": [test_average_svm2]
})

print(results)
print(averages_svm2)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80        0.897436       0.871795                   0.896429
1     50/50        0.897436       0.871795                   0.896429
2     80/20        0.897436       0.871795                   0.896429
3     20/80        0.989691       0.928571                   0.896316
4     50/50        0.989691       0.928571                   0.896316
5     80/20        0.989691       0.928571                   0.896316
6     20/80        1.000000       1.000000                   0.929032
7     50/50        1.000000       1.000000                   0.929032
8     80/20        1.000000       1.000000                   0.929032
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                0.962376                           0.907259   

   Test Accuracy Average  
0               0.933455  


### Decision Tree

In [51]:
train_accuracy1_dt2, test_accuracy1_dt2, cross_validation1_dt2 = decision_tree(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_dt2, test_accuracy2_dt2, cross_validation2_dt2 = decision_tree(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_dt2, test_accuracy3_dt2, cross_validation3_dt2 = decision_tree(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_dt2, test_accuracy4_dt2, cross_validation4_dt2 = decision_tree(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_dt2, test_accuracy5_dt2, cross_validation5_dt2 = decision_tree(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_dt2, test_accuracy6_dt2, cross_validation6_dt2 = decision_tree(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_dt2, test_accuracy7_dt2, cross_validation7_dt2 = decision_tree(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_dt2, test_accuracy8_dt2, cross_validation8_dt2 = decision_tree(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_dt2, test_accuracy9_dt2, cross_validation9_dt2 = decision_tree(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_dt2 = (train_accuracy1_dt2 + train_accuracy2_dt2 + train_accuracy3_dt2 + train_accuracy4_dt2 + train_accuracy5_dt2 + train_accuracy6_dt2 + train_accuracy7_dt2 + train_accuracy8_dt2 + train_accuracy9_dt2) / 9

cv_average_dt2 = (cross_validation1_dt2 + cross_validation2_dt2 + cross_validation3_dt2 + cross_validation4_dt2 + cross_validation5_dt2 + cross_validation6_dt2 + cross_validation7_dt2 + cross_validation8_dt2 + cross_validation9_dt2) / 9

test_average_dt2 = (test_accuracy1_dt2 + test_accuracy2_dt2 + test_accuracy3_dt2 + test_accuracy4_dt2 + test_accuracy5_dt2 + test_accuracy6_dt2 + test_accuracy7_dt2 + test_accuracy8_dt2 + test_accuracy9_dt2) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_dt2, train_accuracy2_dt2, train_accuracy3_dt2, train_accuracy4_dt2, train_accuracy5_dt2, train_accuracy6_dt2, train_accuracy7_dt2, train_accuracy8_dt2, train_accuracy9_dt2],
    "Test Accuracy": [test_accuracy1_dt2, test_accuracy2_dt2, test_accuracy3_dt2, test_accuracy4_dt2, test_accuracy5_dt2, test_accuracy6_dt2, test_accuracy7_dt2, test_accuracy8_dt2, test_accuracy9_dt2],
    "Cross Validation Accuracy": [cross_validation1_dt2, cross_validation2_dt2, cross_validation3_dt2, cross_validation4_dt2, cross_validation5_dt2, cross_validation6_dt2, cross_validation7_dt2, cross_validation8_dt2, cross_validation9_dt2]
})

averages_dt2 = pd.DataFrame({
    "Train Accuracy Average": [train_average_dt2], 
    "Cross Validation Accuracy Average": [cv_average_dt2],
    "Test Accuracy Average": [test_average_dt2]
})

print(results)
print(averages_dt2)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80        1.000000       0.858974                   0.775000
1     50/50        0.923077       0.826923                   0.714286
2     80/20        0.923077       0.826923                   0.771429
3     20/80        0.855670       0.877551                   0.803684
4     50/50        0.855670       0.877551                   0.803684
5     80/20        0.855670       0.877551                   0.793158
6     20/80        0.993590       0.871795                   0.871573
7     50/50        0.871795       0.846154                   0.858669
8     80/20        0.993590       0.871795                   0.871573
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                0.919127                           0.807006   

   Test Accuracy Average  
0               0.859469  


### Multi-Layer Perceptron

In [53]:
train_accuracy1_mlp2, test_accuracy1_mlp2, cross_validation1_mlp2 = multi_layer_perceptron(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_mlp2, test_accuracy2_mlp2, cross_validation2_mlp2 = multi_layer_perceptron(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_mlp2, test_accuracy3_mlp2, cross_validation3_mlp2 = multi_layer_perceptron(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_mlp2, test_accuracy4_mlp2, cross_validation4_mlp2 = multi_layer_perceptron(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_mlp2, test_accuracy5_mlp2, cross_validation5_mlp2 = multi_layer_perceptron(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_mlp2, test_accuracy6_mlp2, cross_validation6_mlp2 = multi_layer_perceptron(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_mlp2, test_accuracy7_mlp2, cross_validation7_mlp2 = multi_layer_perceptron(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_mlp2, test_accuracy8_mlp2, cross_validation8_mlp2 = multi_layer_perceptron(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_mlp2, test_accuracy9_mlp2, cross_validation9_mlp2 = multi_layer_perceptron(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_mlp2 = (train_accuracy1_mlp2 + train_accuracy2_mlp2 + train_accuracy3_mlp2 + train_accuracy4_mlp2 + train_accuracy5_mlp2 + train_accuracy6_mlp2 + train_accuracy7_mlp2 + train_accuracy8_mlp2 + train_accuracy9_mlp2) / 9

cv_average_mlp2 = (cross_validation1_mlp2 + cross_validation2_mlp2 + cross_validation3_mlp2 + cross_validation4_mlp2 + cross_validation5_mlp2 + cross_validation6_mlp2 + cross_validation7_mlp2 + cross_validation8_mlp2 + cross_validation9_mlp2) / 9

test_average_mlp2 = (test_accuracy1_mlp2 + test_accuracy2_mlp2 + test_accuracy3_mlp2 + test_accuracy4_mlp2 + test_accuracy5_mlp2 + test_accuracy6_mlp2 + test_accuracy7_mlp2 + test_accuracy8_mlp2 + test_accuracy9_mlp2) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_mlp2, train_accuracy2_mlp2, train_accuracy3_mlp2, train_accuracy4_mlp2, train_accuracy5_mlp2, train_accuracy6_mlp2, train_accuracy7_mlp2, train_accuracy8_mlp2, train_accuracy9_mlp2],
    "Test Accuracy": [test_accuracy1_mlp2, test_accuracy2_mlp2, test_accuracy3_mlp2, test_accuracy4_mlp2, test_accuracy5_mlp2, test_accuracy6_mlp2, test_accuracy7_mlp2, test_accuracy8_mlp2, test_accuracy9_mlp2],
    "Cross Validation Accuracy": [cross_validation1_mlp2, cross_validation2_mlp2, cross_validation3_mlp2, cross_validation4_mlp2, cross_validation5_mlp2, cross_validation6_mlp2, cross_validation7_mlp2, cross_validation8_mlp2, cross_validation9_mlp2]
})

averages_mlp2 = pd.DataFrame({
    "Train Accuracy Average": [train_average_mlp2], 
    "Cross Validation Accuracy Average": [cv_average_mlp2],
    "Test Accuracy Average": [test_average_mlp2]
})

print(results)
print(averages_mlp2)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80             1.0       0.865385                   0.850000
1     50/50             1.0       0.814103                   0.850000
2     80/20             1.0       0.865385                   0.846429
3     20/80             1.0       0.887755                   0.897368
4     50/50             1.0       0.877551                   0.897368
5     80/20             1.0       0.897959                   0.907368
6     20/80             1.0       0.923077                   0.929234
7     50/50             1.0       0.923077                   0.935484
8     80/20             1.0       0.923077                   0.929032
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                     1.0                           0.893587   

   Test Accuracy Average  
0               0.886374  


### Now I will use the Heart Dataset 

In [55]:
heart_data.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_electrocardiographic,max_heart_rate,angina,oldpeak,slope,major_vessels,thal,heart_disease
0,70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,2
1,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,1
2,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,2
3,64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,1
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,1


In [56]:
X = heart_data.drop(columns=["heart_disease"])

In [57]:
y = heart_data["heart_disease"]

In [58]:
X_train_20, X_test_80, y_train_20, y_test_80 = train_test_split(X, y, test_size=0.80, random_state=11) #20/80 split
X_train_50, X_test_50, y_train_50, y_test_50 = train_test_split(X, y, test_size=0.50, random_state=11) #50/50 split
X_train_80, X_test_20, y_train_80, y_test_20 = train_test_split(X, y, test_size=0.20, random_state=11) #80/20 split

In [59]:
#Scale Data
scalar = StandardScaler()
X_train_20_s = scalar.fit_transform(X_train_20)
X_test_80_s = scalar.transform(X_test_80)

X_train_50_s = scalar.fit_transform(X_train_50)
X_test_50_s = scalar.transform(X_test_50)

X_train_80_s = scalar.fit_transform(X_train_80)
X_test_20_s = scalar.transform(X_test_20)

### Logistic Regression 

In [61]:
train_accuracy1_lr3, test_accuracy1_lr3, cross_validation1_lr3 = logistic_regression(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_lr3, test_accuracy2_lr3, cross_validation2_lr3 = logistic_regression(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_lr3, test_accuracy3_lr3, cross_validation3_lr3 = logistic_regression(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_lr3, test_accuracy4_lr3, cross_validation4_lr3 = logistic_regression(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_lr3, test_accuracy5_lr3, cross_validation5_lr3 = logistic_regression(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_lr3, test_accuracy6_lr3, cross_validation6_lr3 = logistic_regression(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_lr3, test_accuracy7_lr3, cross_validation7_lr3 = logistic_regression(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_lr3, test_accuracy8_lr3, cross_validation8_lr3 = logistic_regression(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_lr3, test_accuracy9_lr3, cross_validation9_lr3 = logistic_regression(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_lr3 = (train_accuracy1_lr3 + train_accuracy2_lr3 + train_accuracy3_lr3 + train_accuracy4_lr3 + train_accuracy5_lr3 + train_accuracy6_lr3 + train_accuracy7_lr3 + train_accuracy8_lr3 + train_accuracy9_lr3) / 9

cv_average_lr3 = (cross_validation1_lr3 + cross_validation2_lr3 + cross_validation3_lr3 + cross_validation4_lr3 + cross_validation5_lr3 + cross_validation6_lr3 + cross_validation7_lr3 + cross_validation8_lr3 + cross_validation9_lr3) / 9

test_average_lr3 = (test_accuracy1_lr3 + test_accuracy2_lr3 + test_accuracy3_lr3 + test_accuracy4_lr3 + test_accuracy5_lr3 + test_accuracy6_lr3 + test_accuracy7_lr3 + test_accuracy8_lr3 + test_accuracy9_lr3) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_lr3, train_accuracy2_lr3, train_accuracy3_lr3, train_accuracy4_lr3, train_accuracy5_lr3, train_accuracy6_lr3, train_accuracy7_lr3, train_accuracy8_lr3, train_accuracy9_lr3],
    "Test Accuracy": [test_accuracy1_lr3, test_accuracy2_lr3, test_accuracy3_lr3, test_accuracy4_lr3, test_accuracy5_lr3, test_accuracy6_lr3, test_accuracy7_lr3, test_accuracy8_lr3, test_accuracy9_lr3],
    "Cross Validation Accuracy": [cross_validation1_lr3, cross_validation2_lr3, cross_validation3_lr3, cross_validation4_lr3, cross_validation5_lr3, cross_validation6_lr3, cross_validation7_lr3, cross_validation8_lr3, cross_validation9_lr3]
})

averages_lr3 = pd.DataFrame({
    "Train Accuracy Average": [train_average_lr3], 
    "Cross Validation Accuracy Average": [cv_average_lr3],
    "Test Accuracy Average": [test_average_lr3]
})

print(results)
print(averages_lr3)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80        0.925926       0.759259                   0.778182
1     50/50        0.925926       0.759259                   0.778182
2     80/20        0.925926       0.759259                   0.778182
3     20/80        0.844444       0.807407                   0.844444
4     50/50        0.844444       0.807407                   0.844444
5     80/20        0.844444       0.807407                   0.844444
6     20/80        0.847222       0.870370                   0.819450
7     50/50        0.847222       0.870370                   0.819450
8     80/20        0.847222       0.870370                   0.819450
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                0.872531                           0.814026   

   Test Accuracy Average  
0               0.812346  


### Random Forests

In [63]:
train_accuracy1_rf3, test_accuracy1_rf3, cross_validation1_rf3 = random_forest(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_rf3, test_accuracy2_rf3, cross_validation2_rf3 = random_forest(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_rf3, test_accuracy3_rf3, cross_validation3_rf3 = random_forest(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_rf3, test_accuracy4_rf3, cross_validation4_rf3 = random_forest(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_rf3, test_accuracy5_rf3, cross_validation5_rf3 = random_forest(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_rf3, test_accuracy6_rf3, cross_validation6_rf3 = random_forest(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_rf3, test_accuracy7_rf3, cross_validation7_rf3 = random_forest(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_rf3, test_accuracy8_rf3, cross_validation8_rf3 = random_forest(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_rf3, test_accuracy9_rf3, cross_validation9_rf3 = random_forest(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_rf3 = (train_accuracy1_rf3 + train_accuracy2_rf3 + train_accuracy3_rf3 + train_accuracy4_rf3 + train_accuracy5_rf3 + train_accuracy6_rf3 + train_accuracy7_rf3 + train_accuracy8_rf3 + train_accuracy9_rf3) / 9

cv_average_rf3 = (cross_validation1_rf3 + cross_validation2_rf3 + cross_validation3_rf3 + cross_validation4_rf3 + cross_validation5_rf3 + cross_validation6_rf3 + cross_validation7_rf3 + cross_validation8_rf3 + cross_validation9_rf3) / 9

test_average_rf3 = (test_accuracy1_rf3 + test_accuracy2_rf3 + test_accuracy3_rf3 + test_accuracy4_rf3 + test_accuracy5_rf3 + test_accuracy6_rf3 + test_accuracy7_rf3 + test_accuracy8_rf3 + test_accuracy9_rf3) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_rf3, train_accuracy2_rf3, train_accuracy3_rf3, train_accuracy4_rf3, train_accuracy5_rf3, train_accuracy6_rf3, train_accuracy7_rf3, train_accuracy8_rf3, train_accuracy9_rf3],
    "Test Accuracy": [test_accuracy1_rf3, test_accuracy2_rf3, test_accuracy3_rf3, test_accuracy4_rf3, test_accuracy5_rf3, test_accuracy6_rf3, test_accuracy7_rf3, test_accuracy8_rf3, test_accuracy9_rf3],
    "Cross Validation Accuracy": [cross_validation1_rf3, cross_validation2_rf3, cross_validation3_rf3, cross_validation4_rf3, cross_validation5_rf3, cross_validation6_rf3, cross_validation7_rf3, cross_validation8_rf3, cross_validation9_rf3]
})

averages_rf3 = pd.DataFrame({
    "Train Accuracy Average": [train_average_rf3], 
    "Cross Validation Accuracy Average": [cv_average_rf3],
    "Test Accuracy Average": [test_average_rf3]
})

print(results)
print(averages_rf3)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80        0.981481       0.763889                   0.870909
1     50/50        0.944444       0.754630                   0.870909
2     80/20        0.944444       0.745370                   0.870909
3     20/80        0.874074       0.837037                   0.851852
4     50/50        0.866667       0.792593                   0.851852
5     80/20        0.851852       0.822222                   0.851852
6     20/80        0.875000       0.888889                   0.851797
7     50/50        0.851852       0.907407                   0.856342
8     80/20        0.912037       0.851852                   0.856342
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                0.900206                           0.859196   

   Test Accuracy Average  
0                0.81821  


### SVM

In [65]:
train_accuracy1_svm3, test_accuracy1_svm3, cross_validation1_svm3 = svm(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_svm3, test_accuracy2_svm3, cross_validation2_svm3 = svm(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_svm3, test_accuracy3_svm3, cross_validation3_svm3 = svm(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_svm3, test_accuracy4_svm3, cross_validation4_svm3 = svm(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_svm3, test_accuracy5_svm3, cross_validation5_svm3 = svm(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_svm3, test_accuracy6_svm3, cross_validation6_svm3 = svm(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_svm3, test_accuracy7_svm3, cross_validation7_svm3 = svm(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_svm3, test_accuracy8_svm3, cross_validation8_svm3 = svm(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_svm3, test_accuracy9_svm3, cross_validation9_svm3 = svm(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_svm3 = (train_accuracy1_svm3 + train_accuracy2_svm3 + train_accuracy3_svm3 + train_accuracy4_svm3 + train_accuracy5_svm3 + train_accuracy6_svm3 + train_accuracy7_svm3 + train_accuracy8_svm3 + train_accuracy9_svm3) / 9

cv_average_svm3 = (cross_validation1_svm3 + cross_validation2_svm3 + cross_validation3_svm3 + cross_validation4_svm3 + cross_validation5_svm3 + cross_validation6_svm3 + cross_validation7_svm3 + cross_validation8_svm3 + cross_validation9_svm3) / 9

test_average_svm3 = (test_accuracy1_svm3 + test_accuracy2_svm3 + test_accuracy3_svm3 + test_accuracy4_svm3 + test_accuracy5_svm3 + test_accuracy6_svm3 + test_accuracy7_svm3 + test_accuracy8_svm3 + test_accuracy9_svm3) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_svm3, train_accuracy2_svm3, train_accuracy3_svm3, train_accuracy4_svm3, train_accuracy5_svm3, train_accuracy6_svm3, train_accuracy7_svm3, train_accuracy8_svm3, train_accuracy9_svm3],
    "Test Accuracy": [test_accuracy1_svm3, test_accuracy2_svm3, test_accuracy3_svm3, test_accuracy4_svm3, test_accuracy5_svm3, test_accuracy6_svm3, test_accuracy7_svm3, test_accuracy8_svm3, test_accuracy9_svm3],
    "Cross Validation Accuracy": [cross_validation1_svm3, cross_validation2_svm3, cross_validation3_svm3, cross_validation4_svm3, cross_validation5_svm3, cross_validation6_svm3, cross_validation7_svm3, cross_validation8_svm3, cross_validation9_svm3]
})

averages_svm3 = pd.DataFrame({
    "Train Accuracy Average": [train_average_svm3], 
    "Cross Validation Accuracy Average": [cv_average_svm3],
    "Test Accuracy Average": [test_average_svm3]
})

print(results)
print(averages_svm3)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80        0.907407       0.800926                   0.870909
1     50/50        0.907407       0.800926                   0.870909
2     80/20        0.907407       0.800926                   0.870909
3     20/80        0.866667       0.851852                   0.844444
4     50/50        0.866667       0.851852                   0.844444
5     80/20        0.866667       0.851852                   0.844444
6     20/80        0.851852       0.870370                   0.832981
7     50/50        0.851852       0.870370                   0.832981
8     80/20        0.851852       0.870370                   0.832981
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                0.875309                           0.849445   

   Test Accuracy Average  
0               0.841049  


### Decision Tree

In [67]:
train_accuracy1_dt3, test_accuracy1_dt3, cross_validation1_dt3 = decision_tree(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_dt3, test_accuracy2_dt3, cross_validation2_dt3 = decision_tree(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_dt3, test_accuracy3_dt3, cross_validation3_dt3 = decision_tree(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_dt3, test_accuracy4_dt3, cross_validation4_dt3 = decision_tree(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_dt3, test_accuracy5_dt3, cross_validation5_dt3 = decision_tree(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_dt3, test_accuracy6_dt3, cross_validation6_dt3 = decision_tree(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_dt3, test_accuracy7_dt3, cross_validation7_dt3 = decision_tree(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_dt3, test_accuracy8_dt3, cross_validation8_dt3 = decision_tree(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_dt3, test_accuracy9_dt3, cross_validation9_dt3 = decision_tree(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_dt3 = (train_accuracy1_dt3 + train_accuracy2_dt3 + train_accuracy3_dt3 + train_accuracy4_dt3 + train_accuracy5_dt3 + train_accuracy6_dt3 + train_accuracy7_dt3 + train_accuracy8_dt3 + train_accuracy9_dt3) / 9

cv_average_dt3 = (cross_validation1_dt3 + cross_validation2_dt3 + cross_validation3_dt3 + cross_validation4_dt3 + cross_validation5_dt3 + cross_validation6_dt3 + cross_validation7_dt3 + cross_validation8_dt3 + cross_validation9_dt3) / 9

test_average_dt3 = (test_accuracy1_dt3 + test_accuracy2_dt3 + test_accuracy3_dt3 + test_accuracy4_dt3 + test_accuracy5_dt3 + test_accuracy6_dt3 + test_accuracy7_dt3 + test_accuracy8_dt3 + test_accuracy9_dt3) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_dt3, train_accuracy2_dt3, train_accuracy3_dt3, train_accuracy4_dt3, train_accuracy5_dt3, train_accuracy6_dt3, train_accuracy7_dt3, train_accuracy8_dt3, train_accuracy9_dt3],
    "Test Accuracy": [test_accuracy1_dt3, test_accuracy2_dt3, test_accuracy3_dt3, test_accuracy4_dt3, test_accuracy5_dt3, test_accuracy6_dt3, test_accuracy7_dt3, test_accuracy8_dt3, test_accuracy9_dt3],
    "Cross Validation Accuracy": [cross_validation1_dt3, cross_validation2_dt3, cross_validation3_dt3, cross_validation4_dt3, cross_validation5_dt3, cross_validation6_dt3, cross_validation7_dt3, cross_validation8_dt3, cross_validation9_dt3]
})

averages_dt3 = pd.DataFrame({
    "Train Accuracy Average": [train_average_dt3], 
    "Cross Validation Accuracy Average": [cv_average_dt3],
    "Test Accuracy Average": [test_average_dt3]
})

print(results)
print(averages_dt3)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80        0.907407       0.671296                   0.718182
1     50/50        0.907407       0.671296                   0.718182
2     80/20        0.907407       0.671296                   0.718182
3     20/80        0.896296       0.829630                   0.674074
4     50/50        0.755556       0.770370                   0.674074
5     80/20        0.755556       0.770370                   0.674074
6     20/80        0.902778       0.833333                   0.814693
7     50/50        0.902778       0.814815                   0.814693
8     80/20        0.902778       0.833333                   0.814693
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                0.870885                            0.73565   

   Test Accuracy Average  
0                0.76286  


### Multi-Layer Perceptron

In [69]:
train_accuracy1_mlp3, test_accuracy1_mlp3, cross_validation1_mlp3 = multi_layer_perceptron(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_mlp3, test_accuracy2_mlp3, cross_validation2_mlp3 = multi_layer_perceptron(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_mlp3, test_accuracy3_mlp3, cross_validation3_mlp3 = multi_layer_perceptron(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_mlp3, test_accuracy4_mlp3, cross_validation4_mlp3 = multi_layer_perceptron(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_mlp3, test_accuracy5_mlp3, cross_validation5_mlp3 = multi_layer_perceptron(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_mlp3, test_accuracy6_mlp3, cross_validation6_mlp3 = multi_layer_perceptron(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_mlp3, test_accuracy7_mlp3, cross_validation7_mlp3 = multi_layer_perceptron(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_mlp3, test_accuracy8_mlp3, cross_validation8_mlp3 = multi_layer_perceptron(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_mlp3, test_accuracy9_mlp3, cross_validation9_mlp3 = multi_layer_perceptron(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_mlp3 = (train_accuracy1_mlp3 + train_accuracy2_mlp3 + train_accuracy3_mlp3 + train_accuracy4_mlp3 + train_accuracy5_mlp3 + train_accuracy6_mlp3 + train_accuracy7_mlp3 + train_accuracy8_mlp3 + train_accuracy9_mlp3) / 9

cv_average_mlp3 = (cross_validation1_mlp3 + cross_validation2_mlp3 + cross_validation3_mlp3 + cross_validation4_mlp3 + cross_validation5_mlp3 + cross_validation6_mlp3 + cross_validation7_mlp3 + cross_validation8_mlp3 + cross_validation9_mlp3) / 9

test_average_mlp3 = (test_accuracy1_mlp3 + test_accuracy2_mlp3 + test_accuracy3_mlp3 + test_accuracy4_mlp3 + test_accuracy5_mlp3 + test_accuracy6_mlp3 + test_accuracy7_mlp3 + test_accuracy8_mlp3 + test_accuracy9_mlp3) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_mlp3, train_accuracy2_mlp3, train_accuracy3_mlp3, train_accuracy4_mlp3, train_accuracy5_mlp3, train_accuracy6_mlp3, train_accuracy7_mlp3, train_accuracy8_mlp3, train_accuracy9_mlp3],
    "Test Accuracy": [test_accuracy1_mlp3, test_accuracy2_mlp3, test_accuracy3_mlp3, test_accuracy4_mlp3, test_accuracy5_mlp3, test_accuracy6_mlp3, test_accuracy7_mlp3, test_accuracy8_mlp3, test_accuracy9_mlp3],
    "Cross Validation Accuracy": [cross_validation1_mlp3, cross_validation2_mlp3, cross_validation3_mlp3, cross_validation4_mlp3, cross_validation5_mlp3, cross_validation6_mlp3, cross_validation7_mlp3, cross_validation8_mlp3, cross_validation9_mlp3]
})

averages_mlp3 = pd.DataFrame({
    "Train Accuracy Average": [train_average_mlp3], 
    "Cross Validation Accuracy Average": [cv_average_mlp3],
    "Test Accuracy Average": [test_average_mlp3]
})

print(results)
print(averages_mlp3)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80             1.0       0.750000                   0.834545
1     50/50             1.0       0.763889                   0.852727
2     80/20             1.0       0.754630                   0.852727
3     20/80             1.0       0.807407                   0.814815
4     50/50             1.0       0.800000                   0.800000
5     80/20             1.0       0.807407                   0.800000
6     20/80             1.0       0.814815                   0.800634
7     50/50             1.0       0.796296                   0.796195
8     80/20             1.0       0.851852                   0.791649
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                     1.0                           0.815921   

   Test Accuracy Average  
0               0.794033  


## Now I will use the Breast Cancer Survival Dataset

In [71]:
cancer_surgery.head()

Unnamed: 0,age,operation_year,pos_auxillary_nodes,survival_status
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


In [72]:
X = cancer_surgery.drop(columns="survival_status")

In [73]:
y = cancer_surgery["survival_status"]

In [74]:
X_train_20, X_test_80, y_train_20, y_test_80 = train_test_split(X, y, test_size=0.80, random_state=11) #20/80 split
X_train_50, X_test_50, y_train_50, y_test_50 = train_test_split(X, y, test_size=0.50, random_state=11) #50/50 split
X_train_80, X_test_20, y_train_80, y_test_20 = train_test_split(X, y, test_size=0.20, random_state=11) #80/20 split

In [75]:
#Scale Data
scalar = StandardScaler()
X_train_20_s = scalar.fit_transform(X_train_20)
X_test_80_s = scalar.transform(X_test_80)

X_train_50_s = scalar.fit_transform(X_train_50)
X_test_50_s = scalar.transform(X_test_50)

X_train_80_s = scalar.fit_transform(X_train_80)
X_test_20_s = scalar.transform(X_test_20)

### Logistic Regression

In [77]:
train_accuracy1_lr4, test_accuracy1_lr4, cross_validation1_lr4 = logistic_regression(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_lr4, test_accuracy2_lr4, cross_validation2_lr4 = logistic_regression(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_lr4, test_accuracy3_lr4, cross_validation3_lr4 = logistic_regression(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_lr4, test_accuracy4_lr4, cross_validation4_lr4 = logistic_regression(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_lr4, test_accuracy5_lr4, cross_validation5_lr4 = logistic_regression(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_lr4, test_accuracy6_lr4, cross_validation6_lr4 = logistic_regression(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_lr4, test_accuracy7_lr4, cross_validation7_lr4 = logistic_regression(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_lr4, test_accuracy8_lr4, cross_validation8_lr4 = logistic_regression(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_lr4, test_accuracy9_lr4, cross_validation9_lr4 = logistic_regression(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_lr4 = (train_accuracy1_lr4 + train_accuracy2_lr4 + train_accuracy3_lr4 + train_accuracy4_lr4 + train_accuracy5_lr4 + train_accuracy6_lr4 + train_accuracy7_lr4 + train_accuracy8_lr4 + train_accuracy9_lr4) / 9

cv_average_lr4 = (cross_validation1_lr4 + cross_validation2_lr4 + cross_validation3_lr4 + cross_validation4_lr4 + cross_validation5_lr4 + cross_validation6_lr4 + cross_validation7_lr4 + cross_validation8_lr4 + cross_validation9_lr4) / 9

test_average_lr4 = (test_accuracy1_lr4 + test_accuracy2_lr4 + test_accuracy3_lr4 + test_accuracy4_lr4 + test_accuracy5_lr4 + test_accuracy6_lr4 + test_accuracy7_lr4 + test_accuracy8_lr4 + test_accuracy9_lr4) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_lr4, train_accuracy2_lr4, train_accuracy3_lr4, train_accuracy4_lr4, train_accuracy5_lr4, train_accuracy6_lr4, train_accuracy7_lr4, train_accuracy8_lr4, train_accuracy9_lr4],
    "Test Accuracy": [test_accuracy1_lr4, test_accuracy2_lr4, test_accuracy3_lr4, test_accuracy4_lr4, test_accuracy5_lr4, test_accuracy6_lr4, test_accuracy7_lr4, test_accuracy8_lr4, test_accuracy9_lr4],
    "Cross Validation Accuracy": [cross_validation1_lr4, cross_validation2_lr4, cross_validation3_lr4, cross_validation4_lr4, cross_validation5_lr4, cross_validation6_lr4, cross_validation7_lr4, cross_validation8_lr4, cross_validation9_lr4]
})

averages_lr4 = pd.DataFrame({
    "Train Accuracy Average": [train_average_lr4], 
    "Cross Validation Accuracy Average": [cv_average_lr4],
    "Test Accuracy Average": [test_average_lr4]
})

print(results)
print(averages_lr4)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80        0.721311        0.75102                   0.719231
1     50/50        0.721311        0.75102                   0.719231
2     80/20        0.721311        0.75102                   0.719231
3     20/80        0.732026        0.75817                   0.725806
4     50/50        0.732026        0.75817                   0.725806
5     80/20        0.732026        0.75817                   0.725806
6     20/80        0.729508        0.83871                   0.729592
7     50/50        0.729508        0.83871                   0.729592
8     80/20        0.729508        0.83871                   0.729592
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                0.727615                           0.724876   

   Test Accuracy Average  
0               0.782633  


### Random Forests

In [79]:
train_accuracy1_rf4, test_accuracy1_rf4, cross_validation1_rf4 = random_forest(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_rf4, test_accuracy2_rf4, cross_validation2_rf4 = random_forest(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_rf4, test_accuracy3_rf4, cross_validation3_rf4 = random_forest(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_rf4, test_accuracy4_rf4, cross_validation4_rf4 = random_forest(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_rf4, test_accuracy5_rf4, cross_validation5_rf4 = random_forest(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_rf4, test_accuracy6_rf4, cross_validation6_rf4 = random_forest(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_rf4, test_accuracy7_rf4, cross_validation7_rf4 = random_forest(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_rf4, test_accuracy8_rf4, cross_validation8_rf4 = random_forest(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_rf4, test_accuracy9_rf4, cross_validation9_rf4 = random_forest(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_rf4 = (train_accuracy1_rf4 + train_accuracy2_rf4 + train_accuracy3_rf4 + train_accuracy4_rf4 + train_accuracy5_rf4 + train_accuracy6_rf4 + train_accuracy7_rf4 + train_accuracy8_rf4 + train_accuracy9_rf4) / 9

cv_average_rf4 = (cross_validation1_rf4 + cross_validation2_rf4 + cross_validation3_rf4 + cross_validation4_rf4 + cross_validation5_rf4 + cross_validation6_rf4 + cross_validation7_rf4 + cross_validation8_rf4 + cross_validation9_rf4) / 9

test_average_rf4 = (test_accuracy1_rf4 + test_accuracy2_rf4 + test_accuracy3_rf4 + test_accuracy4_rf4 + test_accuracy5_rf4 + test_accuracy6_rf4 + test_accuracy7_rf4 + test_accuracy8_rf4 + test_accuracy9_rf4) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_rf4, train_accuracy2_rf4, train_accuracy3_rf4, train_accuracy4_rf4, train_accuracy5_rf4, train_accuracy6_rf4, train_accuracy7_rf4, train_accuracy8_rf4, train_accuracy9_rf4],
    "Test Accuracy": [test_accuracy1_rf4, test_accuracy2_rf4, test_accuracy3_rf4, test_accuracy4_rf4, test_accuracy5_rf4, test_accuracy6_rf4, test_accuracy7_rf4, test_accuracy8_rf4, test_accuracy9_rf4],
    "Cross Validation Accuracy": [cross_validation1_rf4, cross_validation2_rf4, cross_validation3_rf4, cross_validation4_rf4, cross_validation5_rf4, cross_validation6_rf4, cross_validation7_rf4, cross_validation8_rf4, cross_validation9_rf4]
})

averages_rf4 = pd.DataFrame({
    "Train Accuracy Average": [train_average_rf4], 
    "Cross Validation Accuracy Average": [cv_average_rf4],
    "Test Accuracy Average": [test_average_rf4]
})

print(results)
print(averages_rf4)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80        0.770492       0.771429                   0.706410
1     50/50        0.786885       0.742857                   0.689744
2     80/20        0.819672       0.742857                   0.706410
3     20/80        0.797386       0.764706                   0.745376
4     50/50        0.810458       0.764706                   0.752043
5     80/20        0.797386       0.764706                   0.752043
6     20/80        0.782787       0.838710                   0.729507
7     50/50        0.795082       0.822581                   0.737670
8     80/20        0.807377       0.806452                   0.733588
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                0.796392                           0.728088   

   Test Accuracy Average  
0               0.779889  


### SVM 

In [81]:
train_accuracy1_svm4, test_accuracy1_svm4, cross_validation1_svm4 = svm(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_svm4, test_accuracy2_svm4, cross_validation2_svm4 = svm(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_svm4, test_accuracy3_svm4, cross_validation3_svm4 = svm(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_svm4, test_accuracy4_svm4, cross_validation4_svm4 = svm(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_svm4, test_accuracy5_svm4, cross_validation5_svm4 = svm(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_svm4, test_accuracy6_svm4, cross_validation6_svm4 = svm(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_svm4, test_accuracy7_svm4, cross_validation7_svm4 = svm(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_svm4, test_accuracy8_svm4, cross_validation8_svm4 = svm(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_svm4, test_accuracy9_svm4, cross_validation9_svm4 = svm(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_svm4 = (train_accuracy1_svm4 + train_accuracy2_svm4 + train_accuracy3_svm4 + train_accuracy4_svm4 + train_accuracy5_svm4 + train_accuracy6_svm4 + train_accuracy7_svm4 + train_accuracy8_svm4 + train_accuracy9_svm4) / 9

cv_average_svm4 = (cross_validation1_svm4 + cross_validation2_svm4 + cross_validation3_svm4 + cross_validation4_svm4 + cross_validation5_svm4 + cross_validation6_svm4 + cross_validation7_svm4 + cross_validation8_svm4 + cross_validation9_svm4) / 9

test_average_svm4 = (test_accuracy1_svm4 + test_accuracy2_svm4 + test_accuracy3_svm4 + test_accuracy4_svm4 + test_accuracy5_svm4 + test_accuracy6_svm4 + test_accuracy7_svm4 + test_accuracy8_svm4 + test_accuracy9_svm4) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_svm4, train_accuracy2_svm4, train_accuracy3_svm4, train_accuracy4_svm4, train_accuracy5_svm4, train_accuracy6_svm4, train_accuracy7_svm4, train_accuracy8_svm4, train_accuracy9_svm4],
    "Test Accuracy": [test_accuracy1_svm4, test_accuracy2_svm4, test_accuracy3_svm4, test_accuracy4_svm4, test_accuracy5_svm4, test_accuracy6_svm4, test_accuracy7_svm4, test_accuracy8_svm4, test_accuracy9_svm4],
    "Cross Validation Accuracy": [cross_validation1_svm4, cross_validation2_svm4, cross_validation3_svm4, cross_validation4_svm4, cross_validation5_svm4, cross_validation6_svm4, cross_validation7_svm4, cross_validation8_svm4, cross_validation9_svm4]
})

averages_svm4 = pd.DataFrame({
    "Train Accuracy Average": [train_average_svm4], 
    "Cross Validation Accuracy Average": [cv_average_svm4],
    "Test Accuracy Average": [test_average_svm4]
})

print(results)
print(averages_svm4)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80        0.704918       0.759184                   0.703846
1     50/50        0.704918       0.759184                   0.703846
2     80/20        0.704918       0.759184                   0.703846
3     20/80        0.751634       0.751634                   0.738710
4     50/50        0.751634       0.751634                   0.738710
5     80/20        0.751634       0.751634                   0.738710
6     20/80        0.778689       0.822581                   0.729337
7     50/50        0.778689       0.822581                   0.729337
8     80/20        0.778689       0.822581                   0.729337
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                 0.74508                           0.723964   

   Test Accuracy Average  
0               0.777799  


### Decision Tree

In [83]:
train_accuracy1_dt4, test_accuracy1_dt4, cross_validation1_dt4 = decision_tree(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_dt4, test_accuracy2_dt4, cross_validation2_dt4 = decision_tree(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_dt4, test_accuracy3_dt4, cross_validation3_dt4 = decision_tree(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_dt4, test_accuracy4_dt4, cross_validation4_dt4 = decision_tree(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_dt4, test_accuracy5_dt4, cross_validation5_dt4 = decision_tree(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_dt4, test_accuracy6_dt4, cross_validation6_dt4 = decision_tree(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_dt4, test_accuracy7_dt4, cross_validation7_dt4 = decision_tree(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_dt4, test_accuracy8_dt4, cross_validation8_dt4 = decision_tree(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_dt4, test_accuracy9_dt4, cross_validation9_dt4 = decision_tree(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_dt4 = (train_accuracy1_dt4 + train_accuracy2_dt4 + train_accuracy3_dt4 + train_accuracy4_dt4 + train_accuracy5_dt4 + train_accuracy6_dt4 + train_accuracy7_dt4 + train_accuracy8_dt4 + train_accuracy9_dt4) / 9

cv_average_dt4 = (cross_validation1_dt4 + cross_validation2_dt4 + cross_validation3_dt4 + cross_validation4_dt4 + cross_validation5_dt4 + cross_validation6_dt4 + cross_validation7_dt4 + cross_validation8_dt4 + cross_validation9_dt4) / 9

test_average_dt4 = (test_accuracy1_dt4 + test_accuracy2_dt4 + test_accuracy3_dt4 + test_accuracy4_dt4 + test_accuracy5_dt4 + test_accuracy6_dt4 + test_accuracy7_dt4 + test_accuracy8_dt4 + test_accuracy9_dt4) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_dt4, train_accuracy2_dt4, train_accuracy3_dt4, train_accuracy4_dt4, train_accuracy5_dt4, train_accuracy6_dt4, train_accuracy7_dt4, train_accuracy8_dt4, train_accuracy9_dt4],
    "Test Accuracy": [test_accuracy1_dt4, test_accuracy2_dt4, test_accuracy3_dt4, test_accuracy4_dt4, test_accuracy5_dt4, test_accuracy6_dt4, test_accuracy7_dt4, test_accuracy8_dt4, test_accuracy9_dt4],
    "Cross Validation Accuracy": [cross_validation1_dt4, cross_validation2_dt4, cross_validation3_dt4, cross_validation4_dt4, cross_validation5_dt4, cross_validation6_dt4, cross_validation7_dt4, cross_validation8_dt4, cross_validation9_dt4]
})

averages_dt4 = pd.DataFrame({
    "Train Accuracy Average": [train_average_dt4], 
    "Cross Validation Accuracy Average": [cv_average_dt4],
    "Test Accuracy Average": [test_average_dt4]
})

print(results)
print(averages_dt4)

  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80        0.704918       0.771429                   0.673077
1     50/50        0.704918       0.771429                   0.673077
2     80/20        0.704918       0.771429                   0.673077
3     20/80        0.764706       0.732026                   0.732043
4     50/50        0.764706       0.732026                   0.732043
5     80/20        0.764706       0.732026                   0.732043
6     20/80        0.774590       0.790323                   0.753997
7     50/50        0.774590       0.790323                   0.753997
8     80/20        0.774590       0.790323                   0.753997
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                0.748071                           0.719706   

   Test Accuracy Average  
0               0.764592  


### Multi-Layer Perceptron

In [85]:
train_accuracy1_mlp4, test_accuracy1_mlp4, cross_validation1_mlp4 = multi_layer_perceptron(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy2_mlp4, test_accuracy2_mlp4, cross_validation2_mlp4 = multi_layer_perceptron(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy3_mlp4, test_accuracy3_mlp4, cross_validation3_mlp4 = multi_layer_perceptron(X_train_20_s, y_train_20, X_test_80_s, y_test_80)

train_accuracy4_mlp4, test_accuracy4_mlp4, cross_validation4_mlp4 = multi_layer_perceptron(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy5_mlp4, test_accuracy5_mlp4, cross_validation5_mlp4 = multi_layer_perceptron(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy6_mlp4, test_accuracy6_mlp4, cross_validation6_mlp4 = multi_layer_perceptron(X_train_50_s, y_train_50, X_test_50_s, y_test_50)

train_accuracy7_mlp4, test_accuracy7_mlp4, cross_validation7_mlp4 = multi_layer_perceptron(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy8_mlp4, test_accuracy8_mlp4, cross_validation8_mlp4 = multi_layer_perceptron(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

train_accuracy9_mlp4, test_accuracy9_mlp4, cross_validation9_mlp4 = multi_layer_perceptron(X_train_80_s, y_train_80, X_test_20_s, y_test_20)

# Averages across all splits
train_average_mlp4 = (train_accuracy1_mlp4 + train_accuracy2_mlp4 + train_accuracy3_mlp4 + train_accuracy4_mlp4 + train_accuracy5_mlp4 + train_accuracy6_mlp4 + train_accuracy7_mlp4 + train_accuracy8_mlp4 + train_accuracy9_mlp4) / 9

cv_average_mlp4 = (cross_validation1_mlp4 + cross_validation2_mlp4 + cross_validation3_mlp4 + cross_validation4_mlp4 + cross_validation5_mlp4 + cross_validation6_mlp4 + cross_validation7_mlp4 + cross_validation8_mlp4 + cross_validation9_mlp4) / 9

test_average_mlp4 = (test_accuracy1_mlp4 + test_accuracy2_mlp4 + test_accuracy3_mlp4 + test_accuracy4_mlp4 + test_accuracy5_mlp4 + test_accuracy6_mlp4 + test_accuracy7_mlp4 + test_accuracy8_mlp4 + test_accuracy9_mlp4) / 9

results = pd.DataFrame({
    "Partition" : ["20/80", "50/50", "80/20", "20/80", "50/50", "80/20", "20/80", "50/50", "80/20"], 
    "Train Accuracy": [train_accuracy1_mlp4, train_accuracy2_mlp4, train_accuracy3_mlp4, train_accuracy4_mlp4, train_accuracy5_mlp4, train_accuracy6_mlp4, train_accuracy7_mlp4, train_accuracy8_mlp4, train_accuracy9_mlp4],
    "Test Accuracy": [test_accuracy1_mlp4, test_accuracy2_mlp4, test_accuracy3_mlp4, test_accuracy4_mlp4, test_accuracy5_mlp4, test_accuracy6_mlp4, test_accuracy7_mlp4, test_accuracy8_mlp4, test_accuracy9_mlp4],
    "Cross Validation Accuracy": [cross_validation1_mlp4, cross_validation2_mlp4, cross_validation3_mlp4, cross_validation4_mlp4, cross_validation5_mlp4, cross_validation6_mlp4, cross_validation7_mlp4, cross_validation8_mlp4, cross_validation9_mlp4]
})

averages_mlp4 = pd.DataFrame({
    "Train Accuracy Average": [train_average_mlp4], 
    "Cross Validation Accuracy Average": [cv_average_mlp4],
    "Test Accuracy Average": [test_average_mlp4]
})

print(results)
print(averages_mlp4)



  Partition  Train Accuracy  Test Accuracy  Cross Validation Accuracy
0     20/80        0.983607       0.571429                   0.639744
1     50/50        0.983607       0.563265                   0.638462
2     80/20        0.983607       0.575510                   0.655128
3     20/80        0.986928       0.712418                   0.607097
4     50/50        0.915033       0.692810                   0.627527
5     80/20        0.986928       0.692810                   0.607527
6     20/80        0.795082       0.822581                   0.643452
7     50/50        0.848361       0.806452                   0.630952
8     80/20        0.799180       0.822581                   0.635204
   Train Accuracy Average  Cross Validation Accuracy Average  \
0                0.920259                           0.631677   

   Test Accuracy Average  
0                0.69554  




## Results Across All Classifiers and Datasets

### Train, Validation, and Test Averages for each Classifer 

In [88]:
train_average_lr = (train_average_lr1 + train_average_lr2 + train_average_lr3 + train_average_lr4) / 4
train_average_rf = (train_average_rf1 + train_average_rf2 + train_average_rf3 + train_average_rf4) / 4
train_average_svm = (train_average_svm1 + train_average_svm2 + train_average_svm3 + train_average_svm4) / 4
train_average_dt = (train_average_dt1 + train_average_dt2 + train_average_dt3 + train_average_dt4) / 4
train_average_mlp = (train_average_mlp1 + train_average_mlp2 + train_average_mlp3 + train_average_mlp4) / 4

cv_average_lr = (cv_average_lr1 + cv_average_lr2 + cv_average_lr3 + cv_average_lr4) / 4
cv_average_rf = (cv_average_rf1 + cv_average_rf2 + cv_average_rf3 + cv_average_lr4) / 4
cv_average_svm = (cv_average_svm1 + cv_average_svm2 + cv_average_svm3 + cv_average_lr4) / 4
cv_average_dt = (cv_average_dt1 + cv_average_dt2 + cv_average_dt3 + cv_average_lr4) / 4
cv_average_mlp = (cv_average_mlp1 + cv_average_mlp2 + cv_average_mlp3 + cv_average_lr4) / 4

test_average_lr = (test_average_lr1 + test_average_lr2 + test_average_lr3 + test_average_lr4) / 4
test_average_rf = (test_average_rf1 + test_average_rf2 + test_average_rf3 + test_average_lr4) / 4
test_average_svm = (test_average_svm1 + test_average_svm2 + test_average_svm3 + test_average_lr4) / 4
test_average_dt = (cv_average_dt1 + cv_average_dt2 + cv_average_dt3 + cv_average_dt4) / 4
test_average_mlp = (cv_average_mlp1 + cv_average_mlp2 + cv_average_mlp3 + cv_average_lr4) / 4


averages = pd.DataFrame({
    "Classifier" : ["Logistic Regression", "Random Forests", "SVM", "Decision Tree", "Multi-Layer Perceptron"],
    "Train Accuracy Average" : [train_average_lr, train_average_rf, train_average_svm, train_average_dt, train_average_mlp], 
    "Cross Validation Accuracy Average" : [cv_average_lr, cv_average_rf, cv_average_svm, cv_average_dt, cv_average_mlp], 
    "Test Accuracy Average" : [test_average_lr, test_average_rf, test_average_svm, test_average_dt, test_average_mlp]
})

print(averages)

               Classifier  Train Accuracy Average  \
0     Logistic Regression                0.874777   
1          Random Forests                0.909979   
2                     SVM                0.894592   
3           Decision Tree                0.876710   
4  Multi-Layer Perceptron                0.980065   

   Cross Validation Accuracy Average  Test Accuracy Average  
0                           0.839792               0.844453  
1                           0.853165               0.860809  
2                           0.865218               0.880037  
3                           0.791422               0.790130  
4                           0.852472               0.852472  
