In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split

## Looking at all our Datasets:
1. Breast Cancer Dataset - looking to see if we can classify if the culture is malignant or benign (2 or 4) based on the attributes
2. Wine Dataset - looking to see if we can classify the class of the wine based on its attributes
3. Student Dataset - Predicting student performance based on various attributes

In [63]:
breast_cancer_df = pd.read_csv("breastcancer/breast-cancer-wisconsin.data", 
                       names = ["Sample code number", "Clump Thickness", "Uniformity of Cell Size", 
                                "Uniformity of Cell Shape", "Marginal Adhesion", 
                                "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", 
                                "Normal Nucleoli", "Mitoses", "Class"])
breast_cancer_df

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [64]:
breast_cancer = breast_cancer_df[breast_cancer_df['Bare Nuclei'] != '?']
breast_cancer

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [65]:
wine_df = pd.read_csv("wine/wine.data", names = ["class", "alcohol", "malic acid", "ash", "alcalinity of ash", "magnesium", 
                                                 "total phenols", "flavanoids", "nonflavanoid phenols", "proanthocyanins",
                                                 "color intensity", "hue", "OD280/OD315 of diluted wines", "proline"])
wine_df = wine_df.reset_index()
wine_df = wine_df.drop(wine_df.columns[0], axis = 1)
wine_df

Unnamed: 0,class,alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,OD280/OD315 of diluted wines,proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [66]:
student_math = pd.read_csv("student/student-mat.csv", delimiter=";")
student_por = pd.read_csv("student/student-por.csv", delimiter=";")
student_math['subject'] = 'Math'
student_por['subject'] = 'Portugese'
student_df = pd.concat([student_math, student_por], axis=0, ignore_index=True)
student_df.columns
student_df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,subject
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,6,5,6,6,Math
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,4,5,5,6,Math
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,2,2,3,3,10,7,8,10,Math
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,2,1,1,5,2,15,14,15,Math
4,GP,F,16,U,GT3,T,3,3,other,other,...,3,2,1,2,5,4,6,10,10,Math
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1039,MS,F,19,R,GT3,T,2,3,services,other,...,4,2,1,2,5,4,10,11,10,Portugese
1040,MS,F,18,U,LE3,T,3,1,teacher,services,...,3,4,1,1,1,4,15,15,16,Portugese
1041,MS,F,18,U,GT3,T,1,1,other,other,...,1,1,1,1,5,6,11,12,9,Portugese
1042,MS,M,17,U,LE3,T,3,1,services,services,...,4,5,3,4,2,6,10,10,10,Portugese


## Wine Dataset - Logistic Regression (50/50, 20/80 and 80/20 split)
## 20/ 20 SPLIT - Trial 1

In [67]:
wine_df = wine_df[wine_df['class'] != 3]
wine_df

Unnamed: 0,class,alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,OD280/OD315 of diluted wines,proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,2,12.07,2.16,2.17,21.0,85,2.60,2.65,0.37,1.35,2.76,0.86,3.28,378
126,2,12.43,1.53,2.29,21.5,86,2.74,3.15,0.39,1.77,3.94,0.69,2.84,352
127,2,11.79,2.13,2.78,28.5,92,2.13,2.24,0.58,1.76,3.00,0.97,2.44,466
128,2,12.37,1.63,2.30,24.5,88,2.22,2.45,0.40,1.90,2.12,0.89,2.78,342


In [68]:
wine_df_shuffle = wine_df.sample(frac = 1).reset_index(drop = True)
wine_df_shuffle

Unnamed: 0,class,alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,OD280/OD315 of diluted wines,proline
0,1,13.83,1.57,2.62,20.0,115,2.95,3.40,0.40,1.72,6.60,1.13,2.57,1130
1,2,12.42,4.43,2.73,26.5,102,2.20,2.13,0.43,1.71,2.08,0.92,3.12,365
2,2,12.51,1.73,1.98,20.5,85,2.20,1.92,0.32,1.48,2.94,1.04,3.57,672
3,2,13.86,1.51,2.67,25.0,86,2.95,2.86,0.21,1.87,3.38,1.36,3.16,410
4,2,12.64,1.36,2.02,16.8,100,2.02,1.41,0.53,0.62,5.75,0.98,1.59,450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,1,13.64,3.10,2.56,15.2,116,2.70,3.03,0.17,1.66,5.10,0.96,3.36,845
126,2,11.46,3.74,1.82,19.5,107,3.18,2.58,0.24,3.58,2.90,0.75,2.81,562
127,2,11.84,2.89,2.23,18.0,112,1.72,1.32,0.43,0.95,2.65,0.96,2.52,500
128,1,12.93,3.80,2.65,18.6,102,2.41,2.41,0.25,1.98,4.50,1.03,3.52,770


In [69]:
wine_X = wine_df_shuffle[wine_df_shuffle.columns[1:]].to_numpy()
wine_Y = wine_df_shuffle[wine_df_shuffle.columns[0]].to_numpy()

In [70]:
X_train, X_test, y_train, y_test = train_test_split(wine_X, wine_Y, test_size=0.2, random_state=42)

In [71]:
#classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [72]:
params = {'C' : [0.001, 0.01, 0.1, 1], 'solver' : ['lbfgs', 'liblinear', 'newton-cg']}

# have to scale data you ran into an error
scaler = StandardScaler()
scaler.fit(X_train)
X_train_new = scaler.transform(X_train)

# call classifier and get the best hyper parameter for this case - combinations of above params
classifier = LogisticRegression(class_weight = 'balanced')
class_hyper_tune = GridSearchCV(classifier, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tune.fit(X_train_new, y_train)
print("training score for best hyperparameter " + str(class_hyper_tune.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tune.cv_results_.get('mean_test_score')[0]))
class_hyper_tune.best_estimator_

#call classifier and get the best hyper parameter for this case - trial 2
# params_C = {'C': [0.001, 0.01, 0.1, 1]}
# classifier1 = LogisticRegression(solver = 'liblinear', class_weight = 'balanced')
# class_hyper_tune1 = GridSearchCV(classifier1, param_grid = params_C, cv = 3, return_train_score = True)
# class_hyper_tune1.fit(X_train_new, y_train)
# print("training score for best hyperparameter " + str(class_hyper_tune1.cv_results_.get('mean_train_score')[0]))
# print("test score for best hyperparameter " + str(class_hyper_tune1.cv_results_.get('mean_test_score')[0]))
# class_hyper_tune1.best_estimator_

# params_C = {'C': [0.001, 0.01, 0.1, 1]}
# classifier2 = LogisticRegression(solver = 'liblinear', class_weight = 'balanced')
# class_hyper_tune2 = GridSearchCV(classifier2, param_grid = params_C, cv = 3, return_train_score = True)
# class_hyper_tune2.fit(X_train_new, y_train)
# print("training score for best hyperparameter " + str(class_hyper_tune2.cv_results_.get('mean_train_score')[0]))
# print("test score for best hyperparameter " + str(class_hyper_tune2.cv_results_.get('mean_test_score')[0]))
# class_hyper_tune2.best_estimator_

training score for best hyperparameter 0.9567977915804002
test score for best hyperparameter 0.9521008403361345


In [73]:
classifier_tuned = LogisticRegression(C = 1, class_weight = 'balanced')
classifier_tuned.fit(X_train, y_train)
classifier_tuned.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


1.0

In [74]:
classifier_tuned.score(X_train, y_train)

0.9807692307692307

In [75]:
wine_df_shuffle_1 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X_1 = wine_df_shuffle_1[wine_df_shuffle_1.columns[1:]].to_numpy()
wine_Y_1 = wine_df_shuffle_1[wine_df_shuffle_1.columns[0]].to_numpy()

In [76]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(wine_X_1, wine_Y_1, test_size=0.2, random_state=42)

classifier_tuned = LogisticRegression(C = 1, class_weight = 'balanced')
classifier_tuned.fit(X_train1, y_train1)
classifier_tuned.score(X_test1, y_test1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9615384615384616

In [77]:
classifier_tuned.score(X_train1, y_train1)

0.9807692307692307

In [78]:
wine_df_shuffle_2 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X_2 = wine_df_shuffle_2[wine_df_shuffle_2.columns[1:]].to_numpy()
wine_Y_2 = wine_df_shuffle_2[wine_df_shuffle_2.columns[0]].to_numpy()
X_train2, X_test2, y_train2, y_test2 = train_test_split(wine_X_2, wine_Y_2, test_size=0.2, random_state=42)
classifier_tuned = LogisticRegression(C = 1, class_weight = 'balanced')
classifier_tuned.fit(X_train2, y_train2)
classifier_tuned.score(X_test2, y_test2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9230769230769231

In [79]:
classifier_tuned.score(X_train2, y_train2)

1.0

In [82]:
avg_testscore = (0.9615384615384616 + 1.0 + 0.9230769230769231) / 3
avgtrain_score = (classifier_tuned.score(X_train, y_train) + classifier_tuned.score(X_train1, y_train1) + classifier_tuned.score(X_train2, y_train2)) / 3
print(avg_testscore)
print(avgtrain_score)

0.9615384615384617
0.9871794871794872


## Wine Dataset 50/50 Split

In [83]:
wine_df_shuffle_50 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_df_shuffle_50

Unnamed: 0,class,alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,OD280/OD315 of diluted wines,proline
0,1,13.28,1.64,2.84,15.5,110,2.60,2.68,0.34,1.36,4.60,1.09,2.78,880
1,2,12.33,0.99,1.95,14.8,136,1.90,1.85,0.35,2.76,3.40,1.06,2.31,750
2,2,12.08,1.13,2.51,24.0,78,2.00,1.58,0.40,1.40,2.20,1.31,2.72,630
3,2,12.00,3.43,2.00,19.0,87,2.00,1.64,0.37,1.87,1.28,0.93,3.05,564
4,1,13.48,1.81,2.41,20.5,100,2.70,2.98,0.26,1.86,5.10,1.04,3.47,920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,1,13.76,1.53,2.70,19.5,132,2.95,2.74,0.50,1.35,5.40,1.25,3.00,1235
126,2,12.25,1.73,2.12,19.0,80,1.65,2.03,0.37,1.63,3.40,1.00,3.17,510
127,1,13.83,1.65,2.60,17.2,94,2.45,2.99,0.22,2.29,5.60,1.24,3.37,1265
128,2,12.29,1.41,1.98,16.0,85,2.55,2.50,0.29,1.77,2.90,1.23,2.74,428


In [84]:
wine_X_50 = wine_df_shuffle_50[wine_df_shuffle_50.columns[1:]].to_numpy()
wine_Y_50 = wine_df_shuffle_50[wine_df_shuffle_50.columns[0]].to_numpy()

In [85]:
X_train, X_test, y_train, y_test = train_test_split(wine_X_50, wine_Y_50, test_size=0.5, random_state=42)

In [86]:
params = {'C' : [0.001, 0.01, 0.1, 1], 'solver' : ['lbfgs', 'liblinear', 'newton-cg']}

# have to scale data you ran into an error
scaler_new = StandardScaler()
scaler_new.fit(X_train)
X_train = scaler.transform(X_train)

# call classifier and get the best hyper parameter for this case - combinations of above params
classifier1 = LogisticRegression(class_weight = 'balanced')
class_hyper_tune1 = GridSearchCV(classifier1, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tune1.fit(X_train, y_train)
print("training score for best hyperparameter " + str(class_hyper_tune1.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tune1.cv_results_.get('mean_test_score')[0]))
class_hyper_tune1.best_estimator_

training score for best hyperparameter 0.9770965468639887
test score for best hyperparameter 0.9696969696969697


In [87]:
classifier_tuned1 = LogisticRegression(C = 0.1, class_weight = 'balanced')
classifier_tuned1.fit(X_train, y_train)
classifier_tuned1.score(X_test, y_test)

0.46153846153846156

In [88]:
wine_df_shuffle_2 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X_2 = wine_df_shuffle_2[wine_df_shuffle_2.columns[1:]].to_numpy()
wine_Y_2 = wine_df_shuffle_2[wine_df_shuffle_2.columns[0]].to_numpy()
X_train2, X_test2, y_train2, y_test2 = train_test_split(wine_X_2, wine_Y_2, test_size=0.5, random_state=42)
classifier_tuned1.fit(X_train2, y_train2)
classifier_tuned1.score(X_test2, y_test2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9538461538461539

In [89]:
wine_df_shuffle_3 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X_3 = wine_df_shuffle_3[wine_df_shuffle_3.columns[1:]].to_numpy()
wine_Y_3 = wine_df_shuffle_3[wine_df_shuffle_3.columns[0]].to_numpy()
X_train3, X_test3, y_train3, y_test3 = train_test_split(wine_X_3, wine_Y_3, test_size=0.5, random_state=42)
classifier_tuned1.fit(X_train3, y_train3)
classifier_tuned1.score(X_test3, y_test3)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9538461538461539

In [90]:
test_avg_score = (0.46153846153846156
 + 0.9538461538461539 + 0.9538461538461539) / 3
test_avg_score

0.7897435897435897

In [91]:
train_avg = (classifier_tuned1.score(X_train, y_train) + classifier_tuned1.score(X_train2, y_train2) + classifier_tuned1.score(X_train3, y_train3)) / 3
train_avg

0.841025641025641

## Wine Dataset 80/20 split (TRIAL 1)

In [92]:
wine_df_shuffle_80 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X_80 = wine_df_shuffle_80[wine_df_shuffle_80.columns[1:]].to_numpy()
wine_Y_80 = wine_df_shuffle_80[wine_df_shuffle_80.columns[0]].to_numpy()
X_train80, X_test80, y_train80, y_test80 = train_test_split(wine_X_80, wine_Y_80, test_size=0.8, random_state=42)

In [93]:
params = {'C' : [0.001, 0.01, 0.1, 1], 'solver' : ['lbfgs', 'liblinear', 'newton-cg']}

# have to scale data you ran into an error
scaler_new80 = StandardScaler()
scaler_new80.fit(X_train80)
X_train80_new = scaler.transform(X_train80)

# call classifier and get the best hyper parameter for this case - combinations of above params
classifier8 = LogisticRegression(class_weight = 'balanced')
class_hyper_tune8 = GridSearchCV(classifier8, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tune8.fit(X_train80, y_train80)
print("training score for best hyperparameter " + str(class_hyper_tune8.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tune8.cv_results_.get('mean_test_score')[0]))
class_hyper_tune8.best_estimator_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

training score for best hyperparameter 0.9041394335511983
test score for best hyperparameter 0.9212962962962963


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [94]:
classifier_tuned8 = LogisticRegression(C = 0.1, class_weight = 'balanced')
classifier_tuned8.fit(X_train80, y_train80)
classifier_tuned8.score(X_test80, y_test80)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8365384615384616

In [95]:
classifier_tuned8.score(X_train80, y_train80)

0.9615384615384616

## Trial 2

In [96]:
wine_df_shuffle_ = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X_ = wine_df_shuffle_[wine_df_shuffle_.columns[1:]].to_numpy()
wine_Y_ = wine_df_shuffle_[wine_df_shuffle_.columns[0]].to_numpy()
X_train_, X_test_, y_train_, y_test_ = train_test_split(wine_X_, wine_Y_, test_size=0.8, random_state=42)
classifier_tuned8.fit(X_train_, y_train_)
classifier_tuned8.score(X_test_, y_test_)

0.9134615384615384

## Trial 3

In [97]:
wine_df_shuffle1 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X1 = wine_df_shuffle1[wine_df_shuffle1.columns[1:]].to_numpy()
wine_Y1 = wine_df_shuffle1[wine_df_shuffle1.columns[0]].to_numpy()
X_train1, X_test1, y_train1, y_test1 = train_test_split(wine_X1, wine_Y1, test_size=0.8, random_state=42)
classifier_tuned8.fit(X_train1, y_train1)
classifier_tuned8.score(X_test1, y_test1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9326923076923077

In [98]:
avg_test_80 = (0.9326923076923077 + 0.9134615384615384 + 0.8365384615384616) / 3
avg_test_80

0.8942307692307693

In [99]:
avg_train_80 = (classifier_tuned8.score(X_train80, y_train80) + classifier_tuned8.score(X_train1, y_train1) + classifier_tuned8.score(X_train_, y_train_)) / 3
avg_train_80

0.9358974358974358

## Now, doing KNN on all the splits, 20/80, 50/50 and 80/20
## 1. KNN on 20/80

In [100]:
from sklearn.neighbors import KNeighborsClassifier
wine_df_shuffled = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xs = wine_df_shuffled[wine_df_shuffled.columns[1:]].to_numpy()
wine_Ys = wine_df_shuffled[wine_df_shuffled.columns[0]].to_numpy()
X_trains, X_tests, y_trains, y_tests = train_test_split(wine_Xs, wine_Ys, test_size=0.2, random_state=42)

params = {'n_neighbors' : [3, 5, 7, 10]}

# call classifier and get the best hyper parameter for this case - combinations of above params
classifiers = KNeighborsClassifier(weights = 'uniform')
class_hyper_tunes = GridSearchCV(classifiers, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tunes.fit(X_trains, y_trains)
print("training score for best hyperparameter " + str(class_hyper_tunes.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tunes.cv_results_.get('mean_test_score')[0]))
class_hyper_tunes.best_estimator_

training score for best hyperparameter 0.9423740510697032
test score for best hyperparameter 0.9324929971988795


In [101]:
tuned_KNN = KNeighborsClassifier(n_neighbors = 10)
tuned_KNN.fit(X_trains, y_trains)
tuned_KNN.score(X_tests, y_tests)

0.9230769230769231

## Trial 2

In [102]:
wine_df_shuffleds = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xss = wine_df_shuffleds[wine_df_shuffleds.columns[1:]].to_numpy()
wine_Yss = wine_df_shuffleds[wine_df_shuffleds.columns[0]].to_numpy()
X_trainss, X_testss, y_trainss, y_testss = train_test_split(wine_Xss, wine_Yss, test_size=0.2, random_state=42)
tuned_KNN.fit(X_trainss, y_trainss)
tuned_KNN.score(X_testss, y_testss)

0.8461538461538461

## Trial 3

In [103]:
wine_df_shuffledsw = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xssw = wine_df_shuffledsw[wine_df_shuffledsw.columns[1:]].to_numpy()
wine_Yssw = wine_df_shuffledsw[wine_df_shuffledsw.columns[0]].to_numpy()
X_trainssw, X_testssw, y_trainssw, y_testssw = train_test_split(wine_Xssw, wine_Yssw, test_size=0.2, random_state=42)
tuned_KNN.fit(X_trainssw, y_trainssw)
tuned_KNN.score(X_testssw, y_testssw)

0.9615384615384616

In [104]:
knn_test_avg_20 = (0.8461538461538461 + 0.9230769230769231 + 0.9615384615384616) / 3
knn_test_avg_20

0.9102564102564102

In [105]:
(tuned_KNN.score(X_trainssw, y_trainssw) + 
tuned_KNN.score(X_trainss, y_trainss) + 
tuned_KNN.score(X_trains, y_trains)) / 3

0.9583333333333334

## KNN 50/50 TRIAL

In [106]:
wine_df_shuf = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xu = wine_df_shuf[wine_df_shuf.columns[1:]].to_numpy()
wine_Yu = wine_df_shuf[wine_df_shuf.columns[0]].to_numpy()
X_trainu, X_testu, y_trainu, y_testu = train_test_split(wine_Xu, wine_Yu, test_size=0.5, random_state=42)

params = {'n_neighbors' : [3, 5, 7, 10]}

# call classifier and get the best hyper parameter for this case - combinations of above params
classifiersu = KNeighborsClassifier(weights = 'uniform')
class_hyper_tunesu = GridSearchCV(classifiersu, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tunesu.fit(X_trainu, y_trainu)
print("training score for best hyperparameter " + str(class_hyper_tunesu.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tunesu.cv_results_.get('mean_test_score')[0]))
class_hyper_tunesu.best_estimator_

training score for best hyperparameter 0.9538407329105004
test score for best hyperparameter 0.9076479076479077


## trial 1

In [107]:
tuned_50 = KNeighborsClassifier(n_neighbors = 3)
tuned_50.fit(X_trainu, y_trainu)
tuned_50.score(X_testu, y_testu)

0.9384615384615385

## trial 2

In [108]:
wine_df_shufl = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xul = wine_df_shufl[wine_df_shufl.columns[1:]].to_numpy()
wine_Yul = wine_df_shufl[wine_df_shufl.columns[0]].to_numpy()
X_trainul, X_testul, y_trainul, y_testul = train_test_split(wine_Xul, wine_Yul, test_size=0.5, random_state=42)
tuned_50.fit(X_trainul, y_trainul)
tuned_50.score(X_testul, y_testul)

0.8615384615384616

## Trial 3

In [109]:
wine_df_shufle = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xule = wine_df_shufle[wine_df_shufle.columns[1:]].to_numpy()
wine_Yule = wine_df_shufle[wine_df_shufle.columns[0]].to_numpy()
X_trainule, X_testule, y_trainule, y_testule = train_test_split(wine_Xule, wine_Yule, test_size=0.5, random_state=42)
tuned_50.fit(X_trainule, y_trainule)
tuned_50.score(X_testule, y_testule)

0.8307692307692308

In [110]:
test_avg_knn_50 = (0.8615384615384616 + 0.9384615384615385 + 0.8307692307692308) / 3
test_avg_knn_50

0.8769230769230769

In [111]:
(tuned_50.score(X_trainul, y_trainul) + 
tuned_50.score(X_trainule, y_trainule) + 
tuned_50.score(X_trainu, y_trainu)) / 3

0.9487179487179488

## Trial 3 KNN 80/20 

In [112]:
from sklearn.neighbors import KNeighborsClassifier
wine_df_shuffled = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xs = wine_df_shuffled[wine_df_shuffled.columns[1:]].to_numpy()
wine_Ys = wine_df_shuffled[wine_df_shuffled.columns[0]].to_numpy()
X_trains, X_tests, y_trains, y_tests = train_test_split(wine_Xs, wine_Ys, test_size=0.2, random_state=42)

params = {'n_neighbors' : [3, 5, 7, 10]}

# have to scale data you ran into an error

# call classifier and get the best hyper parameter for this case - combinations of above params
classifiers = KNeighborsClassifier(weights = 'uniform')
class_hyper_tunes = GridSearchCV(classifiers, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tunes.fit(X_trains, y_trains)
print("training score for best hyperparameter " + str(class_hyper_tunes.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tunes.cv_results_.get('mean_test_score')[0]))
class_hyper_tunes.best_estimator_

training score for best hyperparameter 0.9567287784679089
test score for best hyperparameter 0.942577030812325


In [113]:
tuned_KNN = KNeighborsClassifier(n_neighbors = 10)
tuned_KNN.fit(X_trains, y_trains)
tuned_KNN.score(X_tests, y_tests)

0.9230769230769231

In [114]:
wine_df_shuffleds = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xss = wine_df_shuffleds[wine_df_shuffleds.columns[1:]].to_numpy()
wine_Yss = wine_df_shuffleds[wine_df_shuffleds.columns[0]].to_numpy()
X_trainss, X_testss, y_trainss, y_testss = train_test_split(wine_Xss, wine_Yss, test_size=0.2, random_state=42)
tuned_KNN.fit(X_trainss, y_trainss)
tuned_KNN.score(X_testss, y_testss)

0.9230769230769231

In [115]:
wine_df_shuffledsw = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xssw = wine_df_shuffledsw[wine_df_shuffledsw.columns[1:]].to_numpy()
wine_Yssw = wine_df_shuffledsw[wine_df_shuffledsw.columns[0]].to_numpy()
X_trainssw, X_testssw, y_trainssw, y_testssw = train_test_split(wine_Xssw, wine_Yssw, test_size=0.2, random_state=42)
tuned_KNN.fit(X_trainssw, y_trainssw)
tuned_KNN.score(X_testssw, y_testssw)

0.8846153846153846

In [116]:
test_avg_knn_80 = (0.9230769230769231 + 0.8846153846153846 + 0.9230769230769231) / 3
test_avg_knn_80

0.9102564102564102

In [117]:
(tuned_KNN.score(X_trainssw, y_trainssw) + 
tuned_KNN.score(X_trainss, y_trainss) + 
tuned_KNN.score(X_trains, y_trains)) / 3

0.955128205128205

## Wine Dataset SVM with various splits (20/80, 50/50, 80/20)
## 20/80 split

In [118]:
from sklearn.svm import SVC
wine_df_SVM = wine_df.sample(frac = 1).reset_index(drop = True)
wine_XS = wine_df_SVM[wine_df_SVM.columns[1:]].to_numpy()
wine_YS = wine_df_SVM[wine_df_SVM.columns[0]].to_numpy()
X_trainS, X_testS, y_trainS, y_testS = train_test_split(wine_XS, wine_YS, test_size=0.8, random_state=42)

params = {'C': [0.001, 0.01, 0.1, 1], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}
# scaler_newSVM = StandardScaler()
# scaler_newSVM.fit(X_trainS)
# X_train_SVM = scaler.transform(X_trainS)


# call classifier and get the best hyper parameter for this case - combinations of above params
classifiere = SVC(class_weight = 'balanced')
class_hyper_tunee = GridSearchCV(classifiere, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tunee.fit(X_trainS, y_trainS)
print("training score for best hyperparameter " + str(class_hyper_tunee.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tunee.cv_results_.get('mean_test_score')[0]))
class_hyper_tunee.best_estimator_

training score for best hyperparameter 0.9618736383442266
test score for best hyperparameter 0.9629629629629629


## Trial 1

In [119]:
tuned_SVM = SVC(C = 0.1, kernel = "linear", class_weight = 'balanced')
tuned_SVM.fit(X_trainS, y_trainS)
tuned_SVM.score(X_testS, y_testS)

0.9519230769230769

## Trial 2

In [120]:
wine_df_SVM1 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_XS1 = wine_df_SVM1[wine_df_SVM1.columns[1:]].to_numpy()
wine_YS1 = wine_df_SVM1[wine_df_SVM1.columns[0]].to_numpy()
X_trainS1, X_testS1, y_trainS1, y_testS1 = train_test_split(wine_XS1, wine_YS1, test_size=0.8, random_state=42)

tuned_SVM.fit(X_trainS1, y_trainS1)
tuned_SVM.score(X_testS1, y_testS1)

0.9038461538461539

## Trial 3

In [121]:
wine_df_SVM2 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_XS2 = wine_df_SVM2[wine_df_SVM2.columns[1:]].to_numpy()
wine_YS2 = wine_df_SVM2[wine_df_SVM2.columns[0]].to_numpy()
X_trainS2, X_testS2, y_trainS2, y_testS2 = train_test_split(wine_XS2, wine_YS2, test_size=0.8, random_state=42)

tuned_SVM.fit(X_trainS2, y_trainS2)
tuned_SVM.score(X_testS2, y_testS2)

0.8461538461538461

In [122]:
avg_20_test = (0.8461538461538461 + 0.9038461538461539 + 0.9519230769230769) / 3
avg_20_test

0.9006410256410255

In [123]:
(tuned_SVM.score(X_trainS2, y_trainS2) + 
tuned_SVM.score(X_trainS1, y_trainS1) + 
tuned_SVM.score(X_trainS, y_trainS)) / 3

0.923076923076923

## 50/50 split

In [124]:
wine_df_S = wine_df.sample(frac = 1).reset_index(drop = True)
wine_XSS = wine_df_S[wine_df_S.columns[1:]].to_numpy()
wine_YSS = wine_df_S[wine_df_S.columns[0]].to_numpy()
X_trainSS, X_testSS, y_trainSS, y_testSS = train_test_split(wine_XSS, wine_YSS, test_size=0.5, random_state=42)

params = {'C': [0.001, 0.01, 0.1, 1], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}
# scaler_newSVM = StandardScaler()
# scaler_newSVM.fit(X_trainS)
# X_train_SVM = scaler.transform(X_trainS)


# call classifier and get the best hyper parameter for this case - combinations of above params
classifiers = SVC(class_weight = 'balanced')
class_hyper_tunes = GridSearchCV(classifiers, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tunes.fit(X_trainSS, y_trainSS)
print("training score for best hyperparameter " + str(class_hyper_tunes.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tunes.cv_results_.get('mean_test_score')[0]))
class_hyper_tunes.best_estimator_

training score for best hyperparameter 0.961416490486258
test score for best hyperparameter 0.9227994227994228


## Trial 1

In [125]:
tuned_50 = SVC(class_weight = 'balanced', C = 1, kernel = 'linear')
tuned_50.fit(X_trainSS, y_trainSS)
tuned_50.score(X_testSS, y_testSS)

0.9692307692307692

## Trial 2

In [126]:
wine_df_T = wine_df.sample(frac = 1).reset_index(drop = True)
wine_XST = wine_df_T[wine_df_T.columns[1:]].to_numpy()
wine_YST = wine_df_T[wine_df_T.columns[0]].to_numpy()
X_trainST, X_testST, y_trainST, y_testST = train_test_split(wine_XST, wine_YST, test_size=0.5, random_state=42)

tuned_50.fit(X_trainST, y_trainST)
tuned_50.score(X_testST, y_testST)

0.9692307692307692

## Trial 3

In [127]:
wine_df_U = wine_df.sample(frac = 1).reset_index(drop = True)
wine_XSU = wine_df_U[wine_df_U.columns[1:]].to_numpy()
wine_YSU = wine_df_U[wine_df_U.columns[0]].to_numpy()
X_trainSU, X_testSU, y_trainSU, y_testSU = train_test_split(wine_XSU, wine_YSU, test_size=0.5, random_state=42)

tuned_50.fit(X_trainSU, y_trainSU)
tuned_50.score(X_testSU, y_testSU)

0.9692307692307692

In [128]:
avg_test_50 = (0.9692307692307692 + 0.96923076923076925 + 0.9692307692307692) / 3
avg_test_50

0.9692307692307692

In [129]:
(tuned_50.score(X_trainSU, y_trainSU) + 
 tuned_50.score(X_trainST, y_trainST) + 
tuned_50.score(X_trainSS, y_trainSS)) / 3

0.9897435897435898

## 80/20 split SVM

In [130]:
wine_df_80 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X80 = wine_df_80[wine_df_80.columns[1:]].to_numpy()
wine_Y80 = wine_df_S[wine_df_80.columns[0]].to_numpy()
X_train80, X_test80, y_train80, y_test80 = train_test_split(wine_X80, wine_Y80, test_size=0.2, random_state=42)

params = {'C': [0.001, 0.01, 0.1, 1], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}
# scaler_newSVM = StandardScaler()
# scaler_newSVM.fit(X_trainS)
# X_train_SVM = scaler.transform(X_trainS)


# call classifier and get the best hyper parameter for this case - combinations of above params
classifiers80 = SVC(class_weight = 'balanced')
class_hyper_tune80 = GridSearchCV(classifiers80, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tune80.fit(X_train80, y_train80)
print("training score for best hyperparameter " + str(class_hyper_tune80.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tune80.cv_results_.get('mean_test_score')[0]))
class_hyper_tune80.best_estimator_

training score for best hyperparameter 0.5381642512077295
test score for best hyperparameter 0.5372549019607843


## Trial 1

In [131]:
wine_df_801 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X801 = wine_df_801[wine_df_801.columns[1:]].to_numpy()
wine_Y801 = wine_df_801[wine_df_801.columns[0]].to_numpy()
X_train801, X_test801, y_train801, y_test801 = train_test_split(wine_X801, wine_Y801, test_size=0.2, random_state=42)

classifiers801 = SVC(class_weight = 'balanced')
class_hyper_tune801 = GridSearchCV(classifiers801, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tune801.fit(X_train801, y_train801)
print("training score for best hyperparameter " + str(class_hyper_tune801.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tune801.cv_results_.get('mean_test_score')[0]))
class_hyper_tune801.best_estimator_

training score for best hyperparameter 0.9373360938578329
test score for best hyperparameter 0.9319327731092436


In [132]:
tuned_8020 = SVC(class_weight = 'balanced', kernel = 'linear', C = 1)
tuned_8020.fit(X_train801, y_train801)
tuned_8020.score(X_test801, y_test801)

0.8846153846153846

## Trial 2

In [133]:
wine_df_802 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X802 = wine_df_802[wine_df_802.columns[1:]].to_numpy()
wine_Y802 = wine_df_802[wine_df_802.columns[0]].to_numpy()
X_train802, X_test802, y_train802, y_test802 = train_test_split(wine_X802, wine_Y802, test_size=0.2, random_state=42)

tuned_8020.fit(X_train802, y_train802)
tuned_8020.score(X_test802, y_test802)

0.9615384615384616

## Trial 3

In [134]:
wine_df_803 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X803 = wine_df_803[wine_df_803.columns[1:]].to_numpy()
wine_Y803 = wine_df_803[wine_df_803.columns[0]].to_numpy()
X_train803, X_test803, y_train803, y_test803 = train_test_split(wine_X803, wine_Y803, test_size=0.2, random_state=42)

tuned_8020.fit(X_train803, y_train803)
tuned_8020.score(X_test803, y_test803)

0.9615384615384616

In [135]:
avg_test_80 = (0.8846153846153846 + 0.9615384615384616 + 0.9615384615384616) / 3
avg_test_80

0.935897435897436

In [136]:
(tuned_8020.score(X_train803, y_train803) + 
 tuned_8020.score(X_train80, y_train80) + 
 tuned_8020.score(X_train801, y_train801)) / 3

0.826923076923077

## Logistic Regression with Breast Cancer Data

In [137]:
breast_cancer_df = breast_cancer_df[breast_cancer_df[breast_cancer_df.columns[6]] != '?']
breast_cancer_df

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [138]:
breast_cancer_df.shape

(683, 11)

In [139]:
cancer_shuffle = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_X = cancer_shuffle[cancer_shuffle.columns[1:10]].to_numpy()
bc_Y = cancer_shuffle[cancer_shuffle.columns[10]].to_numpy()

## 20/80 split

In [140]:
X_trainbc, X_testbc, y_trainbc, y_testbc = train_test_split(bc_X, bc_Y, test_size = 0.8, random_state = 42)
params = {'C' : [0.001, 0.01, 0.1, 1], 'solver' : ['lbfgs', 'liblinear', 'newton-cg']}

# have to scale data you ran into an error
scaler = StandardScaler()
scaler.fit(X_trainbc)
X_train_newbc = scaler.transform(X_trainbc)

# call classifier and get the best hyper parameter for this case - combinations of above params
classi = LogisticRegression(class_weight = 'balanced')
class_hyper_tunei = GridSearchCV(classi, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tunei.fit(X_train_newbc, y_trainbc)
print("training score for best hyperparameter " + str(class_hyper_tunei.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tunei.cv_results_.get('mean_test_score')[0]))
class_hyper_tunei.best_estimator_

training score for best hyperparameter 0.9558811558811559
test score for best hyperparameter 0.955877616747182


## Trial 1

In [141]:
tuned_LR_BC = LogisticRegression(C = 1, class_weight = 'balanced', solver = 'liblinear')
tuned_LR_BC.fit(X_train_newbc, y_trainbc)
tuned_LR_BC.score(X_testbc, y_testbc)

0.340036563071298

## Trial 2

In [142]:
cancer_shuffle1 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_X1 = cancer_shuffle1[cancer_shuffle1.columns[1:10]].to_numpy()
bc_Y1 = cancer_shuffle1[cancer_shuffle1.columns[10]].to_numpy()
X_trainbc1, X_testbc1, y_trainbc1, y_testbc1 = train_test_split(bc_X1, bc_Y1, test_size = 0.8, random_state = 42)

scaler1 = StandardScaler()
scaler.fit(X_trainbc1)
X_train_newbc1 = scaler.transform(X_trainbc1)

tuned_LR_BC.fit(X_train_newbc1, y_trainbc1)
tuned_LR_BC.score(X_testbc1, y_testbc1)

0.3692870201096892

## Trial 3

In [143]:
cancer_shuffle2 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_X2 = cancer_shuffle2[cancer_shuffle2.columns[1:10]].to_numpy()
bc_Y2 = cancer_shuffle2[cancer_shuffle2.columns[10]].to_numpy()
X_trainbc2, X_testbc2, y_trainbc2, y_testbc2 = train_test_split(bc_X2, bc_Y2, test_size = 0.8, random_state = 42)

scaler2 = StandardScaler()
scaler.fit(X_trainbc2)
X_train_newbc2 = scaler.transform(X_trainbc2)

tuned_LR_BC.fit(X_train_newbc2, y_trainbc2)
tuned_LR_BC.score(X_testbc2, y_testbc2)

0.3546617915904936

In [144]:
avg_test_20 = (0.340036563071298 + 0.3692870201096892 + 0.3546617915904936) / 3
avg_test_20

0.35466179159049355

In [145]:
(tuned_LR_BC.score(X_train_newbc2, y_trainbc2) + 
tuned_LR_BC.score(X_train_newbc1, y_trainbc1) + 
tuned_LR_BC.score(X_train_newbc, y_trainbc)) / 3

0.9779411764705883

## 50/50 split

In [146]:
cancer_shuffleds = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xds = cancer_shuffleds[cancer_shuffleds.columns[1:10]].to_numpy()
bc_Yds = cancer_shuffleds[cancer_shuffleds.columns[10]].to_numpy()
X_trainbcds, X_testbcds, y_trainbcds, y_testbcds = train_test_split(bc_Xds, bc_Yds, test_size = 0.5, random_state = 42)
params = {'C' : [0.001, 0.01, 0.1, 1], 'solver' : ['lbfgs', 'liblinear', 'newton-cg']}

# have to scale data you ran into an error
scaler50 = StandardScaler()
scaler50.fit(X_trainbcds)
X_train_newbcds = scaler.transform(X_trainbcds)

# call classifier and get the best hyper parameter for this case - combinations of above params
classier = LogisticRegression(class_weight = 'balanced')
classier_hyper_tune = GridSearchCV(classier, param_grid = params, cv = 3, return_train_score = True)
classier_hyper_tune.fit(X_trainbcds, y_trainbcds)
print("training score for best hyperparameter " + str(classier_hyper_tune.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(classier_hyper_tune.cv_results_.get('mean_test_score')[0]))
classier_hyper_tune.best_estimator_

training score for best hyperparameter 0.9574864105932966
test score for best hyperparameter 0.9589866997878177


## Trial 1

In [147]:
tuned_50_LR = LogisticRegression(C = 0.01, class_weight = 'balanced', solver = 'lbfgs')
tuned_50_LR.fit(X_trainbcds, y_trainbcds)
tuned_50_LR.score(X_testbcds, y_testbcds)

0.9824561403508771

## Trial 2

In [148]:
cancer_shuffled50s = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd50s = cancer_shuffled50s[cancer_shuffled50s.columns[1:10]].to_numpy()
bc_Yd50s = cancer_shuffled50s[cancer_shuffled50s.columns[10]].to_numpy()
X_train50s, X_test50s, y_train50s, y_test50s = train_test_split(bc_Xd50s, bc_Yd50s, test_size = 0.5, random_state = 42)

tuned_50_LR.fit(X_train50s, y_train50s)
tuned_50_LR.score(X_test50s, y_test50s)

0.9883040935672515

## Trial 3

In [149]:
cancer_shuffle5s = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_X5s = cancer_shuffle5s[cancer_shuffle5s.columns[1:10]].to_numpy()
bc_Y5s = cancer_shuffle5s[cancer_shuffle5s.columns[10]].to_numpy()
X_train5s, X_test5s, y_train5s, y_test5s = train_test_split(bc_X5s, bc_Y5s, test_size = 0.5, random_state = 42)

tuned_50_LR.fit(X_train5s, y_train5s)
tuned_50_LR.score(X_test5s, y_test5s)

0.9590643274853801

In [150]:
avg_test_svm_50 = (0.9883040935672515 + 0.9824561403508771 + 0.9590643274853801) / 3
avg_test_svm_50

0.9766081871345028

In [151]:
(tuned_50_LR.score(X_train5s, y_train5s) + 
tuned_50_LR.score(X_train50s, y_train50s) + 
tuned_50_LR.score(X_trainbcds, y_trainbcds)) / 3

0.9618768328445748

## 80/20 split

In [152]:
cancer_shuffle20 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_X20 = cancer_shuffle20[cancer_shuffle20.columns[1:10]].to_numpy()
bc_Y20 = cancer_shuffle20[cancer_shuffle20.columns[10]].to_numpy()
X_train20, X_test20, y_train20, y_test20 = train_test_split(bc_X20, bc_Y20, test_size = 0.2, random_state = 42)
params = {'C' : [0.001, 0.01, 0.1, 1], 'solver' : ['lbfgs', 'liblinear', 'newton-cg']}

# have to scale data you ran into an error
scaler20 = StandardScaler()
scaler20.fit(X_train20)
X_train_new20 = scaler.transform(X_train20)

# call classifier and get the best hyper parameter for this case - combinations of above params
classier20 = LogisticRegression(class_weight = 'balanced')
classier20_hyper_tune = GridSearchCV(classier20, param_grid = params, cv = 3, return_train_score = True)
classier20_hyper_tune.fit(X_train20, y_train20)
print("training score for best hyperparameter " + str(classier20_hyper_tune.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(classier20_hyper_tune.cv_results_.get('mean_test_score')[0]))
classier20_hyper_tune.best_estimator_

training score for best hyperparameter 0.9734432234432234
test score for best hyperparameter 0.9725274725274725


## Trial 1

In [153]:
tuned_20_LR = LogisticRegression(C = 0.1, class_weight = 'balanced', solver = 'lbfgs')
tuned_20_LR.fit(X_train20, y_train20)
tuned_20_LR.score(X_test20, y_test20)

0.948905109489051

## Trial 2

In [154]:
cancer_shuffle21 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_X21 = cancer_shuffle21[cancer_shuffle21.columns[1:10]].to_numpy()
bc_Y21 = cancer_shuffle21[cancer_shuffle21.columns[10]].to_numpy()
X_train21, X_test21, y_train21, y_test21 = train_test_split(bc_X21, bc_Y21, test_size = 0.2, random_state = 42)

tuned_20_LR.fit(X_train21, y_train21)
tuned_20_LR.score(X_test21, y_test21)

0.9927007299270073

## Trial 3

In [155]:
cancer_shuffle22 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_X22 = cancer_shuffle22[cancer_shuffle22.columns[1:10]].to_numpy()
bc_Y22 = cancer_shuffle22[cancer_shuffle22.columns[10]].to_numpy()
X_train22, X_test22, y_train22, y_test22 = train_test_split(bc_X22, bc_Y22, test_size = 0.2, random_state = 42)

tuned_20_LR.fit(X_train22, y_train22)
tuned_20_LR.score(X_test22, y_test22)

0.9781021897810219

In [156]:
avg_test_80 = (0.948905109489051 + 0.9927007299270073 + 0.9781021897810219)/ 3
avg_test_80

0.9732360097323601

In [157]:
(tuned_20_LR.score(X_train22, y_train22) + 
 tuned_20_LR.score(X_train21, y_train21) + 
tuned_20_LR.score(X_train20, y_train20)) / 3

0.9719169719169719

## KNN with Beast Cancer Data with all splits
## 20/80

In [158]:
cancer_shuffled = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd = cancer_shuffled[cancer_shuffled.columns[1:10]].to_numpy()
bc_Yd = cancer_shuffled[cancer_shuffled.columns[10]].to_numpy()
X_traind, X_testd, y_traind, y_testd = train_test_split(bc_Xd, bc_Yd, test_size = 0.8, random_state = 42)

params = {'n_neighbors' : [3, 5, 7, 10]}

# have to scale data you ran into an error

# call classifier and get the best hyper parameter for this case - combinations of above params
classifierd = KNeighborsClassifier(weights = 'uniform')
class_hyper_tuned = GridSearchCV(classifierd, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tuned.fit(X_traind, y_traind)
print("training score for best hyperparameter " + str(class_hyper_tuned.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuned.cv_results_.get('mean_test_score')[0]))
class_hyper_tuned.best_estimator_

training score for best hyperparameter 0.9816035816035816
test score for best hyperparameter 0.9632850241545894


## Trial 1

In [159]:
tuned_KNN = KNeighborsClassifier(n_neighbors = 3, weights = 'uniform')
tuned_KNN.fit(X_traind, y_traind)
tuned_KNN.score(X_testd, y_testd)

0.9634369287020109

## Trial 2

In [160]:
cancer_shuffled3 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd3 = cancer_shuffled3[cancer_shuffled3.columns[1:10]].to_numpy()
bc_Yd3 = cancer_shuffled3[cancer_shuffled3.columns[10]].to_numpy()
X_traind3, X_testd3, y_traind3, y_testd3 = train_test_split(bc_Xd3, bc_Yd3, test_size = 0.8, random_state = 42)

tuned_KNN.fit(X_traind3, y_traind3)
tuned_KNN.score(X_testd3, y_testd3)

0.9707495429616088

## Trial 3

In [161]:
cancer_shuffled4 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd4 = cancer_shuffled4[cancer_shuffled4.columns[1:10]].to_numpy()
bc_Yd4 = cancer_shuffled4[cancer_shuffled4.columns[10]].to_numpy()
X_traind4, X_testd4, y_traind4, y_testd4 = train_test_split(bc_Xd4, bc_Yd4, test_size = 0.8, random_state = 42)

tuned_KNN.fit(X_traind4, y_traind4)
tuned_KNN.score(X_testd4, y_testd4)

0.9725776965265083

In [162]:
avg_test_knn_20 = (0.9634369287020109 + 0.9707495429616088 + 0.9725776965265083) / 3
avg_test_knn_20

0.9689213893967094

In [163]:
(tuned_KNN.score(X_traind4, y_traind4) + 
tuned_KNN.score(X_traind3, y_traind3) + 
tuned_KNN.score(X_traind, y_traind)) / 3

0.9803921568627452

## 50/50 split
## Trial 1

In [164]:
cancer_shuffled4 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd4 = cancer_shuffled4[cancer_shuffled4.columns[1:10]].to_numpy()
bc_Yd4 = cancer_shuffled4[cancer_shuffled4.columns[10]].to_numpy()
X_traind4, X_testd4, y_traind4, y_testd4 = train_test_split(bc_Xd4, bc_Yd4, test_size = 0.5, random_state = 42)

params = {'n_neighbors' : [3, 5, 7, 10]}

# have to scale data you ran into an error

# call classifier and get the best hyper parameter for this case - combinations of above params
classifier4 = KNeighborsClassifier(weights = 'uniform')
class_hyper_tuned4 = GridSearchCV(classifier4, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tuned4.fit(X_traind4, y_traind4)
print("training score for best hyperparameter " + str(class_hyper_tuned4.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuned4.cv_results_.get('mean_test_score')[0]))
class_hyper_tuned4.best_estimator_

training score for best hyperparameter 0.9750882345364144
test score for best hyperparameter 0.9354913833255706


In [165]:
tuned_50 = KNeighborsClassifier(n_neighbors = 3, weights ='uniform')
tuned_50.fit(X_traind4, y_traind4)
tuned_50.score(X_testd4, y_testd4)

0.9736842105263158

## Trial 2

In [166]:
cancer_shuffled5 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd5 = cancer_shuffled5[cancer_shuffled5.columns[1:10]].to_numpy()
bc_Yd5 = cancer_shuffled5[cancer_shuffled5.columns[10]].to_numpy()
X_traind5, X_testd5, y_traind5, y_testd5 = train_test_split(bc_Xd5, bc_Yd5, test_size = 0.5, random_state = 42)

tuned_50.fit(X_traind5, y_traind5)
tuned_50.score(X_testd5, y_testd5)

0.9736842105263158

## Trial 3

In [167]:
cancer_shuffled6 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd6 = cancer_shuffled6[cancer_shuffled6.columns[1:10]].to_numpy()
bc_Yd6 = cancer_shuffled6[cancer_shuffled6.columns[10]].to_numpy()
X_traind6, X_testd6, y_traind6, y_testd6 = train_test_split(bc_Xd6, bc_Yd6, test_size = 0.5, random_state = 42)

tuned_50.fit(X_traind6, y_traind6)
tuned_50.score(X_testd6, y_testd6)

0.9619883040935673

In [168]:
avg_test_50_knn = (0.9736842105263158 + 0.9736842105263158 + 0.9619883040935673) / 3
avg_test_50_knn

0.969785575048733

In [169]:
(tuned_50.score(X_traind6, y_traind6) + 
tuned_50.score(X_traind5, y_traind5) + 
tuned_50.score(X_traind4, y_traind4)) / 3

0.9716520039100685

## 80/20 split

In [170]:
cancer_shuffled8 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd8 = cancer_shuffled8[cancer_shuffled8.columns[1:10]].to_numpy()
bc_Yd8 = cancer_shuffled8[cancer_shuffled8.columns[10]].to_numpy()
X_traind8, X_testd8, y_traind8, y_testd8 = train_test_split(bc_Xd8, bc_Yd8, test_size = 0.2, random_state = 42)

params = {'n_neighbors' : [3, 5, 7, 10]}

# have to scale data you ran into an error

# call classifier and get the best hyper parameter for this case - combinations of above params
classifier8 = KNeighborsClassifier(weights = 'uniform')
class_hyper_tuned8 = GridSearchCV(classifier8, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tuned8.fit(X_traind8, y_traind8)
print("training score for best hyperparameter " + str(class_hyper_tuned8.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuned8.cv_results_.get('mean_test_score')[0]))
class_hyper_tuned8.best_estimator_

training score for best hyperparameter 0.9798534798534798
test score for best hyperparameter 0.967032967032967


## Trial 1

In [171]:
tuned_20 = KNeighborsClassifier(n_neighbors = 5, weights = 'uniform')
tuned_20.fit(X_traind8, y_traind8)
tuned_20.score(X_testd8, y_testd8)

0.9635036496350365

## Trial 2

In [172]:
cancer_shuffled9 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd9 = cancer_shuffled9[cancer_shuffled9.columns[1:10]].to_numpy()
bc_Yd9 = cancer_shuffled9[cancer_shuffled9.columns[10]].to_numpy()
X_traind9, X_testd9, y_traind9, y_testd9 = train_test_split(bc_Xd9, bc_Yd9, test_size = 0.2, random_state = 42)

tuned_20.fit(X_traind9, y_traind9)
tuned_20.score(X_testd9, y_testd9)

0.9854014598540146

## Trial 3

In [173]:
cancer_shuffled10 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd10 = cancer_shuffled10[cancer_shuffled10.columns[1:10]].to_numpy()
bc_Yd10 = cancer_shuffled10[cancer_shuffled10.columns[10]].to_numpy()
X_traind10, X_testd10, y_traind10, y_testd10 = train_test_split(bc_Xd10, bc_Yd10, test_size = 0.2, random_state = 42)

tuned_20.fit(X_traind10, y_traind10)
tuned_20.score(X_testd10, y_testd10)

0.9708029197080292

In [174]:
avg_knn_80_avg = (0.9635036496350365 + 0.9854014598540146 + 0.9708029197080292) / 3
avg_knn_80_avg

0.9732360097323601

In [175]:
(tuned_20.score(X_traind10, y_traind10) + 
tuned_20.score(X_traind9, y_traind9) + 
tuned_20.score(X_traind8, y_traind8)) / 3

0.9755799755799756

## SVM with Breast Cancer Dataset
## 80/20 split

In [176]:
breast_cancer_df80 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X80 = breast_cancer_df80[breast_cancer_df80.columns[1:10]].to_numpy()
breast_Y80 = breast_cancer_df80[breast_cancer_df80.columns[10]].to_numpy()
X_trainb80, X_testb80, y_trainb80, y_testb80 = train_test_split(breast_X80, breast_Y80, test_size=0.2, random_state=42)

params = {'C': [0.001, 0.01, 0.1, 1], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}
# scaler_newSVM = StandardScaler()
# scaler_newSVM.fit(X_trainS)
# X_train_SVM = scaler.transform(X_trainS)


# call classifier and get the best hyper parameter for this case - combinations of above params
classifierb80 = SVC(class_weight = 'balanced')
class_hyper_tuneb80 = GridSearchCV(classifierb80, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tuneb80.fit(X_trainb80, y_trainb80)
print("training score for best hyperparameter " + str(class_hyper_tuneb80.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuneb80.cv_results_.get('mean_test_score')[0]))
class_hyper_tuneb80.best_estimator_

training score for best hyperparameter 0.9652014652014652
test score for best hyperparameter 0.9615384615384616


In [177]:
tuned_SVC80 = SVC(C = 0.01, class_weight = 'balanced', kernel = 'linear')
tuned_SVC80.fit(X_trainb80, y_trainb80)
tuned_SVC80.score(X_testb80, y_testb80)

1.0

## Trial 2

In [178]:
breast_cancer_df801 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X801 = breast_cancer_df801[breast_cancer_df801.columns[1:10]].to_numpy()
breast_Y801 = breast_cancer_df801[breast_cancer_df801.columns[10]].to_numpy()
X_trainb801, X_testb801, y_trainb801, y_testb801 = train_test_split(breast_X801, breast_Y801, test_size=0.2, random_state=42)
tuned_SVC80.fit(X_trainb801, y_trainb801)
tuned_SVC80.score(X_testb801, y_testb801)

0.948905109489051

## Trial 3

In [179]:
breast_cancer_df802 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X802 = breast_cancer_df802[breast_cancer_df802.columns[1:10]].to_numpy()
breast_Y802 = breast_cancer_df802[breast_cancer_df802.columns[10]].to_numpy()
X_trainb802, X_testb802, y_trainb802, y_testb802 = train_test_split(breast_X802, breast_Y802, test_size=0.2, random_state=42)
tuned_SVC80.fit(X_trainb802, y_trainb802)
tuned_SVC80.score(X_testb802, y_testb802)

0.9562043795620438

In [180]:
avg_svm_test_20 = (1.0 + 0.948905109489051 + 0.9562043795620438) / 3
avg_svm_test_20

0.9683698296836983

In [181]:
(tuned_SVC80.score(X_trainb802, y_trainb802) + 
tuned_SVC80.score(X_trainb801, y_trainb801) + 
tuned_SVC80.score(X_trainb80, y_trainb80)) / 3

0.9731379731379731

## 50/50 split

In [182]:
breast_cancer_df805 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X805 = breast_cancer_df805[breast_cancer_df805.columns[1:10]].to_numpy()
breast_Y805 = breast_cancer_df805[breast_cancer_df805.columns[10]].to_numpy()
X_trainb805, X_testb805, y_trainb805, y_testb805 = train_test_split(breast_X805, breast_Y805, test_size=0.5, random_state=42)

params = {'C': [0.001, 0.01, 0.1, 1], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}
# scaler_newSVM = StandardScaler()
# scaler_newSVM.fit(X_trainS)
# X_train_SVM = scaler.transform(X_trainS)


# call classifier and get the best hyper parameter for this case - combinations of above params
classifierb805 = SVC(class_weight = 'balanced')
class_hyper_tuneb805 = GridSearchCV(classifierb805, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tuneb805.fit(X_trainb805, y_trainb805)
print("training score for best hyperparameter " + str(class_hyper_tuneb805.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuneb805.cv_results_.get('mean_test_score')[0]))
class_hyper_tuneb805.best_estimator_

training score for best hyperparameter 0.9648027925908752
test score for best hyperparameter 0.964782901205817


## Trial 1

In [183]:
#classifier
tuned_SVC801 = SVC(C = 1, class_weight = 'balanced', kernel = 'rbf')
tuned_SVC801.fit(X_trainb805, y_trainb805)
tuned_SVC801.score(X_testb805, y_testb805)

0.9766081871345029

## Trial 2

In [184]:
breast_cancer_df806 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X806 = breast_cancer_df806[breast_cancer_df806.columns[1:10]].to_numpy()
breast_Y806 = breast_cancer_df806[breast_cancer_df806.columns[10]].to_numpy()
X_trainb806, X_testb806, y_trainb806, y_testb806 = train_test_split(breast_X806, breast_Y806, test_size=0.5, random_state=42)

tuned_SVC801.fit(X_trainb806, y_trainb806)
tuned_SVC801.score(X_testb806, y_testb806)

0.9707602339181286

## Trial 3

In [185]:
breast_cancer_df807 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X807 = breast_cancer_df807[breast_cancer_df807.columns[1:10]].to_numpy()
breast_Y807 = breast_cancer_df807[breast_cancer_df807.columns[10]].to_numpy()
X_trainb807, X_testb807, y_trainb807, y_testb807 = train_test_split(breast_X807, breast_Y807, test_size=0.5, random_state=42)

tuned_SVC801.fit(X_trainb807, y_trainb807)
tuned_SVC801.score(X_testb807, y_testb807)

0.9707602339181286

In [186]:
avg_50_svm_test = (0.9707602339181286 + 0.9766081871345029 + 0.9707602339181286) / 3
avg_50_svm_test

0.9727095516569201

In [187]:
(tuned_SVC801.score(X_trainb807, y_trainb807) +
tuned_SVC801.score(X_trainb806, y_trainb806) + 
tuned_SVC801.score(X_trainb805, y_trainb805)
) / 3

0.9657869012707723

## 20/80 split

In [188]:
breast_cancer_df808 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X808 = breast_cancer_df808[breast_cancer_df808.columns[1:10]].to_numpy()
breast_Y808 = breast_cancer_df808[breast_cancer_df808.columns[10]].to_numpy()
X_trainb808, X_testb808, y_trainb808, y_testb808 = train_test_split(breast_X808, breast_Y808, test_size=0.8, random_state=42)

params = {'C': [0.001, 0.01, 0.1, 1], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}
# scaler_newSVM = StandardScaler()
# scaler_newSVM.fit(X_trainS)
# X_train_SVM = scaler.transform(X_trainS)


# call classifier and get the best hyper parameter for this case - combinations of above params
classifierb808 = SVC(class_weight = 'balanced')
class_hyper_tuneb808 = GridSearchCV(classifierb808, param_grid = params, cv = 2, return_train_score = True)
class_hyper_tuneb808.fit(X_trainb808, y_trainb808)
print("training score for best hyperparameter " + str(class_hyper_tuneb808.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuneb808.cv_results_.get('mean_test_score')[0]))
class_hyper_tuneb808.best_estimator_

training score for best hyperparameter 0.9705882352941176
test score for best hyperparameter 0.9705882352941176


## Trial 1

In [189]:
tuned_SVC808 = SVC(C = 0.001, class_weight = 'balanced', kernel = 'linear')
tuned_SVC808.fit(X_trainb808, y_trainb808)
tuned_SVC808.score(X_testb808, y_testb808)

0.9634369287020109

## Trial 2

In [190]:
breast_cancer_df809 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X809 = breast_cancer_df809[breast_cancer_df809.columns[1:10]].to_numpy()
breast_Y809 = breast_cancer_df809[breast_cancer_df809.columns[10]].to_numpy()
X_trainb809, X_testb809, y_trainb809, y_testb809 = train_test_split(breast_X809, breast_Y809, test_size=0.8, random_state=42)

tuned_SVC808.fit(X_trainb809, y_trainb809)
tuned_SVC808.score(X_testb809, y_testb809)

0.9616087751371115

## Trial 3

In [191]:
breast_cancer_df810 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X810 = breast_cancer_df810[breast_cancer_df810.columns[1:10]].to_numpy()
breast_Y810 = breast_cancer_df810[breast_cancer_df810.columns[10]].to_numpy()
X_trainb810, X_testb810, y_trainb810, y_testb810 = train_test_split(breast_X810, breast_Y810, test_size=0.8, random_state=42)

tuned_SVC808.fit(X_trainb810, y_trainb810)
tuned_SVC808.score(X_testb810, y_testb810)

0.9652650822669104

In [192]:
avg_test_80_svm = (0.9616087751371115 + 0.9634369287020109 + 0.9652650822669104) / 3
avg_test_80_svm

0.963436928702011

In [193]:
(tuned_SVC808.score(X_trainb809, y_trainb809) + 
tuned_SVC808.score(X_trainb810, y_trainb810) + 
tuned_SVC808.score(X_trainb808, y_trainb808)) / 3

0.9681372549019608

## Logistic Regression on Student data with splits (20/80, 50/50, 80/20)


In [194]:
student_df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,subject
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,6,5,6,6,Math
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,4,5,5,6,Math
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,2,2,3,3,10,7,8,10,Math
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,2,1,1,5,2,15,14,15,Math
4,GP,F,16,U,GT3,T,3,3,other,other,...,3,2,1,2,5,4,6,10,10,Math
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1039,MS,F,19,R,GT3,T,2,3,services,other,...,4,2,1,2,5,4,10,11,10,Portugese
1040,MS,F,18,U,LE3,T,3,1,teacher,services,...,3,4,1,1,1,4,15,15,16,Portugese
1041,MS,F,18,U,GT3,T,1,1,other,other,...,1,1,1,1,5,6,11,12,9,Portugese
1042,MS,M,17,U,LE3,T,3,1,services,services,...,4,5,3,4,2,6,10,10,10,Portugese


In [195]:
student_shuffle = student_df.sample(frac = 1).reset_index(drop = True)
X = student_shuffle.drop(['G3'], axis=1).select_dtypes(include='number')
y = student_shuffle['G3'].apply(lambda x: 1 if x >= 10 else 0)

## 80/20 split

In [196]:
from sklearn.metrics import classification_report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Set up hyperparameter grid
params = {'C': [0.001, 0.01, 0.1, 1, 10], 'solver': ['lbfgs', 'liblinear', 'newton-cg']}
classifier = LogisticRegression(class_weight='balanced')
grid_search = GridSearchCV(classifier, param_grid=params, cv=3, return_train_score=True)
grid_search.fit(X_train_scaled, y_train)
print("training score for best hyperparameter " + str(grid_search.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(grid_search.cv_results_.get('mean_test_score')[0]))
grid_search.best_estimator_

training score for best hyperparameter 0.8616733184368125
test score for best hyperparameter 0.8634984485529856


## Trial 1

In [197]:
tuned_ST = LogisticRegression(C = 1, class_weight = 'balanced', solver = 'liblinear')
tuned_ST.fit(X_train, y_train)
tuned_ST.score(X_test, y_test)

0.8708133971291866

## Trial 2

In [200]:
student_shuffle1 = student_df.sample(frac = 1).reset_index(drop = True)
st_X1 = student_shuffle1.drop(['G3'], axis=1).select_dtypes(include='number').to_numpy()
st_y1 = student_shuffle1['G3'].apply(lambda x: 1 if x >= 10 else 0).to_numpy()
X_trainst1, X_testst1, y_trainst1, y_testst1 = train_test_split(st_X1, st_y1, test_size = 0.2, random_state = 42)

scaler1 = StandardScaler()
scaler.fit(X_trainst1)
X_trainst_new1 = scaler.transform(X_trainst1)

tuned_ST.fit(X_trainst_new1, y_trainst1)
tuned_ST.score(X_testst1, y_testst1)

0.7703349282296651

## Trial 3

In [201]:
student_shuffle2 = student_df.sample(frac = 1).reset_index(drop = True)
st_X2 = student_shuffle2.drop(['G3'], axis=1).select_dtypes(include='number').to_numpy()
st_y2 = student_shuffle2['G3'].apply(lambda x: 1 if x >= 10 else 0).to_numpy()
X_trainst, X_testst, y_trainst, y_testst = train_test_split(st_X2, st_y2, test_size = 0.2, random_state = 42)

scaler1 = StandardScaler()
scaler.fit(X_trainst)
X_trainst_new = scaler.transform(X_trainst)

tuned_ST.fit(X_trainst_new, y_trainst)
tuned_ST.score(X_testst, y_testst)

0.8181818181818182

In [204]:
avg_st_test_20 = (0.8708133971291866 + 0.7703349282296651 + 0.8181818181818182) / 3
avg_st_test_20

0.8197767145135567

In [205]:
(tuned_ST.score(X_trainst, y_trainst) + 
tuned_ST.score(X_trainst1, y_trainst1) + 
tuned_ST.score(X_train, y_train)) / 3



0.7756487025948106

## 50/50 split

In [209]:
from sklearn.metrics import classification_report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Set up hyperparameter grid
params = {'C': [0.001, 0.01, 0.1, 1, 10], 'solver': ['lbfgs', 'liblinear', 'newton-cg']}
classifier = LogisticRegression(class_weight='balanced')
grid_search = GridSearchCV(classifier, param_grid=params, cv=3, return_train_score=True)
grid_search.fit(X_train_scaled, y_train)
print("training score for best hyperparameter " + str(grid_search.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(grid_search.cv_results_.get('mean_test_score')[0]))
grid_search.best_estimator_

training score for best hyperparameter 0.8630268199233716
test score for best hyperparameter 0.8620689655172414


## Trial 1

In [210]:
tuned_ST = LogisticRegression(C = 1, class_weight = 'balanced', solver = 'liblinear')
tuned_ST.fit(X_train, y_train)
tuned_ST.score(X_test, y_test)

0.8812260536398467

## Trial 2

In [212]:
student_shuffle1 = student_df.sample(frac = 1).reset_index(drop = True)
st_X1 = student_shuffle1.drop(['G3'], axis=1).select_dtypes(include='number').to_numpy()
st_y1 = student_shuffle1['G3'].apply(lambda x: 1 if x >= 10 else 0).to_numpy()
X_trainst, X_testst, y_trainst, y_testst = train_test_split(st_X1, st_y1, test_size = 0.5, random_state = 42)

scaler1 = StandardScaler()
scaler.fit(X_trainst)
X_trainst_new = scaler.transform(X_trainst)

tuned_ST.fit(X_trainst_new, y_trainst)
tuned_ST.score(X_testst, y_testst)

0.7835249042145593

## Trial 3

In [214]:
student_shuffle2 = student_df.sample(frac = 1).reset_index(drop = True)
st_X2 = student_shuffle2.drop(['G3'], axis=1).select_dtypes(include='number').to_numpy()
st_y2 = student_shuffle2['G3'].apply(lambda x: 1 if x >= 10 else 0).to_numpy()
X_trainst1, X_testst1, y_trainst1, y_testst1 = train_test_split(st_X2, st_y2, test_size = 0.5, random_state = 42)

scaler1 = StandardScaler()
scaler.fit(X_trainst1)
X_trainst_new1 = scaler.transform(X_trainst1)

tuned_ST.fit(X_trainst_new1, y_trainst1)
tuned_ST.score(X_testst1, y_testst1)

0.7758620689655172

In [217]:
avg_50_log = (0.7758620689655172 + 0.7835249042145593 + 0.8812260536398467) / 3
avg_50_log

0.813537675606641

In [218]:
(tuned_ST.score(X_trainst, y_trainst) + 
tuned_ST.score(X_trainst1, y_trainst1) + 
tuned_ST.score(X_train, y_train)) / 3



0.7784163473818646

## 20/80 Split

In [222]:
from sklearn.metrics import classification_report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Set up hyperparameter grid
params = {'C': [0.001, 0.01, 0.1, 1, 10], 'solver': ['lbfgs', 'liblinear', 'newton-cg']}
classifier = LogisticRegression(class_weight='balanced')
grid_search = GridSearchCV(classifier, param_grid=params, cv=3, return_train_score=True)
grid_search.fit(X_train_scaled, y_train)
print("training score for best hyperparameter " + str(grid_search.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(grid_search.cv_results_.get('mean_test_score')[0]))
grid_search.best_estimator_

training score for best hyperparameter 0.8509366419907552
test score for best hyperparameter 0.851000690131125


## Trial 1

In [223]:
tuned_ST = LogisticRegression(C = 1, class_weight = 'balanced', solver = 'liblinear')
tuned_ST.fit(X_train, y_train)
tuned_ST.score(X_test, y_test)

0.881578947368421

## Trial 2

In [226]:
student_shuffle1 = student_df.sample(frac = 1).reset_index(drop = True)
st_X1 = student_shuffle1.drop(['G3'], axis=1).select_dtypes(include='number').to_numpy()
st_y1 = student_shuffle1['G3'].apply(lambda x: 1 if x >= 10 else 0).to_numpy()
X_trainst, X_testst, y_trainst, y_testst = train_test_split(st_X1, st_y1, test_size = 0.8, random_state = 42)

scaler1 = StandardScaler()
scaler.fit(X_trainst)
X_trainst_new = scaler.transform(X_trainst)

tuned_ST.fit(X_trainst_new, y_trainst)
tuned_ST.score(X_testst, y_testst)

0.784688995215311

## Trial 3

In [227]:
student_shuffle2 = student_df.sample(frac = 1).reset_index(drop = True)
st_X2 = student_shuffle2.drop(['G3'], axis=1).select_dtypes(include='number').to_numpy()
st_y2 = student_shuffle2['G3'].apply(lambda x: 1 if x >= 10 else 0).to_numpy()
X_trainst1, X_testst1, y_trainst1, y_testst1 = train_test_split(st_X2, st_y2, test_size = 0.8, random_state = 42)

scaler1 = StandardScaler()
scaler.fit(X_trainst1)
X_trainst_new1 = scaler.transform(X_trainst1)

tuned_ST.fit(X_trainst_new1, y_trainst1)
tuned_ST.score(X_testst1, y_testst1)

0.7906698564593302

In [228]:
avg_test_80 = (0.881578947368421 + 0.784688995215311 + 0.7906698564593302) / 3
avg_test_80

0.8189792663476873

In [229]:
(tuned_ST.score(X_trainst, y_trainst) + 
tuned_ST.score(X_trainst1, y_trainst1) + 
tuned_ST.score(X_train, y_train)) / 3



0.7628205128205128

## KNN with Student Data
## 80/20 split

In [295]:
student_shuffled = student_df.sample(frac = 1).reset_index(drop = True)
X = student_shuffled.drop(['G3'], axis=1).select_dtypes(include='number').to_numpy()
y = student_shuffled['G3'].apply(lambda x: 1 if x >= 10 else 0).to_numpy()

## Trial 1

In [296]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
params = {'n_neighbors': [3, 5, 7, 9, 11]}
# Step 5: Set up hyperparameter grid
classifierd = KNeighborsClassifier(weights = 'uniform')
grid_search = GridSearchCV(classifierd, param_grid=params, cv=3, return_train_score=True)
grid_search.fit(X_train, y_train)
print("training score for best hyperparameter " + str(grid_search.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(grid_search.cv_results_.get('mean_test_score')[0]))
grid_search.best_estimator_

training score for best hyperparameter 0.9401233053054864
test score for best hyperparameter 0.8922324505986609


In [297]:
tuned_KNN = KNeighborsClassifier(n_neighbors=3, weights='uniform')
tuned_KNN.fit(X_train, y_train)
tuned_KNN.score(X_test, y_test)

0.9138755980861244

## Trial 2

In [298]:
student_shuffled1 = student_df.sample(frac = 1).reset_index(drop = True)
st_X1 = student_shuffle1.drop(['G3'], axis=1).select_dtypes(include='number')
st_y = student_shuffled1['G3'].apply(lambda x: 1 if x >= 10 else 0)
X_trainst, X_testst, y_trainst, y_testst = train_test_split(st_X1, st_y1, test_size = 0.2, random_state = 42)
tuned_KNN.fit(X_trainst, y_trainst)
tuned_KNN.score(X_testst, y_testst)

0.9138755980861244

## Trial 3

In [299]:
student_shuffle2 = student_df.sample(frac = 1).reset_index(drop = True)
st_X2 = student_shuffle2.drop(['G3'], axis=1).select_dtypes(include='number')
st_y2 = student_shuffle2['G3'].apply(lambda x: 1 if x >= 10 else 0)
X_trainst1, X_testst1, y_trainst1, y_testst1 = train_test_split(st_X2, st_y2, test_size = 0.2, random_state = 42)
tuned_KNN.fit(X_trainst1, y_trainst1)
tuned_KNN.score(X_testst1, y_testst1)

0.8995215311004785

In [300]:
test_avg_knn_20 = (0.8995215311004785 + 0.9138755980861244 + 0.9138755980861244) / 3
test_avg_knn_20

0.9090909090909092

In [301]:
(tuned_ST.score(X_trainst, y_trainst) + 
tuned_ST.score(X_trainst1, y_trainst1) + 
tuned_ST.score(X_train, y_train)) / 3



0.7708582834331338

## 50/50 split

In [288]:
student_shuffled = student_df.sample(frac = 1).reset_index(drop = True)
X = student_shuffled.drop(['G3'], axis=1).select_dtypes(include='number').to_numpy()
y = student_shuffled['G3'].apply(lambda x: 1 if x >= 10 else 0).to_numpy()

## Trial 1

In [289]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
params = {'n_neighbors': [3, 5, 7, 9, 11]}
# Step 5: Set up hyperparameter grid
classifierd = KNeighborsClassifier(weights = 'uniform')
grid_search = GridSearchCV(classifierd, param_grid=params, cv=3, return_train_score=True)
grid_search.fit(X_train, y_train)
print("training score for best hyperparameter " + str(grid_search.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(grid_search.cv_results_.get('mean_test_score')[0]))
grid_search.best_estimator_

training score for best hyperparameter 0.9482758620689656
test score for best hyperparameter 0.9022988505747126


In [290]:
tuned_KNN = KNeighborsClassifier(n_neighbors=3, weights='uniform')
tuned_KNN.fit(X_train, y_train)
tuned_KNN.score(X_test, y_test)

0.9003831417624522

## Trial 2

In [291]:
student_shuffled1 = student_df.sample(frac = 1).reset_index(drop = True)
st_X1 = student_shuffle1.drop(['G3'], axis=1).select_dtypes(include='number')
st_y = student_shuffle1['G3'].apply(lambda x: 1 if x >= 10 else 0)
X_trainst, X_testst, y_trainst, y_testst = train_test_split(st_X1, st_y1, test_size = 0.5, random_state = 42)
tuned_KNN.fit(X_trainst, y_trainst)
tuned_KNN.score(X_testst, y_testst)

0.8946360153256705

## Trial 3

In [292]:
student_shuffle2 = student_df.sample(frac = 1).reset_index(drop = True)
st_X2 = student_shuffle2.drop(['G3'], axis=1).select_dtypes(include='number')
st_y2 = student_shuffle2['G3'].apply(lambda x: 1 if x >= 10 else 0)
X_trainst1, X_testst1, y_trainst1, y_testst1 = train_test_split(st_X2, st_y2, test_size = 0.5, random_state = 42)
tuned_KNN.fit(X_trainst1, y_trainst1)
tuned_KNN.score(X_testst1, y_testst1)

0.8984674329501916

In [293]:
test_avg_knn_50 = (0.8946360153256705 + 0.9003831417624522 + 0.8984674329501916) / 3
test_avg_knn_50

0.8978288633461048

In [294]:
(tuned_ST.score(X_trainst, y_trainst) + 
tuned_ST.score(X_trainst1, y_trainst1) + 
tuned_ST.score(X_train, y_train)) / 3



0.7841634738186462

## Split 20/80

In [245]:
student_shuffled = student_df.sample(frac = 1).reset_index(drop = True)
X = student_shuffle.drop(['G3'], axis=1).select_dtypes(include='number').to_numpy()
y = student_shuffle['G3'].apply(lambda x: 1 if x >= 10 else 0).to_numpy()

## Trial 1

In [246]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)
params = {'n_neighbors': [3, 5, 7, 9, 11]}
# Step 5: Set up hyperparameter grid
classifierd = KNeighborsClassifier(weights = 'uniform')
grid_search = GridSearchCV(classifierd, param_grid=params, cv=3, return_train_score=True)
grid_search.fit(X_train, y_train)
print("training score for best hyperparameter " + str(grid_search.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(grid_search.cv_results_.get('mean_test_score')[0]))
grid_search.best_estimator_

training score for best hyperparameter 0.949518645952803
test score for best hyperparameter 0.9038647342995169


In [247]:
tuned_KNN = KNeighborsClassifier(n_neighbors=3, weights='uniform')
tuned_KNN.fit(X_train, y_train)
tuned_KNN.score(X_test, y_test)

0.8947368421052632

## Trial 2

In [248]:
student_shuffled1 = student_df.sample(frac = 1).reset_index(drop = True)
st_X1 = student_shuffle1.drop(['G3'], axis=1).select_dtypes(include='number')
st_y = student_shuffle1['G3'].apply(lambda x: 1 if x >= 10 else 0)
X_trainst, X_testst, y_trainst, y_testst = train_test_split(st_X1, st_y1, test_size = 0.8, random_state = 42)
tuned_KNN.fit(X_trainst, y_trainst)
tuned_KNN.score(X_testst, y_testst)

0.8935406698564593

## Trial 3

In [249]:
student_shuffle2 = student_df.sample(frac = 1).reset_index(drop = True)
st_X2 = student_shuffle2.drop(['G3'], axis=1).select_dtypes(include='number')
st_y2 = student_shuffle2['G3'].apply(lambda x: 1 if x >= 10 else 0)
X_trainst1, X_testst1, y_trainst1, y_testst1 = train_test_split(st_X2, st_y2, test_size = 0.8, random_state = 42)
tuned_KNN.fit(X_trainst1, y_trainst1)
tuned_KNN.score(X_testst1, y_testst1)

0.8744019138755981

In [250]:
test_avg_knn_80 = (0.8947368421052632 + 0.8744019138755981 + 0.8935406698564593) / 3
test_avg_knn_80

0.8875598086124402

In [251]:
(tuned_ST.score(X_trainst, y_trainst) + 
tuned_ST.score(X_trainst1, y_trainst1) + 
tuned_ST.score(X_train, y_train)) / 3



0.7756410256410257

## SVM with student Data with 80/20, 50/50 and 20/80 splits
## 80/20 split

In [252]:
student_shuffled2 = student_df.sample(frac=1).reset_index(drop=True)
st_X2 = student_shuffled2.drop(['G3'], axis=1).select_dtypes(include='number').to_numpy()
st_y2 = student_shuffled2['G3'].apply(lambda x: 1 if x >= 10 else 0).to_numpy()

## Trial 1

In [253]:
X_trainst1, X_testst1, y_trainst1, y_testst1 = train_test_split(st_X2, st_y2, test_size=0.2, random_state=42)
params = {'C': [0.1, 1, 10, 100], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}
classifierSVM = SVC(class_weight = 'balanced')
class_hyper_tuneSVM = GridSearchCV(classifierSVM, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tuneSVM.fit(X_trainst1, y_trainst1)
print("training score for best hyperparameter " + str(class_hyper_tuneSVM.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuneSVM.cv_results_.get('mean_test_score')[0]))
class_hyper_tuneSVM.best_estimator_

training score for best hyperparameter 0.9119662976979278
test score for best hyperparameter 0.9077684089287711


In [254]:
tuned_param = SVC(class_weight = 'balanced', C = 0.1, kernel = 'linear')
tuned_param.fit(X_trainst1, y_trainst1)
tuned_param.score(X_testst1, y_testst1)

0.8660287081339713

## Trial 2

In [255]:
student_shuffled = student_df.sample(frac=1).reset_index(drop=True)
st_X = student_shuffled.drop(['G3'], axis=1).select_dtypes(include='number')
st_y = student_shuffled['G3'].apply(lambda x: 1 if x >= 10 else 0)
X_trainst, X_testst, y_trainst, y_testst = train_test_split(st_X, st_y, test_size=0.2, random_state=42)
tuned_param.fit(X_trainst, y_trainst)
tuned_param.score(X_testst, y_testst)

0.9043062200956937

## Trial 3

In [256]:
student_shuff = student_df.sample(frac=1).reset_index(drop=True)
st_X = student_shuff.drop(['G3'], axis=1).select_dtypes(include='number')
st_y = student_shuff['G3'].apply(lambda x: 1 if x >= 10 else 0)
X_train, X_test, y_train, y_test = train_test_split(st_X, st_y, test_size=0.2, random_state=42)
tuned_param.fit(X_train, y_train)
tuned_param.score(X_test, y_test)

0.9043062200956937

In [257]:
avg_svm_20 = (0.8660287081339713 + 0.9043062200956937 + 0.9043062200956937) / 3
avg_svm_20

0.8915470494417862

In [258]:
(tuned_param.score(X_trainst, y_trainst) + 
tuned_ST.score(X_trainst1, y_trainst1) + 
tuned_ST.score(X_train, y_train)) / 3



0.8207584830339322

## 50/50 split

In [281]:
student_shuffled = student_df.sample(frac=1).reset_index(drop=True)
st_X2 = student_shuffled.drop(['G3'], axis=1).select_dtypes(include='number').to_numpy()
st_y2 = student_shuffled['G3'].apply(lambda x: 1 if x >= 10 else 0).to_numpy()

## Trial 1

In [282]:
X_trainst1, X_testst1, y_trainst1, y_testst1 = train_test_split(st_X2, st_y2, test_size=0.5, random_state=42)
params = {'C': [0.1, 1, 10, 100], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}
classifierSVM = SVC(class_weight = 'balanced')
class_hyper_tuneSVM = GridSearchCV(classifierSVM, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tuneSVM.fit(X_trainst1, y_trainst1)
print("training score for best hyperparameter " + str(class_hyper_tuneSVM.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuneSVM.cv_results_.get('mean_test_score')[0]))
class_hyper_tuneSVM.best_estimator_

training score for best hyperparameter 0.9252873563218391
test score for best hyperparameter 0.8984674329501917


In [283]:
tuned_param = SVC(class_weight = 'balanced', C = 0.1, kernel = 'linear')
tuned_param.fit(X_trainst1, y_trainst1)
tuned_param.score(X_testst1, y_testst1)

0.8869731800766284

## Trial 2

In [284]:
student_shuffled = student_df.sample(frac=1).reset_index(drop=True)
st_X = student_shuffled.drop(['G3'], axis=1).select_dtypes(include='number')
st_y = student_shuffled['G3'].apply(lambda x: 1 if x >= 10 else 0)
X_trainst, X_testst, y_trainst, y_testst = train_test_split(st_X, st_y, test_size=0.5, random_state=42)
tuned_param.fit(X_trainst, y_trainst)
tuned_param.score(X_testst, y_testst)

0.9099616858237548

## Trial 3

In [285]:
student_shuff = student_df.sample(frac=1).reset_index(drop=True)
st_X = student_shuff.drop(['G3'], axis=1).select_dtypes(include='number')
st_y = student_shuff['G3'].apply(lambda x: 1 if x >= 10 else 0)
X_train, X_test, y_train, y_test = train_test_split(st_X, st_y, test_size=0.5, random_state=42)
tuned_param.fit(X_train, y_train)
tuned_param.score(X_test, y_test)

0.8888888888888888

In [286]:
avg_sum_50_svm = (0.8869731800766284 + 0.9099616858237548 + 0.8888888888888888) / 3
avg_sum_50_svm

0.8952745849297573

In [287]:
(tuned_param.score(X_trainst, y_trainst) + 
tuned_ST.score(X_trainst1, y_trainst1) + 
tuned_ST.score(X_train, y_train)) / 3



0.8154533844189017

## 20/80 split

In [274]:
student_shuff = student_df.sample(frac=1).reset_index(drop=True)
st_X2 = student_shuff.drop(['G3'], axis=1).select_dtypes(include='number').to_numpy()
st_y2 = student_shuff['G3'].apply(lambda x: 1 if x >= 10 else 0).to_numpy()

## Trial 1

In [275]:
X_trainst1, X_testst1, y_trainst1, y_testst1 = train_test_split(st_X2, st_y2, test_size=0.8, random_state=42)
params = {'C': [0.1, 1, 10, 100], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}
classifierSVM = SVC(class_weight = 'balanced')
class_hyper_tuneSVM = GridSearchCV(classifierSVM, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tuneSVM.fit(X_trainst1, y_trainst1)
print("training score for best hyperparameter " + str(class_hyper_tuneSVM.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuneSVM.cv_results_.get('mean_test_score')[0]))
class_hyper_tuneSVM.best_estimator_

training score for best hyperparameter 0.9207242901331109
test score for best hyperparameter 0.8848861283643892


In [276]:
tuned_param = SVC(class_weight = 'balanced', C = 0.1, kernel = 'linear')
tuned_param.fit(X_trainst1, y_trainst1)
tuned_param.score(X_testst1, y_testst1)

0.8899521531100478

## Trial 2

In [277]:
student_shuffled = student_df.sample(frac=1).reset_index(drop=True)
st_X = student_shuffled.drop(['G3'], axis=1).select_dtypes(include='number')
st_y = student_shuffled['G3'].apply(lambda x: 1 if x >= 10 else 0)
X_trainst, X_testst, y_trainst, y_testst = train_test_split(st_X, st_y, test_size=0.8, random_state=42)
tuned_param.fit(X_trainst, y_trainst)
tuned_param.score(X_testst, y_testst)

0.888755980861244

## Trial 3

In [278]:
student_shuff = student_df.sample(frac=1).reset_index(drop=True)
st_X = student_shuff.drop(['G3'], axis=1).select_dtypes(include='number')
st_y = student_shuff['G3'].apply(lambda x: 1 if x >= 10 else 0)
X_train, X_test, y_train, y_test = train_test_split(st_X, st_y, test_size=0.8, random_state=42)
tuned_param.fit(X_train, y_train)
tuned_param.score(X_test, y_test)

0.8875598086124402

In [279]:
avg_svm_80 = (0.888755980861244 + 0.8875598086124402 + 0.8995215311004785) / 3
avg_svm_80

0.8919457735247208

In [280]:
(tuned_param.score(X_trainst, y_trainst) + 
tuned_ST.score(X_trainst1, y_trainst1) + 
tuned_ST.score(X_train, y_train)) / 3



0.828525641025641