# Examples for Day 3

In [11]:
# initialization
%matplotlib inline
from warnings import filterwarnings
filterwarnings("ignore")

In [12]:
# %load ml31.py
# MLP Classification
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.neural_network import MLPClassifier
header = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=header)
array = df.values
X = array[:,:-1]
y = array[:,-1]
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(10, 10), random_state=42).fit(X_train, y_train)
print(f'Accuracy: {mlp.score(X_test, y_test):.2%}')


Accuracy: 71.35%


In [13]:
# Perform feature scaling
from sklearn.preprocessing import StandardScaler

scl = StandardScaler()
Xs_train = scl.fit_transform(X_train)
Xs_test = scl.transform(X_test)

mlp2 = MLPClassifier(hidden_layer_sizes=(10, 10), random_state=42).fit(Xs_train, y_train)
print(f'Accuracy: {mlp2.score(Xs_test, y_test):.2%}')

Accuracy: 75.00%


In [16]:
# %load ml32.py
# Hyperparameter tuning with grid search
from pandas import read_csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split as split, KFold
header = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=header)
X = df.drop(columns=['class'])
y = df['class']
X_train, X_test, y_train, y_test = split(X, y, random_state=42)
model1 = DecisionTreeClassifier(random_state=42)
model1.fit(X_train, y_train)
print(f'Train accuracy without tuning: {model1.score(X_train, y_train):.2%}')
print(f'Test accuracy without tuning: {model1.score(X_test, y_test):.2%}')
kf = KFold(n_splits=5, shuffle=True, random_state=42)
params = dict(criterion=['gini', 'entropy'], max_leaf_nodes=range(2, 21))
grid = GridSearchCV(DecisionTreeClassifier(), params, cv=kf, n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)
print(grid.best_params_)
model2 = DecisionTreeClassifier(**grid.best_params_, random_state=42).fit(X_train, y_train)
print(f'Train accuracy with tuning: {model2.score(X_train, y_train):.2%}')
print(f'Test accuracy with tuning: {model2.score(X_test, y_test):.2%}')

Train accuracy without tuning: 100.00%
Test accuracy without tuning: 70.83%
Fitting 5 folds for each of 38 candidates, totalling 190 fits
{'criterion': 'gini', 'max_leaf_nodes': 7}
Train accuracy with tuning: 79.51%
Test accuracy with tuning: 74.48%


In [18]:
# %load ml33.py
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
header = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=header)
array = df.values
X = array[:,:-1]
y = array[:,-1]
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)
pipe = Pipeline([('scaler', MinMaxScaler()), ('clf', SVC())])
pipe.fit(X_train, y_train)
print(f'Accuracy: {pipe.score(X_test, y_test):.2%}')

Accuracy: 73.96%


In [22]:
# Include one more step (feature selection) into the pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

pipe2 = Pipeline([('scaler', MinMaxScaler()), ('feature selection', SelectFromModel(RandomForestClassifier(), threshold='median')), ('clf', SVC())])
pipe2.fit(X_train, y_train)
print(f'Accuracy: {pipe2.score(X_test, y_test):.2%}')

Accuracy: 74.48%


In [23]:
pipe2

In [32]:
from sklearn.model_selection import train_test_split as split, KFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from pandas import read_csv

df = read_csv("data/titanic_processed.csv")
df.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived,Gender_female,Gender_male
0,3,22.0,1,0,7.25,0,False,True
1,1,38.0,1,0,71.2833,1,True,False
2,3,26.0,0,0,7.925,1,True,False
3,1,35.0,1,0,53.1,1,True,False
4,3,35.0,0,0,8.05,0,False,True


In [25]:
X = df.drop(columns=["Survived"])
y = df["Survived"]

X_train, X_test, y_train, y_test = split(X, y, stratify=y, test_size=0.25, random_state=42)

In [33]:
mypipe = Pipeline([('fs', None), ('selector', SelectKBest()),  ('clf', KNeighborsClassifier())])

params1 = {}
params1['fs'] = [None, MinMaxScaler(), StandardScaler(), RobustScaler()]
params1['selector__k'] = range(1, 9)
params1['clf'] = [KNeighborsClassifier()]
params1['clf__n_neighbors'] = range(3, 10, 2)
params1['clf__weights'] = ['uniform', 'distance']

params2 = {}
params2['fs'] = [None, MinMaxScaler(), StandardScaler(), RobustScaler()]
params2['selector__k'] = range(1, 9)
params2['clf'] = [DecisionTreeClassifier()]
params2['clf__criterion'] = ['gini', 'entropy']
params2['clf__max_leaf_nodes'] = range(2, 21)

params = [params1, params2]

gs = GridSearchCV(mypipe, params, cv=5, n_jobs=-1, verbose=2)
gs.fit(X_train, y_train)
print(gs.best_params_)

Fitting 5 folds for each of 1472 candidates, totalling 7360 fits
{'clf': KNeighborsClassifier(), 'clf__n_neighbors': 9, 'clf__weights': 'uniform', 'fs': StandardScaler(), 'selector__k': 7}


In [34]:
best_pipe = gs.best_estimator_
best_pipe.fit(X_train, y_train)

print(f"Best pipe accuracy: {best_pipe.score(X_test, y_test):.3%}")

Best pipe accuracy: 78.924%


In [35]:
knn = KNeighborsClassifier().fit(X_train, y_train)
print(f"knn accuracy: {knn.score(X_test, y_test):.3%}")

knn accuracy: 67.265%


In [36]:
best_pipe