# Examples for Day 3

In [1]:
# Notebook initialization.
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
from warnings import filterwarnings
# Silence every warning for the rest of the session. Note that this also hides
# library diagnostics (for example deprecation or convergence warnings).
filterwarnings("ignore")

In [3]:
# %load ml29.py
# Random Forest Classification
from pandas import read_csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split as split

# Column labels handed to read_csv via names=; 'class' is used as the target below.
header = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=header)

# Target first, then every remaining column as a feature.
y = df['class']
X = df.drop(columns=['class'])

# Hold out 25% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

# Forest whose trees are each capped at six leaves.
rfc = RandomForestClassifier(max_leaf_nodes=6, random_state=42)
rfc.fit(X_train, y_train)
print(f'Accuracy: {rfc.score(X_test, y_test):.2%}')


Accuracy: 75.00%


In [5]:
# %load ml30.py
# Gradient Boosting Tree Classification
from pandas import read_csv
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split as split

# Column labels handed to read_csv via names=; 'class' is used as the target below.
header = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=header)

# Features are every column except the target.
X, y = df.drop(columns=['class']), df['class']

# Hold out 25% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

# Boosted ensemble built from depth-2 trees.
gbc = GradientBoostingClassifier(max_depth=2, random_state=42)
gbc.fit(X_train, y_train)
print(f'Accuracy: {gbc.score(X_test, y_test):.2%}')

Accuracy: 74.48%


In [9]:
# Voting Classifier
from pandas import read_csv
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Column labels handed to read_csv via names=; 'class' is used as the target below.
header = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=header)

X, y = df.drop(columns=['class']), df['class']
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

# Four base models, each constructed with its default settings.
clf1 = LogisticRegression()
clf2 = GaussianNB()
clf3 = SVC()
clf4 = KNeighborsClassifier()

# voting='hard' combines the base models' predicted class labels by majority vote.
vtc = VotingClassifier(
    estimators=[('lgr', clf1), ('gnb', clf2), ('svc', clf3), ('knn', clf4)],
    voting='hard',
)
vtc.fit(X_train, y_train)
print(f"Accuracy: {vtc.score(X_test, y_test):.2%}")

Accuracy: 72.92%


In [11]:
# %load ml31.py
# MLP Classification
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.neural_network import MLPClassifier

# Column labels handed to read_csv via names=; 'class' is used as the target below.
header = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=header)

# Target first, then every remaining column as a feature.
y = df['class']
X = df.drop(columns=['class'])

# Hold out 25% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

# Network with two hidden layers of ten units each, trained on the unscaled features.
mlp = MLPClassifier(hidden_layer_sizes=(10, 10), random_state=42)
mlp.fit(X_train, y_train)
print(f'Accuracy: {mlp.score(X_test, y_test):.2%}')


Accuracy: 71.35%


In [12]:
# Perform feature scaling
from sklearn.preprocessing import MinMaxScaler

# Learn the scaling from the training split only, then apply it to both splits.
scaler = MinMaxScaler().fit(X_train)
Xs_train = scaler.transform(X_train)
Xs_test = scaler.transform(X_test)

# Same layer sizes and seed as `mlp`, but trained on the scaled features.
mlp2 = MLPClassifier(hidden_layer_sizes=(10, 10), random_state=42)
mlp2.fit(Xs_train, y_train)
print(f'Accuracy: {mlp2.score(Xs_test, y_test):.2%}')

Accuracy: 72.92%


In [15]:
# %load ml32.py
# Hyperparameter tuning with grid search
from pandas import read_csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split as split, KFold

# Column labels handed to read_csv via names=; 'class' is used as the target below.
header = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=header)
X = df.drop(columns=['class'])
y = df['class']
# test_size=0.25 is the library default; it is spelled out to match the other examples.
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

# Baseline: a seeded tree with default hyperparameters.
model1 = DecisionTreeClassifier(random_state=42)
model1.fit(X_train, y_train)
print(f'Accuracy without tuning: {model1.score(X_test, y_test):.2%}')

# Search both split criteria and 2..20 leaves using shuffled 5-fold cross-validation.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
params = dict(criterion=['gini', 'entropy'], max_leaf_nodes=range(2, 21))
# Seed the searched estimator as well. An unseeded tree breaks ties between
# equally good splits at random, so the chosen parameters could differ from
# run to run and would not match the seeded model scored below.
grid = GridSearchCV(DecisionTreeClassifier(random_state=42), params, cv=kf, n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)
print(grid.best_params_)

# With the default refit=True, best_estimator_ is already the seeded tree
# retrained on all of X_train with the winning parameters, so no extra fit is needed.
model2 = grid.best_estimator_
print(f'Accuracy with tuning: {model2.score(X_test, y_test):.2%}')


Accuracy without tuning: 70.83%
Fitting 5 folds for each of 38 candidates, totalling 190 fits
{'criterion': 'gini', 'max_leaf_nodes': 7}
Accuracy with tuning: 74.48%


In [17]:
# %load ml33.py
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

# Column labels handed to read_csv via names=; 'class' is used as the target below.
header = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv('data/pima-indians-diabetes.data.csv', names=header)

X, y = df.drop(columns=['class']), df['class']
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

# Chain min-max scaling and an SVM so both are fitted together on the training split.
pipe = Pipeline(steps=[('scaler', MinMaxScaler()), ('clf', SVC())])
pipe.fit(X_train, y_train)
print(f'Accuracy: {pipe.score(X_test, y_test):.2%}')

Accuracy: 73.96%


In [29]:
# Use GridSearchCV with pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler

# The 'scaler' step starts empty; the grid search substitutes each candidate into it.
pipe2 = Pipeline([('scaler', None), ('clf', SVC())])

# Candidates for the scaling step: no scaling, or one of three scalers.
params = {'scaler': [None, MinMaxScaler(), StandardScaler(), RobustScaler()]}

# Shuffled 5-fold cross-validation with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(estimator=pipe2, param_grid=params, cv=kf, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)
print(grid.best_params_)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
{'scaler': RobustScaler()}


In [30]:
# `grid` was built with the default refit=True, so best_estimator_ has already
# been retrained on all of X_train with the winning scaler. Fitting it again
# would only repeat that work, so it is scored directly.
best_pipe = grid.best_estimator_
print(f'Accuracy: {best_pipe.score(X_test, y_test):.2%}')

Accuracy: 74.48%


In [32]:
from pandas import read_csv
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV, train_test_split as split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Load the Titanic table and preview its first rows as the cell output.
df = read_csv("data/titanic_processed.csv")
df.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived,Gender_female,Gender_male
0,3,22.0,1,0,7.25,0,False,True
1,1,38.0,1,0,71.2833,1,True,False
2,3,26.0,0,0,7.925,1,True,False
3,1,35.0,1,0,53.1,1,True,False
4,3,35.0,0,0,8.05,0,False,True


In [33]:
# "Unnamed: 0" (visible in the df.head() preview) appears to be a row index that
# was saved into the CSV. It carries no information about the passenger, and its
# large numeric range would dominate unscaled distance-based models, so it is
# dropped from the features. errors="ignore" keeps this cell working if the
# column is already absent.
X = df.drop(columns=["Survived", "Unnamed: 0"], errors="ignore")
y = df["Survived"]
# Stratify on the target so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, stratify=y, random_state=42)

In [36]:
# Pipeline skeleton: optional scaler -> univariate feature selection -> classifier.
# The 'scl' and 'clf' steps are placeholders that the grid below overrides.
mypipe = Pipeline([('scl', None), ('fs', SelectKBest()), ('clf', KNeighborsClassifier())])

# Settings searched for both classifiers: the scaling choice and how many
# features SelectKBest keeps.
shared_grid = {
    'scl': [None, MinMaxScaler(), StandardScaler(), RobustScaler()],
    'fs__k': range(1, 8),
}

# Sub-grid 1: k-nearest neighbours with odd k from 3 to 9 and both weightings.
params1 = {
    **shared_grid,
    'clf': [KNeighborsClassifier()],
    'clf__n_neighbors': range(3, 10, 2),
    'clf__weights': ['uniform', 'distance'],
}

# Sub-grid 2: decision tree. The tree is seeded because an unseeded tree breaks
# ties between equally good splits at random, which would make the search
# result vary from run to run.
params2 = {
    **shared_grid,
    'clf': [DecisionTreeClassifier(random_state=42)],
    'clf__criterion': ['entropy', 'gini'],
    'clf__max_leaf_nodes': range(2, 21),
}

# A list of grids makes GridSearchCV search each dict separately, so the
# KNN-only and tree-only parameters are never combined.
params = [params1, params2]

grid = GridSearchCV(mypipe, params, cv=5, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)
print(grid.best_params_)

Fitting 5 folds for each of 1288 candidates, totalling 6440 fits
{'clf': KNeighborsClassifier(), 'clf__n_neighbors': 9, 'clf__weights': 'uniform', 'fs__k': 7, 'scl': StandardScaler()}


In [37]:
# `grid` was built with the default refit=True, so best_estimator_ has already
# been retrained on all of X_train with the winning configuration. Fitting it
# again would only repeat that work, so it is scored directly.
best_pipe = grid.best_estimator_
print(f"best pipe accuracy: {best_pipe.score(X_test, y_test):.2%}")

best pipe accuracy: 78.92%


In [38]:
# Show the selected pipeline as the cell output.
best_pipe

In [39]:
# Baseline for comparison: k-NN with default settings, fitted on the features
# as they are (no scaling or feature selection).
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print(f"Plain knn accuracy: {knn.score(X_test, y_test):.2%}")

Plain knn accuracy: 67.26%
