## ws01

In [None]:
import pandas as pd
# import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Column names for the CSV, which is read without a header row.
# 'class' is the label column used as the target further down.
cols = [
    'preg', 'plas', 'pres', 'skin', 'insu',
    'bmi', 'pedi', 'age', 'class',
]
df = pd.read_csv("data/pima-indians-diabetes.csv", header=None, names=cols)

# Quick sanity check: (rows, columns) followed by the first records.
print(df.shape)
df.head()

In [None]:
# Summary statistics of the numeric columns, rounded to two decimals.
df.describe().round(2)

In [None]:
# Drop the 'insu' column instead of imputing it.
# errors='ignore' keeps the cell re-runnable: a second execution no longer
# raises KeyError because the column is already gone.
df = df.drop(columns='insu', errors='ignore')

# A zero in these columns is treated as a missing measurement and replaced
# by the column median. The median is taken over the non-zero values only;
# including the zero placeholders would pull the imputed value down.
for col in ['bmi', 'plas', 'pres', 'skin']:
    observed = df.loc[df[col] != 0, col]
    df[col] = df[col].replace(0, observed.median())

In [None]:
# Re-check the summary statistics after the cleaning step.
df.describe().round(2)

In [None]:
# Separate the target column from the feature columns.
y = df['class']
X = df.drop(columns='class')

# Show the distinct label values present in the target.
print(y.unique())

## Decision Tree

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Unpruned tree (max_depth=None) using the entropy criterion;
# criterion='gini' is the alternative to try here.
# random_state makes the tree construction repeatable, so the scores below
# are identical from run to run.
model = DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=10)

# 10-fold cross-validation on the unscaled features.
cvs = cross_val_score(model, X, y, cv=10)

print('cross val scores {}'.format(cvs.round(2)))
print('Average (%) = {:.2f}' .format(cvs.mean() * 100))

In [None]:
# Display the estimator currently bound to `model`.
model

## ws02

In [None]:

from sklearn.preprocessing import MinMaxScaler as Scaler

# Cast every feature to float before scaling.
X = X.astype('float64')

# Fit the scaler on the features, then apply it to the same data.
# NOTE(review): the scaler is fit on the full dataset before any
# cross-validation, so every fold shares its statistics; a Pipeline would
# keep the scaling inside each fold. TODO confirm whether that matters here.
sc = Scaler()
X_sc = sc.fit(X).transform(X)

# Wrap the scaled array in a frame only to preview it with column names.
df_sc = pd.DataFrame(data=X_sc, columns=X.columns)
df_sc.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Candidate classifiers as (short label, estimator) pairs; the label is used
# in the printed score table and as the tick label in the comparison plot.
models = [
    ('LR', LogisticRegression(solver='lbfgs')),
    ('kNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('SVC', SVC(gamma='auto')),
    # The forest and the tree are randomized; seeding them makes the
    # comparison reproducible across runs.
    ('RFC', RandomForestClassifier(n_estimators=10, random_state=10)),
    ('DT', DecisionTreeClassifier(random_state=10)),
]


In [None]:
from sklearn.model_selection import cross_val_score

# Per-model arrays of fold scores and the matching labels, kept in the same
# order so they can be plotted side by side afterwards.
results, names = [], []

for name, model in models:
    # 10-fold cross-validation of this candidate on the scaled features.
    cvs = cross_val_score(model, X_sc, y, cv=10)
    names.append(name)
    results.append(cvs)
    # One line per model: label, mean score, (standard deviation).
    print('{:4} {:.3f} ({:.3f})'.format(name, cvs.mean(), cvs.std()) )

In [None]:
# Peek at the per-fold score arrays of the first two models.
results[:2]

In [None]:

# One box per model, showing the spread of its cross-validation fold scores.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)

plt.show()

## ws03

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the SVC; the search evaluates every combination.
param_grid = {
    'C': [1.0, 3.0, 5.0],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'shrinking': [True, False],
    'gamma': ['auto', 1, 0.1],
    'coef0': [0.0, 0.1, 0.5],
}

model = SVC()

# Exhaustive search scored by accuracy with 10-fold cross-validation.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    verbose=1,
)
grid_search.fit(X_sc, y)

# Best mean cross-validated accuracy found by the search.
grid_search.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score.
grid_search.best_params_

In [None]:
# Estimator built with the best parameters found by the search.
grid_search.best_estimator_

In [None]:
# Print the tuned values of C, kernel, gamma and coef0, one per line.
best_svc = grid_search.best_estimator_
print(best_svc.C, best_svc.kernel, best_svc.gamma, best_svc.coef0, sep='\n')

In [None]:
from sklearn.model_selection import cross_val_score

# Re-score the tuned SVC with its own 10-fold cross-validation run.
model = grid_search.best_estimator_
cvs = cross_val_score(estimator=model, X=X_sc, y=y, cv=10)

# Per-fold scores, then their mean expressed as a percentage.
print('cross val scores {}'.format(cvs.round(2)))
print('Average (%) = {:.2f}' .format(cvs.mean() * 100 ))

In [None]:
# Display the estimator currently bound to `model`.
model

## ws04

In [None]:
# SVC with every constructor argument spelled out explicitly.
# NOTE(review): these values look like they were copied from a search
# result; confirm they still match the current best parameters.
model = SVC(
    kernel='rbf',
    C=3.0,
    gamma=1,
    coef0=0.0,
    degree=3,
    shrinking=True,
    probability=False,
    tol=0.001,
    cache_size=200,
    class_weight=None,
    verbose=False,
    max_iter=-1,
    decision_function_shape='ovr',
    random_state=None,
)
# Train on the complete scaled dataset.
model.fit(X_sc, y)

In [None]:
# Two unseen samples, with values listed in the same order as the columns of X.
# Naming the columns after X ties each value to its feature: the scaler was
# fit on a named DataFrame, and an unnamed frame makes recent scikit-learn
# versions warn about missing feature names.
new_df = pd.DataFrame(
    [[5, 145, 70, 34, 32, .6, 49],
     [2, 84,  66, 28, 25, .4, 30]],
    columns=X.columns,
)

# Alternative single sample to try:
# new_df = pd.DataFrame([[0, 140, 41, 34, 42, 1.9, 35]], columns=X.columns)

# Reuse the already fitted scaler (transform only, no refit) so the new rows
# are scaled exactly like the training data.
new_df_sc = sc.transform(new_df)
predict = model.predict(new_df_sc)

print(predict)

## ws05

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform , randint 

# Search space for the randomized SVC search: C is drawn from a continuous
# distribution, the remaining parameters from discrete lists.
param_dist = {
    # scipy's uniform(loc, scale) samples from [loc, loc + scale].
    # scale=4.0 keeps C within [1.0, 5.0]; uniform(1.0, 5.0) would sample
    # values up to 6.0.
    'C': uniform(loc=1.0, scale=4.0),
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'shrinking': [True, False],
    'gamma': ['auto', 1, 0.1],
    'coef0': [0.0, 0.1, 0.5]
}

model = SVC()

# 100 sampled candidates, each scored by accuracy with 10-fold
# cross-validation; random_state fixes the sampled candidates.
rnd_search = RandomizedSearchCV(
    model, param_dist, n_iter=100, cv=10, scoring='accuracy',
    random_state=10, verbose=0)
rnd_search.fit(X_sc, y)

# Best mean cross-validated accuracy among the sampled candidates.
rnd_search.best_score_

In [None]:
# Sampled parameter combination that achieved the best cross-validated score.
rnd_search.best_params_

In [None]:
# Estimator built with the best sampled parameters.
rnd_search.best_estimator_

In [None]:
from sklearn.model_selection import cross_val_score

# Score the estimator found by the randomized search in this section.
# Taking grid_search.best_estimator_ here would only re-evaluate the earlier
# grid-search winner and never report on the randomized search.
model = rnd_search.best_estimator_

cvs = cross_val_score(model, X_sc, y, cv=10)

# Per-fold scores, then their mean expressed as a percentage.
print('cross val scores {}'.format(cvs.round(2)))
print('Average (%) = {:.2f}' .format(cvs.mean() * 100 ))

## ws06

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the decision tree; every combination is evaluated.
param_grid = {
    'criterion': ['gini','entropy'],
    'max_depth': [2, 3, 4, 5],
    'min_samples_split': [2, 3, 4, 5]
}

# Seeded so that repeated runs build the same trees and therefore report the
# same best parameters.
model = DecisionTreeClassifier(random_state=10)

# Exhaustive search scored by accuracy with 10-fold cross-validation.
grid_search = GridSearchCV(
    model, param_grid, cv=10, scoring='accuracy', verbose=1)
grid_search.fit(X_sc, y)

# Best mean cross-validated accuracy found by the search.
grid_search.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score.
grid_search.best_params_

In [None]:
# Estimator built with the best parameters found by the search.
grid_search.best_estimator_

In [None]:
# ws07
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform , randint

# Search space for the randomized decision-tree search.
param_dist = {
    'criterion': ['gini','entropy'],
    # scipy's randint(low, high) excludes `high`, so high=6 is required for
    # the value 5 to be sampled (range 2..5 inclusive).
    'max_depth':  randint(2, 6),
    'min_samples_split': randint(2, 6)
}

# Seeded so that repeated runs build the same trees for a given candidate.
model = DecisionTreeClassifier(random_state=10)

# n_iter is left at its default; each sampled candidate is scored by accuracy
# with 10-fold cross-validation, and random_state fixes the sampled candidates.
rnd_search = RandomizedSearchCV(
    model, param_dist, cv=10, scoring='accuracy',
    random_state=10, verbose=1)
rnd_search.fit(X_sc, y)

# Best mean cross-validated accuracy among the sampled candidates.
rnd_search.best_score_

In [None]:
# Sampled parameter combination that achieved the best cross-validated score.
rnd_search.best_params_

In [None]:
# Estimator built with the best sampled parameters.
rnd_search.best_estimator_