# ML Cheatsheet

In [30]:
import numpy as np
from sklearn import model_selection
from sklearn import preprocessing

In [31]:
from sklearn import datasets
# Regression data: Boston house prices ; (506, 13).
# load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so on newer installs
# fall back to the bundled diabetes regression set ; (442, 10). Both expose .data / .target.
try:
    dt_bh = datasets.load_boston()
except (AttributeError, ImportError):    # the removed loader raises as soon as it is accessed
    dt_bh = datasets.load_diabetes()
dt_ir = datasets.load_iris()      # Iris classes 0, 1, 2 ; (150, 4) flowers. Samples [50:150] are harder to separate
dt_dg = datasets.load_digits()    # Digits 0, 1, 2, ..., 9 ; (1797, 64) 8x8 images

## Regression (Supervised Learning)

In [None]:
# Features and target of the regression dataset
X, y = dt_bh.data, dt_bh.target

# Hold out 30% of the samples for testing (fixed seed for a reproducible split)
X_tn, X_tt, y_tn, y_tt = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=27
)

# Standardization: the scaler is fitted on the training split only and then
# applied to both splits, so no test statistics leak into training
ss = preprocessing.StandardScaler()
print(ss)

ss.fit(X_tn)
X_tn, X_tt = ss.transform(X_tn), ss.transform(X_tt)

In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression

rgs_lin = LinearRegression()
print(rgs_lin)

# Fit on the training split, then report the score on train and test (one value per line)
rgs_lin.fit(X_tn, y_tn)
print(rgs_lin.score(X_tn, y_tn), rgs_lin.score(X_tt, y_tt), sep='\n')

# Optional inspection of the fitted model:
#rgs_lin.predict(X_tt)
#print(rgs_lin.coef_)
#print(rgs_lin.intercept_)

## Classification (Supervised Learning)

In [45]:
# Alternative dataset (digits) -- uncomment these two lines and comment out the iris pair below
#X = dt_dg.data
#y = dt_dg.target

# Iris features / labels; narrow the slices to work on a subset of the samples
X, y = dt_ir.data[:], dt_ir.target[:]

# Hold out 30% of the samples for testing (fixed seed for a reproducible split)
X_tn, X_tt, y_tn, y_tt = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=27
)

# Standardization: fit the scaler on the training split only, then apply it to both splits
ss = preprocessing.StandardScaler().fit(X_tn)
X_tn, X_tt = ss.transform(X_tn), ss.transform(X_tt)

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

clf_lr = LogisticRegression(random_state=27)
print(clf_lr)

# Train, then print the train score followed by the test score (one per line)
clf_lr.fit(X_tn, y_tn)
print(clf_lr.score(X_tn, y_tn), clf_lr.score(X_tt, y_tt), sep='\n')

# Optional inspection of the fitted model:
#print(clf_lr.coef_)
#print(clf_lr.intercept_)

In [None]:
# kNN
from sklearn.neighbors import KNeighborsClassifier

# 5 neighbours; p=2 selects the Euclidean form of the Minkowski distance
clf_knn = KNeighborsClassifier(n_neighbors=5, p=2)
print(clf_knn)

# Train, then print the train score followed by the test score (one per line)
clf_knn.fit(X_tn, y_tn)
print(clf_knn.score(X_tn, y_tn), clf_knn.score(X_tt, y_tt), sep='\n')

# Optional: predicted label next to the per-class probabilities for every test sample
#np.column_stack((clf_knn.predict(X_tt), clf_knn.predict_proba(X_tt)))

In [None]:
# SVM
from sklearn.svm import SVC

clf_svc = SVC(random_state=27)
print(clf_svc)

# Train, then print the train score followed by the test score (one per line)
clf_svc.fit(X_tn, y_tn)
print(clf_svc.score(X_tn, y_tn), clf_svc.score(X_tt, y_tt), sep='\n')

In [None]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB    # BernoulliNB is the alternative for binary features

clf_nb = GaussianNB()
print(clf_nb)

# Train, then print the train score followed by the test score (one per line)
clf_nb.fit(X_tn, y_tn)
print(clf_nb.score(X_tn, y_tn), clf_nb.score(X_tt, y_tt), sep='\n')

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Split quality measured with Gini impurity; criterion='entropy' uses information gain instead
clf_dt = DecisionTreeClassifier(criterion='gini', random_state=27)
print(clf_dt)

# Fitting is where the split conditions are identified
clf_dt.fit(X_tn, y_tn)
print(clf_dt.score(X_tn, y_tn), clf_dt.score(X_tt, y_tt), sep='\n')

In [None]:
# Random Forest: a collection of decision trees, each grown on a random subset of the
# training data (bagging) and of the features; the final prediction is a majority vote
from sklearn.ensemble import RandomForestClassifier

# n_estimators: number of trees (raise it when the number of features is higher)
clf_rf = RandomForestClassifier(n_estimators=10, random_state=27)
print(clf_rf)

# Fitting is where the split conditions of every tree are identified
clf_rf.fit(X_tn, y_tn)
print(clf_rf.score(X_tn, y_tn), clf_rf.score(X_tt, y_tt), sep='\n')

In [None]:
# Gradient Boosting (AdaBoostClassifier from the same sklearn.ensemble module is used the same way)
from sklearn.ensemble import GradientBoostingClassifier

clf_gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=5,
    random_state=27,
)
print(clf_gb)

# Train, then print the train score followed by the test score (one per line)
clf_gb.fit(X_tn, y_tn)
print(clf_gb.score(X_tn, y_tn), clf_gb.score(X_tt, y_tt), sep='\n')

In [47]:
# GridSearchCV : Exhaustive search of hyper-parameters for an estimator
from sklearn.metrics import classification_report

model = clf_knn    # estimator to tune; the keys of param_grid must be parameters of this estimator
param_grid = [{'n_neighbors': [3, 5, 10, 15], 'p': [1, 2]}]
# One full grid search is run per entry. 'precision', 'recall' and 'f1' assume a binary target;
# for a multiclass target use the averaged variants such as 'precision_macro' or 'f1_macro'.
scorings = ['accuracy']

for scoring in scorings:
    print('\n# Tuning hyper-parameters for %s' % scoring)
    # scoring is passed by keyword: current scikit-learn accepts only the estimator and
    # param_grid positionally, so a third positional argument raises a TypeError.
    gcv = model_selection.GridSearchCV(model, param_grid, scoring=scoring, cv=5, n_jobs=4, verbose=1)
    gcv.fit(X_tn, y_tn)

    # Mean cross-validated score and twice its standard deviation for every parameter combination
    means = gcv.cv_results_['mean_test_score']
    stds  = gcv.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, gcv.cv_results_['params']):
        print('%.3f (+/-%.3f) for %r' % (mean, std * 2, params))
    print('\n# Best parameters on development set:', gcv.best_params_)

    # gcv.predict uses the best estimator found; the held-out test split was not seen during the search
    print('\n# Scores computed on evaluation set:\n')
    print(classification_report(y_tt, gcv.predict(X_tt), digits=3))

# Shows the search object of the last scoring in the list
print(gcv)
#print(gcv.cv_results_)


# Tuning hyper-parameters for accuracy
Fitting 5 folds for each of 8 candidates, totalling 40 fits
0.962 (+/-0.099) for {'n_neighbors': 3, 'p': 1}
0.943 (+/-0.149) for {'n_neighbors': 3, 'p': 2}
0.962 (+/-0.099) for {'n_neighbors': 5, 'p': 1}
0.933 (+/-0.177) for {'n_neighbors': 5, 'p': 2}
0.971 (+/-0.080) for {'n_neighbors': 10, 'p': 1}
0.971 (+/-0.080) for {'n_neighbors': 10, 'p': 2}
0.962 (+/-0.075) for {'n_neighbors': 15, 'p': 1}
0.962 (+/-0.073) for {'n_neighbors': 15, 'p': 2}

# Best parameters on development set: {'n_neighbors': 10, 'p': 1}

# Scores computed on evaluation set:

             precision    recall  f1-score   support

          0      1.000     1.000     1.000        12
          1      0.875     0.875     0.875        16
          2      0.882     0.882     0.882        17

avg / total      0.911     0.911     0.911        45

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
        

[Parallel(n_jobs=4)]: Done  40 out of  40 | elapsed:    5.1s finished


## Clustering (Unsupervised Learning)

In [None]:
# Only the first 100 iris samples are used: they are the easier ones
X, y = dt_ir.data[:100], dt_ir.target[:100]

# Hold out 30% of the samples for testing (fixed seed for a reproducible split)
X_tn, X_tt, y_tn, y_tt = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=27
)

# Standardization: fit the scaler on the training split only, then apply it to both splits
ss = preprocessing.StandardScaler()
print(ss)

ss.fit(X_tn)
X_tn, X_tt = ss.transform(X_tn), ss.transform(X_tt)

In [None]:
# k-Means
from sklearn.cluster import KMeans

# n_init is pinned to 10 (the long-standing default): scikit-learn 1.4 changed the default to
# 'auto', so leaving it implicit gives version-dependent clusterings and a FutureWarning on 1.2/1.3.
clu_km = KMeans(n_clusters=2, n_init=10, random_state=27)
print(clu_km)

# Labels are not used for fitting; the last expression displays the cluster id assigned to each test sample
clu_km.fit(X_tn)
clu_km.predict(X_tt)

In [None]:
y_tt    # True labels of the test split: the cluster assignments predicted above should group the samples the same way (cluster ids need not equal the label values)

## Dimensionality Reduction (Unsupervised Learning)

In [None]:
# Full iris dataset: features and labels
X, y = dt_ir.data, dt_ir.target

# Hold out 30% of the samples for testing (fixed seed for a reproducible split)
X_tn, X_tt, y_tn, y_tt = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=27
)

# Standardization: fit the scaler on the training split only, then apply it to both splits
ss = preprocessing.StandardScaler()
print(ss)

ss.fit(X_tn)
X_tn, X_tt = ss.transform(X_tn), ss.transform(X_tt)

In [None]:
# PCA
from sklearn.decomposition import PCA

# Keep the 3 leading principal components
dr_pca = PCA(
    n_components=3,
    random_state=27,
)
print(dr_pca)

# The projection is learned on the training split and reused unchanged for the test split
X_tn_reduced = dr_pca.fit_transform(X_tn)
X_tt_reduced = dr_pca.transform(X_tt)

# Displayed as the cell result: (number of test samples, number of components)
X_tt_reduced.shape