# ML Cheatsheet

In [1]:
import numpy as np
from sklearn import model_selection
from sklearn import preprocessing

In [2]:
from sklearn import datasets

# Regression data. `load_boston` was deprecated in scikit-learn 1.0 and removed
# in 1.2 (accessing it raises ImportError there), so newer installations fall
# back to the bundled diabetes data, which is also a plain regression task.
try:
    dt_bh = datasets.load_boston()      # Boston house price ; (506, 13)
except (ImportError, AttributeError):
    dt_bh = datasets.load_diabetes()    # Diabetes progression ; (442, 10)
    print('load_boston is unavailable in this scikit-learn; using load_diabetes instead')

# Classification / clustering data.
dt_ir = datasets.load_iris()      # Iris 0, 1, 2 ; (150, 4) flowers. Rows [50:150] are the more difficult ones to separate
dt_dg = datasets.load_digits()    # Digits 0, 1, 2, ..., 9 ; (1797, 64), each row a flattened 8x8 image

## Regression (Supervised Learning)

In [3]:
# Features and target of the regression data set
X, y = dt_bh.data, dt_bh.target

# Train test split: 30% of the samples are held out, with a fixed seed
X_tn, X_tt, y_tn, y_tt = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=27)

# Standardization: the scaler is fitted on the training split only, then the
# same fitted transform is applied to both splits.
ss = preprocessing.StandardScaler()
print(ss)

ss.fit(X_tn)
X_tn = ss.transform(X_tn)
X_tt = ss.transform(X_tt)

StandardScaler(copy=True, with_mean=True, with_std=True)


In [4]:
# Linear Regression
from sklearn.linear_model import LinearRegression

rgs_lin = LinearRegression()
print(rgs_lin)

rgs_lin.fit(X_tn, y_tn)
# score() of a regressor is R^2: training split first, test split second
print(rgs_lin.score(X_tn, y_tn), rgs_lin.score(X_tt, y_tt), sep='\n')

# Handy after fitting:
#   rgs_lin.predict(X_tt)    predictions for the test split
#   rgs_lin.coef_            fitted coefficients
#   rgs_lin.intercept_       fitted intercept

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
0.746428671695
0.708936259481


## Classification (Supervised Learning)

In [5]:
# Features and labels of the digits data set
X, y = dt_dg.data, dt_dg.target

# Train test split: 30% of the samples are held out, with a fixed seed
X_tn, X_tt, y_tn, y_tt = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=27)

# Standardization: fit on the training split only, then transform both splits
ss = preprocessing.StandardScaler()

ss.fit(X_tn)
X_tn = ss.transform(X_tn)
X_tt = ss.transform(X_tt)

In [6]:
# GridSearchCV : exhaustive, cross-validated search over a grid of hyper-parameters
from sklearn.metrics import classification_report

# kNN
from sklearn.neighbors import KNeighborsClassifier
clf_knn = KNeighborsClassifier()

# Candidates: 4 neighbourhood sizes x 2 Minkowski powers (p=1 Manhattan, p=2 Euclidean)
param_grid = [{'n_neighbors': [3, 5, 10, 15], 'p': [1, 2]}]
# Other scorers can be appended; for multi-class targets use the averaged
# variants such as 'precision_macro' / 'recall_macro'.
scores = ['accuracy']
for score in scores:
    print('\n# Tuning hyper-parameters for %s' % score)
    gs = model_selection.GridSearchCV(
        clf_knn, param_grid, scoring=score, cv=5, n_jobs=4, verbose=1)
    gs.fit(X_tn, y_tn)

    # One line per candidate: mean CV score with a +/- 2 std band
    results = gs.cv_results_
    means, stds = results['mean_test_score'], results['std_test_score']
    for mean, std, params in zip(means, stds, results['params']):
        print('%0.3f (+/-%0.3f) for %r' % (mean, std * 2, params))
    print('\n# Best parameters on development set:', gs.best_params_)

    # Held-out evaluation; predict() goes through the refitted best estimator
    print('\n# Scores computed on evaluation set:\n')
    y_pred = gs.predict(X_tt)
    print(classification_report(y_tt, y_pred, digits=3))

print(gs)
#print(gs.cv_results_)


# Tuning hyper-parameters for accuracy
Fitting 5 folds for each of 8 candidates, totalling 40 fits
0.966 (+/-0.024) for {'n_neighbors': 3, 'p': 1}
0.971 (+/-0.012) for {'n_neighbors': 3, 'p': 2}
0.968 (+/-0.027) for {'n_neighbors': 5, 'p': 1}
0.971 (+/-0.013) for {'n_neighbors': 5, 'p': 2}
0.963 (+/-0.016) for {'n_neighbors': 10, 'p': 1}
0.966 (+/-0.013) for {'n_neighbors': 10, 'p': 2}
0.951 (+/-0.015) for {'n_neighbors': 15, 'p': 1}
0.955 (+/-0.013) for {'n_neighbors': 15, 'p': 2}

# Best parameters on development set: {'n_neighbors': 3, 'p': 2}

# Scores computed on evaluation set:

             precision    recall  f1-score   support

          0      1.000     1.000     1.000        48
          1      0.945     1.000     0.972        52
          2      0.981     0.964     0.972        55
          3      0.980     0.962     0.971        52
          4      0.981     0.930     0.955        57
          5      1.000     0.982     0.991        56
          6      0.984     1.000   

[Parallel(n_jobs=4)]: Done  40 out of  40 | elapsed:    2.5s finished


In [7]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

clf_lr = LogisticRegression(random_state=27)
print(clf_lr)

clf_lr.fit(X_tn, y_tn)
# Mean accuracy: training split first, test split second
print(clf_lr.score(X_tn, y_tn), clf_lr.score(X_tt, y_tt), sep='\n')

# Fitted parameters, if needed: clf_lr.coef_ and clf_lr.intercept_

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=27, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.989657915672
0.966666666667


In [8]:
# kNN
from sklearn.neighbors import KNeighborsClassifier

# p=2 selects the Euclidean form of the Minkowski metric
clf_knn = KNeighborsClassifier(n_neighbors=5, p=2)
print(clf_knn)

clf_knn.fit(X_tn, y_tn)
# Mean accuracy: training split first, test split second
print(clf_knn.score(X_tn, y_tn), clf_knn.score(X_tt, y_tt), sep='\n')

# Predicted class next to the per-class probabilities:
#   np.column_stack((clf_knn.predict(X_tt), clf_knn.predict_proba(X_tt)))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.982498011138
0.975925925926


In [9]:
# SVM
from sklearn.svm import SVC

clf_svc = SVC(random_state=27)
print(clf_svc)

clf_svc.fit(X_tn, y_tn)
# Mean accuracy: training split first, test split second
print(clf_svc.score(X_tn, y_tn), clf_svc.score(X_tt, y_tt), sep='\n')

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=27, shrinking=True,
  tol=0.001, verbose=False)
0.997613365155
0.983333333333


In [10]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB    # or BernoulliNB

clf_nb = GaussianNB()
print(clf_nb)

clf_nb.fit(X_tn, y_tn)
# Mean accuracy: training split first, test split second
print(clf_nb.score(X_tn, y_tn), clf_nb.score(X_tt, y_tt), sep='\n')

GaussianNB(priors=None)
0.826571201273
0.824074074074


In [11]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# criterion='gini' here; criterion='entropy' (info gain) is the alternative
clf_dt = DecisionTreeClassifier(criterion='gini', random_state=27)
print(clf_dt)

clf_dt.fit(X_tn, y_tn)    # fitting learns the split conditions of the tree
# Mean accuracy: training split first, test split second
print(clf_dt.score(X_tn, y_tn), clf_dt.score(X_tt, y_tt), sep='\n')

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=27, splitter='best')
1.0
0.85


In [12]:
# Random Forest: a collection of decision trees, each using a random subset of
# the training data (bagging) and of the features --> majority vote
from sklearn.ensemble import RandomForestClassifier

# n_estimators: number of trees (use more when there are more features)
clf_rf = RandomForestClassifier(n_estimators=10, random_state=27)
print(clf_rf)

clf_rf.fit(X_tn, y_tn)    # every tree learns its own split conditions
# Mean accuracy: training split first, test split second
print(clf_rf.score(X_tn, y_tn), clf_rf.score(X_tt, y_tt), sep='\n')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=27,
            verbose=0, warm_start=False)
0.999204455052
0.951851851852


In [13]:
# Gradient Boosting (AdaBoost is the sibling sklearn.ensemble.AdaBoostClassifier; not used here)
from sklearn.ensemble import GradientBoostingClassifier

clf_gb = GradientBoostingClassifier(
    n_estimators=100, learning_rate=1.0, max_depth=5, random_state=27)
print(clf_gb)

clf_gb.fit(X_tn, y_tn)
# Mean accuracy: training split first, test split second
print(clf_gb.score(X_tn, y_tn), clf_gb.score(X_tt, y_tt), sep='\n')

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=27,
              subsample=1.0, verbose=0, warm_start=False)
1.0
0.92962962963


## Clustering (Unsupervised Learning)

In [14]:
# Only the first 100 iris samples are used; they are the easier ones to separate
X, y = dt_ir.data[:100], dt_ir.target[:100]

# Train test split: 30% of the samples are held out, with a fixed seed
X_tn, X_tt, y_tn, y_tt = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=27)

# Standardization: fit on the training split only, then transform both splits
ss = preprocessing.StandardScaler()
print(ss)

ss.fit(X_tn)
X_tn = ss.transform(X_tn)
X_tt = ss.transform(X_tt)

StandardScaler(copy=True, with_mean=True, with_std=True)


In [15]:
# k-Means
from sklearn.cluster import KMeans

clu_km = KMeans(n_clusters=2, random_state=27)
print(clu_km)

# Labels are never passed in: the clusters are learned from the training
# features alone, and the cell displays the cluster index of each test sample.
clu_km.fit(X_tn).predict(X_tt)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=27, tol=0.0001, verbose=0)


array([1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 0])

In [16]:
y_tt    # True labels for comparison: the clusters predicted above should group the samples the same way (cluster ids are arbitrary, so 0/1 may come out swapped)

array([1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 0])

## Dimensionality Reduction (Unsupervised Learning)

In [17]:
# Full iris data set this time
X, y = dt_ir.data, dt_ir.target

# Train test split: 30% of the samples are held out, with a fixed seed
X_tn, X_tt, y_tn, y_tt = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=27)

# Standardization: fit on the training split only, then transform both splits
ss = preprocessing.StandardScaler()
print(ss)

ss.fit(X_tn)
X_tn = ss.transform(X_tn)
X_tt = ss.transform(X_tt)

StandardScaler(copy=True, with_mean=True, with_std=True)


In [18]:
# PCA
from sklearn.decomposition import PCA

# Keep 3 components
dr_pca = PCA(n_components=3, random_state=27)
print(dr_pca)

# The projection is learned on the training split and reused for the test split
X_tn_reduced = dr_pca.fit_transform(X_tn)
X_tt_reduced = dr_pca.transform(X_tt)

# Displayed as the cell output: (number of test samples, number of components)
X_tt_reduced.shape

PCA(copy=True, iterated_power='auto', n_components=3, random_state=27,
  svd_solver='auto', tol=0.0, whiten=False)


(45, 3)