# ML Cheatsheet

In [1]:
import numpy as np
from sklearn import model_selection, preprocessing, metrics

In [2]:
from sklearn import datasets

# Boston house price (506, 13). `load_boston` was deprecated in scikit-learn 1.0
# and removed in 1.2, where accessing it raises ImportError. Fall back to the
# bundled diabetes regression set (442, 10) so the notebook still runs from a
# fresh kernel; the regression scores recorded in this notebook were produced
# with the Boston data and will differ when the fallback is used.
try:
    dt_bh = datasets.load_boston()
except (ImportError, AttributeError):
    dt_bh = datasets.load_diabetes()
dt_ir = datasets.load_iris()      # Iris {0, 1, 2} (150, 4) flowers. The latter [50:150] more difficult
dt_dg = datasets.load_digits()    # Digits {0, 1, ..., 9} (1797, 64) 8x8 images

## I. Supervised Learning

In [3]:
# Demonstrate the Supervised Learning model
def DemoSL(model, X_train=None, y_train=None, X_test=None, y_test=None):
    """Fit ``model`` and print it, followed by its train and test scores.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``
        The estimator to demonstrate. It is fitted in place.
    X_train, y_train, X_test, y_test : optional
        Data to fit and score on. Any argument left as ``None`` falls back to
        the notebook-level ``X_tn`` / ``y_tn`` / ``X_tt`` / ``y_tt`` set by the
        most recently executed "Train test split" cell, so existing
        ``DemoSL(model)`` calls behave exactly as before.

    Prints three lines: the model (before fitting), the score on the training
    data and the score on the test data. Returns ``None`` so that a trailing
    call in a cell does not produce an ``Out[...]`` value.
    """
    # Resolve the globals lazily: when all data is passed explicitly the
    # function no longer depends on hidden notebook state.
    if X_train is None:
        X_train = X_tn
    if y_train is None:
        y_train = y_tn
    if X_test is None:
        X_test = X_tt
    if y_test is None:
        y_test = y_tt

    print(model)
    model.fit(X_train, y_train)
    print(model.score(X_train, y_train))
    print(model.score(X_test, y_test))

### I.1. Regression

In [4]:
# Features and target of the house-price regression data
X, y = dt_bh.data, dt_bh.target

# Hold out 100 samples for testing (an integer test_size is a sample count)
X_tn, X_tt, y_tn, y_tt = model_selection.train_test_split(
    X, y, test_size=100, random_state=27
)

# Standardize: the scaler learns its statistics from the training split only
ss = preprocessing.StandardScaler().fit(X_tn)
print(ss)

# Apply the same fitted scaler to both splits
X_tn, X_tt = ss.transform(X_tn), ss.transform(X_tt)

StandardScaler(copy=True, with_mean=True, with_std=True)


In [5]:
# Linear Regression (ordinary least squares)
from sklearn.linear_model import LinearRegression
rgs_lin = LinearRegression()
DemoSL(rgs_lin)
# Note that the score is not accuracy (percentage): for regressors it is the
# coefficient of determination R^2 (1.0 is a perfect fit).

# Uncomment to inspect predictions and the fitted parameters:
#rgs_lin.predict(X_tt)
#print(rgs_lin.coef_)
#print(rgs_lin.intercept_)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
0.747301733399
0.70416565463


### I.2. Classification

In [6]:
# Features and class labels of the digits data
X, y = dt_dg.data, dt_dg.target

# Hold out 30% of the samples for testing
X_tn, X_tt, y_tn, y_tt = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=27
)

# Standardize: fit the scaler on the training split only, then apply it to both splits
ss = preprocessing.StandardScaler().fit(X_tn)
X_tn, X_tt = ss.transform(X_tn), ss.transform(X_tt)

In [7]:
# GridSearchCV : Exhaustive search of hyper-parameters for an estimator
def gsCV(model, param_grid, scorings):
    """Grid-search ``model`` over ``param_grid`` once per metric in ``scorings``.

    For every scoring name this runs a 5-fold ``GridSearchCV`` (4 parallel
    jobs) on the notebook-level training split ``X_tn`` / ``y_tn`` and prints:

    * the mean cross-validation score (+/- two standard deviations) of each
      parameter combination,
    * the best parameter combination,
    * a classification report of ``gcv.predict`` on the held-out split
      ``X_tt`` / ``y_tt``.

    After the loop the last ``GridSearchCV`` object is printed. Nothing is
    printed, and no error is raised, when ``scorings`` is empty.

    Parameters
    ----------
    model : estimator to tune.
    param_grid : dict or list of dicts, passed through to ``GridSearchCV``.
    scorings : iterable of scoring names, e.g. ``['accuracy']``.

    Returns ``None``.
    """
    gcv = None    # stays None when `scorings` is empty

    for scoring in scorings:
        print('\n# Tuning hyper-parameters for %s' % scoring)
        # `scoring` must be passed by keyword: in current scikit-learn every
        # GridSearchCV argument after `param_grid` is keyword-only, so the
        # positional form raises TypeError.
        gcv = model_selection.GridSearchCV(model, param_grid, scoring=scoring, cv=5, n_jobs=4, verbose=1)
        gcv.fit(X_tn, y_tn)

        results = gcv.cv_results_
        for mean, std, params in zip(results['mean_test_score'], results['std_test_score'], results['params']):
            print('%.3f (+/-%.3f) for %r' % (mean, std * 2, params))
        print('\n# Best parameters on development set:', gcv.best_params_)

        print('\n# Scores computed on evaluation set:\n')
        print(metrics.classification_report(y_tt, gcv.predict(X_tt), digits=3))

    if gcv is not None:
        print(gcv)
    #print(gcv.cv_results_)

# For multiclass targets use the averaged scorer names, e.g. 'precision_macro',
# 'recall_macro', 'f1_macro'; plain 'precision' / 'recall' / 'f1' are binary-only.
scorings = ['accuracy']

In [8]:
# Logistic Regression classifier (random_state fixed so the run is repeatable)
from sklearn.linear_model import LogisticRegression
clf_log = LogisticRegression(random_state=27)
DemoSL(clf_log)    # prints the model, then its train and test scores

# Uncomment to inspect the fitted parameters:
#print(clf_log.coef_)
#print(clf_log.intercept_)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=27, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.989657915672
0.966666666667


In [9]:
# k-nearest neighbours (instance-based learning): 5 neighbours, Minkowski power p=2
from sklearn.neighbors import KNeighborsClassifier

clf_knn = KNeighborsClassifier(p=2, n_neighbors=5)
DemoSL(clf_knn)

# Uncomment to show each predicted label next to the rounded class probabilities:
#np.column_stack((clf_knn.predict(X_tt), np.round(clf_knn.predict_proba(X_tt), 3)))

# Candidate values for the optional grid search below
param_grid = [dict(n_neighbors=[3, 5, 10, 15], p=[1, 2])]
#gsCV(clf_knn, param_grid, scorings)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.982498011138
0.975925925926


In [10]:
# Support vector classifier
from sklearn.svm import SVC

# gamma applies to the 'rbf', 'poly' and 'sigmoid' kernels
clf_svc = SVC(C=1, kernel='rbf', gamma=0.03, random_state=27)
DemoSL(clf_svc)

# Candidate values for the optional grid search below
param_grid = [dict(kernel=['rbf', 'linear', 'poly'], C=[3, 10, 20])]
#gsCV(clf_svc, param_grid, scorings)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.03, kernel='rbf',
  max_iter=-1, probability=False, random_state=27, shrinking=True,
  tol=0.001, verbose=False)
0.998408910103
0.985185185185


In [11]:
# Neural network: multi-layer perceptron with hidden layers of 100, 50 and 20 units
from sklearn.neural_network import MLPClassifier

clf_mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50, 20),
    alpha=0.05,
    learning_rate_init=0.05,
    verbose=1,    # print the training loss of every iteration
    random_state=27,
)
DemoSL(clf_mlp)

MLPClassifier(activation='relu', alpha=0.05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 50, 20), learning_rate='constant',
       learning_rate_init=0.05, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=27, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=1,
       warm_start=False)
Iteration 1, loss = 1.78878120
Iteration 2, loss = 0.66641602
Iteration 3, loss = 0.40429631
Iteration 4, loss = 0.25460711
Iteration 5, loss = 0.23522668
Iteration 6, loss = 0.21030993
Iteration 7, loss = 0.17143242
Iteration 8, loss = 0.18908945
Iteration 9, loss = 0.22671110
Iteration 10, loss = 0.19936339
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
0.989657915672
0.959259259259


In [12]:
# Naive Bayes (Gaussian variant, with default settings)
from sklearn.naive_bayes import GaussianNB    # or BernoulliNB
clf_nb = GaussianNB()
DemoSL(clf_nb)    # prints the model, then its train and test scores

GaussianNB(priors=None)
0.826571201273
0.824074074074


In [13]:
# Gaussian process classifier with a scaled RBF kernel
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

clf_gp = GaussianProcessClassifier(
    kernel=1.0 * RBF(1.0),
    warm_start=True,
    max_iter_predict=2,
    n_jobs=4,
    random_state=27,
)
# Left disabled: fitting takes too long on dt_dg, although it is quick on dt_ir
#DemoSL(clf_gp)

In [14]:
# QDA (Quadratic Discriminant Analysis) with default settings
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clf_qda = QuadraticDiscriminantAnalysis()
DemoSL(clf_qda)    # prints the model, then its train and test scores

QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariances=False, tol=0.0001)
0.936356404137
0.833333333333




In [15]:
# Decision tree classifier, depth capped at 7
from sklearn.tree import DecisionTreeClassifier

# criterion is either 'gini' or 'entropy' (information gain)
clf_dt = DecisionTreeClassifier(criterion='entropy', max_depth=7, random_state=27)
DemoSL(clf_dt)

# Candidate values for the optional grid search below
param_grid = [dict(max_depth=[10, 20, 40], max_features=[32, None], criterion=['gini', 'entropy'])]
#gsCV(clf_dt, param_grid, scorings)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=7,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=27, splitter='best')
0.965791567224
0.874074074074


In [16]:
# Random Forest: an ensemble of decision trees, each grown on a random subset of
# the training data (bagging) and of the features; the trees vote on the class
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier(max_depth=10, n_estimators=300, random_state=27)
DemoSL(clf_rf)

# Candidate values for the optional grid search below
param_grid = [dict(max_depth=[7, 10, 15], n_estimators=[50, 100, 300])]
#gsCV(clf_rf, param_grid, scorings)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=27,
            verbose=0, warm_start=False)
1.0
0.977777777778


In [17]:
# Ada Boost: 300 boosting rounds with a small learning rate of 0.01
from sklearn.ensemble import AdaBoostClassifier
clf_ab = AdaBoostClassifier(n_estimators=300, learning_rate=0.01, random_state=27)
DemoSL(clf_ab)    # prints the model, then its train and test scores

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.01, n_estimators=300, random_state=27)
0.768496420048
0.733333333333


In [18]:
# Gradient Boosting: 100 boosting stages of depth-5 trees, learning rate 0.5
from sklearn.ensemble import GradientBoostingClassifier
clf_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=5, random_state=27)
DemoSL(clf_gb)    # prints the model, then its train and test scores

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.5, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=27,
              subsample=1.0, verbose=0, warm_start=False)
1.0
0.95


## II. Unsupervised Learning

### II.1. Clustering

In [19]:
# Restrict Iris to its first 100 samples, which are the easier ones to separate
X, y = dt_ir.data[:100], dt_ir.target[:100]

# Hold out 30% of the samples for testing
X_tn, X_tt, y_tn, y_tt = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=27
)

# Standardize: fit the scaler on the training split only, then apply it to both splits
ss = preprocessing.StandardScaler().fit(X_tn)
X_tn, X_tt = ss.transform(X_tn), ss.transform(X_tt)

In [20]:
# k-Means with two clusters
from sklearn.cluster import KMeans

clu_km = KMeans(random_state=27, n_clusters=2)
print(clu_km)

# Fit on the training split (labels are not used), then show the cluster
# assigned to every test sample
clu_km.fit(X_tn).predict(X_tt)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=27, tol=0.0001, verbose=0)


array([1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 0])

In [21]:
y_tt    # True labels, to compare with the k-Means prediction above. Cluster ids are arbitrary, so a correct clustering may also show 0 and 1 swapped

array([1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 0])

### II.2. Dimensionality Reduction

In [22]:
# Full Iris data: features and class labels
X, y = dt_ir.data, dt_ir.target

# Hold out 30% of the samples for testing
X_tn, X_tt, y_tn, y_tt = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=27
)

# Standardize: fit the scaler on the training split only, then apply it to both splits
ss = preprocessing.StandardScaler().fit(X_tn)
X_tn, X_tt = ss.transform(X_tn), ss.transform(X_tt)

In [23]:
# PCA: keep 3 components
from sklearn.decomposition import PCA
dr_pca = PCA(n_components=3, random_state=27)
print(dr_pca)

# Learn the projection from the training split, then reuse it for the test split
X_tn_reduced = dr_pca.fit_transform(X_tn)
X_tt_reduced = dr_pca.transform(X_tt)

# Second dimension equals n_components
X_tt_reduced.shape

PCA(copy=True, iterated_power='auto', n_components=3, random_state=27,
  svd_solver='auto', tol=0.0, whiten=False)


(45, 3)