# ML Cheatsheet

In [1]:
import numpy as np

# Classification (Supervised Learning)

In [2]:
# Iris data
from sklearn import datasets
temp = datasets.load_iris()

In [3]:
X = temp.data[50:]
y = temp.target[50:]
#np.column_stack([X, y])

# Train test split
from sklearn import model_selection
X_tn, X_tt, y_tn, y_tt = model_selection.train_test_split(X, y, test_size=0.3, random_state=27)

# Normalization
from sklearn import preprocessing
ss = preprocessing.StandardScaler()
print(ss)

X_tn = ss.fit_transform(X_tn)
X_tt = ss.transform(X_tt)

StandardScaler(copy=True, with_mean=True, with_std=True)


In [4]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
clf_lr = LogisticRegression()
print(clf_lr)

clf_lr.fit(X_tn, y_tn)
print(clf_lr.score(X_tn, y_tn))
print(clf_lr.score(X_tt, y_tt))

#print(clf_lr.coef_)
#print(clf_lr.intercept_)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.971428571429
0.933333333333


In [5]:
# kNN
from sklearn.neighbors import KNeighborsClassifier
clf_knn = KNeighborsClassifier(n_neighbors=5)
print(clf_knn)

clf_knn.fit(X_tn, y_tn)
print(clf_knn.score(X_tn, y_tn))
print(clf_knn.score(X_tt, y_tt))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.957142857143
0.9


In [6]:
# SVM
from sklearn.svm import SVC
clf_svc = SVC()
print(clf_svc)

clf_svc.fit(X_tn, y_tn)
print(clf_svc.score(X_tn, y_tn))
print(clf_svc.score(X_tt, y_tt))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.985714285714
0.933333333333


In [7]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB    # or BernoulliNB
clf_nb = GaussianNB()
print(clf_nb)

clf_nb.fit(X_tn, y_tn)
print(clf_nb.score(X_tn, y_tn))
print(clf_nb.score(X_tt, y_tt))

GaussianNB(priors=None)
0.942857142857
0.866666666667


In [8]:
# Decision Tree
from sklearn import tree
clf_dt = tree.DecisionTreeClassifier(criterion='gini')    # or criterion='entropy' (info gain)
print(clf_dt)

clf_dt.fit(X_tn, y_tn)
print(clf_dt.score(X_tn, y_tn))
print(clf_dt.score(X_tt, y_tt))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
1.0
0.9


In [9]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier()
print(clf_rf)

clf_rf.fit(X_tn, y_tn)
print(clf_rf.score(X_tn, y_tn))
print(clf_rf.score(X_tt, y_tt))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
0.985714285714
0.9


In [10]:
# Gradient Boosting & Ada Boost
from sklearn.ensemble import GradientBoostingClassifier
clf_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
print(clf_gb)

clf_gb.fit(X_tn, y_tn)
print(clf_gb.score(X_tn, y_tn))
print(clf_gb.score(X_tt, y_tt))

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='deviance', max_depth=1,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False)
1.0
0.9


# Clustering (Unsupervised Learning)

In [11]:
# Iris data
from sklearn import datasets
temp = datasets.load_iris()

In [12]:
X = temp.data[:100]
y = temp.target[:100]

# Train test split
from sklearn import model_selection
X_tn, X_tt, y_tn, y_tt = model_selection.train_test_split(X, y, test_size=0.3, random_state=27)

# Normalization
from sklearn import preprocessing
ss = preprocessing.StandardScaler()
print(ss)

X_tn = ss.fit_transform(X_tn)
X_tt = ss.transform(X_tt)

StandardScaler(copy=True, with_mean=True, with_std=True)


In [13]:
# k-Means
from sklearn.cluster import KMeans
clu_km = KMeans(n_clusters=2, random_state=None)
print(clu_km)

clu_km.fit(X_tn)
clu_km.predict(X_tt)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)


array([1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 0])

In [14]:
y_tt    # Prediction should cluster similarly

array([1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 0])

# Dimensionality Reduction (Unsupervised Learning)

In [15]:
# Iris data
from sklearn import datasets
temp = datasets.load_iris()

In [16]:
X = temp.data
y = temp.target

# Train test split
from sklearn import model_selection
X_tn, X_tt, y_tn, y_tt = model_selection.train_test_split(X, y, test_size=0.3, random_state=27)

# Normalization
from sklearn import preprocessing
ss = preprocessing.StandardScaler()
print(ss)

X_tn = ss.fit_transform(X_tn)
X_tt = ss.transform(X_tt)

StandardScaler(copy=True, with_mean=True, with_std=True)


In [17]:
# PCA
from sklearn.decomposition import PCA
dr_pca = PCA(n_components=3)
print(dr_pca)

X_tn_reduced = dr_pca.fit_transform(X_tn)
X_tt_reduced = dr_pca.transform(X_tt)

X_tt_reduced.shape

PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)


(45, 3)

# Regression (Supervised Learning)

In [18]:
#  Boston house price data
from sklearn import datasets
temp = datasets.load_boston()

In [19]:
X = temp.data
y = temp.target
#np.column_stack([X, y])

# Train test split
from sklearn import model_selection
X_tn, X_tt, y_tn, y_tt = model_selection.train_test_split(X, y, test_size=0.3, random_state=27)

# Normalization
from sklearn import preprocessing
ss = preprocessing.StandardScaler()
print(ss)

X_tn = ss.fit_transform(X_tn)
X_tt = ss.transform(X_tt)

StandardScaler(copy=True, with_mean=True, with_std=True)


In [20]:
from sklearn.linear_model import LinearRegression
rgs_lin = LinearRegression()
print(rgs_lin)

rgs_lin.fit(X_tn, y_tn)
print(rgs_lin.score(X_tn, y_tn))
print(rgs_lin.score(X_tt, y_tt))

#rgs_lin.predict(X_tt)
#print(rgs_lin.coef_)
#print(rgs_lin.intercept_)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
0.746428671695
0.708936259481
