### ROC curve

In [None]:
#ROC curve
from sklearn.metrics import roc_curve, auc

# Compute the ROC curve and the area under it for the "earn" labels/scores.
# roc_curve also returns the thresholds, which are not needed here.
fpr, tpr, _ = roc_curve(y_te_earn, y_score_earn)
roc_auc = auc(fpr, tpr)

In [None]:
import matplotlib.pyplot as plt

# Draw the ROC curve for "earn" together with the diagonal reference line.
lw = 2
roc_label = 'ROC curve (area = %0.2f)' % roc_auc

plt.figure(figsize=(7, 7))
plt.plot(fpr, tpr, lw=lw, color='darkorange', label=roc_label)
# Dashed diagonal from (0, 0) to (1, 1), i.e. TPR equal to FPR.
plt.plot([0, 1], [0, 1], lw=lw, color='navy', linestyle='--')

plt.title('ROC curve for earn')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()

### PR curve

In [None]:
#PR curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# Precision/recall pairs for the "earn" scores (thresholds are discarded),
# followed by the average-precision summary of the same labels and scores.
precision, recall, _ = precision_recall_curve(y_te_earn, y_score_earn)
average_precision = average_precision_score(y_te_earn, y_score_earn)

In [None]:
# Plot the precision-recall curve as a filled step function.
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
# The title needs a format placeholder, otherwise .format() silently drops
# the average precision and the score never appears on the figure.
plt.title('PR curve for earn: AP={0:0.2f}'.format(average_precision))
plt.show()

### Test error

In [None]:
#Function that calculate the test error
def test_error(X_tr,y_tr,X_te,y_te,regmod):
    """
    Return the test error for square error loss
    
    Input: training and testing design matrix, X_tr ,X_te
            training and testing response vector, y_tr, y_te
            a regression model, regmod
            
    Output: scalar empirical risk
    """
    regmod.fit(X_tr,y_tr)
    y_hat = regmod.predict(X_te)
    return np.mean((y_hat - y_te)**2)

### scale

In [None]:
from sklearn.preprocessing import StandardScaler, scale

# Learn the centering/scaling statistics on the training set only and apply
# the same transformation to the test set. Standardising the test set with
# its own mean/std would put the two sets on different scales and use
# test-set information during preprocessing.
scaler = StandardScaler().fit(X_tr)
X_tr = scaler.transform(X_tr)
X_te = scaler.transform(X_te)

### Confusion matrix
http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib axes.

    Adapted from
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix, indexed as cm[true label, predicted label]
        (rows are drawn on the y axis, columns on the x axis).
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its row sum before the matrix is
        printed and drawn.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the image.

    Returns
    -------
    None. The matrix is printed to stdout and drawn with pyplot; the caller
    is responsible for creating the figure and calling ``plt.show()``.
    """
    # Local import: itertools is not imported in the visible notebook cells.
    import itertools

    # Normalise BEFORE drawing so the image, the colour threshold and the
    # cell annotations all describe the same matrix.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Two decimals for normalised rates; an empty format spec is equivalent
    # to str(), so raw counts are rendered exactly as before.
    fmt = '.2f' if normalize else ''
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')

### Logistic Regression
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

### Lasso Path

In [None]:
from sklearn.linear_model import lars_path

In [None]:
#Lasso path
# lars_path is imported by name in the previous cell; calling it through
# `linear_model` would fail on a fresh kernel because that module is only
# imported in a later cell.
alphas_lasso, active_lasso, coefs_lasso = lars_path(X_tr, y_tr, method='lasso')

fig = plt.figure(figsize = (7,7))
# NOTE(review): plot_lars is not defined in the visible cells - confirm it is
# available before this cell runs.
plot_lars(coefs_lasso,title = 'Lasso Path')

In [None]:
#Calculate test error for each returned lasso coefficients
# This is the first cell that executes NumPy code, and the shared import
# cell only appears further down, so import it here as well.
import numpy as np

# One coefficient vector per column of coefs_lasso; take the number of steps
# from the array instead of hard-coding it.
n_steps = coefs_lasso.shape[1]
TestEr_lasso = np.zeros(n_steps)

for i in range(n_steps):
    y_hat = X_te @ coefs_lasso[:,i]
    TestEr_lasso[i] = np.mean((y_hat - y_te)**2)

In [None]:
# Test error along the lasso path, plotted against the path's alphas
fig = plt.figure(figsize=(7, 7))
plt.plot(alphas_lasso, TestEr_lasso)

plt.title('Test Error for Lasso Path')
plt.xlabel('alphas')
plt.ylabel('Test Error')
plt.show()

### Restricted OLS

In [None]:
from sklearn import linear_model

In [None]:
#Extract active sets and apply restricted OLS
# One test error per nested active set: size the result from active_lasso so
# it always matches the number of fitted models.
n_models = len(active_lasso)
TestEr = np.zeros(n_models)

# `normalize` is omitted: scikit-learn ignores it when fit_intercept=False,
# and recent releases no longer accept the argument.
ols = linear_model.LinearRegression(fit_intercept=False)

for i in range(n_models):
    # Columns of the first i+1 entries of the active set
    event = active_lasso[:i+1]
    X_tr_temp = X_tr[:,event]
    X_te_temp = X_te[:,event]

    TestEr[i] = test_error(X_tr_temp,y_tr,X_te_temp,y_te,ols)

In [None]:
#Number of variables in each fitted model: the i-th restricted OLS fit uses
#active_lasso[:i+1], so the sizes run from 1 to len(active_lasso)
size = list(range(1, len(active_lasso) + 1))

#Draw the Test Error for restricted OLS
fig = plt.figure(figsize = (7,7))
plt.plot(size,TestEr)

plt.xlabel('size')
plt.ylabel('Test Error')
plt.title('Test Error for Restricted OLS')
plt.show()

### K-means Clustering
http://scikit-learn.org/stable/modules/clustering.html#k-means

http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_assumptions.html#sphx-glr-auto-examples-cluster-plot-kmeans-assumptions-py

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans

In [None]:
# Two-cluster k-means; fit_predict fits the model and returns one cluster
# index per row of X.
# NOTE(review): X and random_state are not defined in the visible cells.
kmeans = KMeans(n_clusters=2, random_state=random_state)
y_pred = kmeans.fit_predict(X)

### KNN Classification
http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# 2-nearest-neighbour classifier fitted on features X and labels y
neigh = KNeighborsClassifier(n_neighbors=2)
neigh.fit(X, y)

In [None]:
# predict() requires the samples to classify. X is reused here as a
# placeholder - pass held-out data to evaluate the classifier.
neigh.predict(X)

In [None]:
# confusion_matrix is not imported in the visible cells, so import it here.
from sklearn.metrics import confusion_matrix

#Draw the confusion matrix
fig = plt.figure(figsize = (10,10))
# NOTE(review): newer matplotlib releases renamed the seaborn style sheets
# (e.g. 'seaborn-v0_8-white') - confirm the style name for the installed version.
with plt.style.context(('seaborn-white')):
    plot_confusion_matrix(confusion_matrix(y_true,y_pred),[1,2,3,4,5])
plt.show()

### PCA
http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
http://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html#sphx-glr-auto-examples-datasets-plot-iris-dataset-py

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=2)

#Fit the model with X and apply the dimensionality reduction on X.
#fit_transform is a method of the estimator and fits the model itself, so a
#separate pca.fit(X) beforehand is not needed.
pca.fit_transform(X)

In [None]:
# plot the first three PCA dimensions of the iris data
# Load the dataset explicitly: `datasets` is imported above but `iris` is not
# defined in the visible cells.
iris = datasets.load_iris()

fig = plt.figure(1, figsize=(8, 6))
# Create the 3-D axes through the figure so they are attached to it; a bare
# Axes3D(fig, ...) is not added to the figure by current matplotlib.
ax = fig.add_subplot(111, projection='3d', elev=-150, azim=110)
X_reduced = PCA(n_components=3).fit_transform(iris.data)
# Colour by the iris labels so the colours always match the plotted samples.
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=iris.target,
           cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three PCA directions")
# xaxis/yaxis/zaxis replace the w_xaxis-style attributes, which have been
# removed from recent matplotlib.
ax.set_xlabel("1st eigenvector")
ax.xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.zaxis.set_ticklabels([])

plt.show()

### LDA/QDA
http://scikit-learn.org/stable/modules/lda_qda.html

In [None]:
from scipy import linalg
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import colors

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# NOTE(review): plot_data, plot_lda_cov and plot_qda_cov are not defined in
# the visible cells - confirm they are available before this cell runs.
# The figure indices are fixed (1 and 2) rather than computed from a loop
# variable `i`, which would only exist as a leftover from an earlier cell.

# Linear Discriminant Analysis
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
y_pred = lda.fit(X, y).predict(X)
splot = plot_data(lda, X, y, y_pred, fig_index=1)
plot_lda_cov(lda, splot)
plt.axis('tight')

# Quadratic Discriminant Analysis
# The keyword is `store_covariance`; the plural spelling is no longer
# accepted by scikit-learn.
qda = QuadraticDiscriminantAnalysis(store_covariance=True)
y_pred = qda.fit(X, y).predict(X)
splot = plot_data(qda, X, y, y_pred, fig_index=2)
plot_qda_cov(qda, splot)
plt.axis('tight')
plt.show()

### Decision Tree Classifier
http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score

# Load the iris data in this cell so it does not depend on `iris` having been
# created somewhere else in the kernel.
iris = load_iris()

# 10-fold cross-validation scores of a decision tree on the iris data
clf = DecisionTreeClassifier(random_state=0)
cross_val_score(clf, iris.data, iris.target, cv=10)

In [None]:
# Train on (X, y), then label the samples in X_pred; fit() returns the
# estimator itself, so the two calls can be chained.
clf.fit(X, y).predict(X_pred)

### Random Forest Classifier

http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)

# Only the last expression of a cell is displayed, so print the importances
# explicitly.
print(clf.feature_importances_)
# Predict labels for the new samples in X_pred.
clf.predict(X_pred)

OOB (out-of-bag) errors

http://scikit-learn.org/stable/auto_examples/ensemble/plot_ensemble_oob.html#sphx-glr-auto-examples-ensemble-plot-ensemble-oob-py

In [None]:
# OrderedDict is not imported in the visible cells, so import it here.
from collections import OrderedDict

# NOTE(review): RANDOM_STATE is expected to be defined before this cell runs;
# it is not defined in the visible cells.

# NOTE: Setting the `warm_start` construction parameter to `True` disables
# support for parallelized ensembles but is necessary for tracking the OOB
# error trajectory during training.
ensemble_clfs = [
    ("RandomForestClassifier, max_features='sqrt'",
        RandomForestClassifier(warm_start=True, oob_score=True,
                               max_features="sqrt",
                               random_state=RANDOM_STATE)),
    ("RandomForestClassifier, max_features='log2'",
        RandomForestClassifier(warm_start=True, max_features='log2',
                               oob_score=True,
                               random_state=RANDOM_STATE)),
    ("RandomForestClassifier, max_features=None",
        RandomForestClassifier(warm_start=True, max_features=None,
                               oob_score=True,
                               random_state=RANDOM_STATE))
]

# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
# Every label starts with an empty list.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)


### Bagging & boosting
http://scikit-learn.org/stable/modules/ensemble.html

### 究极奥义: classification comparison

http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#sphx-glr-auto-examples-classification-plot-classifier-comparison-py