# Python Scikit-Learn CheatSheet

<img src="../sample_files/logos/scikit.svg" width="200" />

Scikit-learn is an open source Python library that implements a range of machine learning, preprocessing, cross-validation and visualization algorithms using a unified interface.

## Basic Example

In [6]:
# Imports: estimator/dataset/preprocessing modules plus the split and metric helpers
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load the iris dataset and keep only the first two feature columns
iris = datasets.load_iris()
X, y = iris.data[:, :2], iris.target
# Hold out a test set; random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)
# Fit the scaler on the training data only, then apply it to both splits
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Train a 5-nearest-neighbours classifier and score its test-set predictions
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)

0.631578947368421

## Loading Data
Your data needs to be numeric and stored as NumPy arrays or SciPy sparse matrices. Other types that are convertible to numeric arrays, such as pandas DataFrames, are also acceptable.

In [27]:
import numpy as np
# 11 samples x 5 features of random floats (unseeded, so values differ per run)
X = np.random.random((11,5))
print("X = \n{}".format(X))
# One string class label per sample
y = np.array(['M','M','F','F','M','F','M','M','F','F','F'])
# Zero out, in place, every entry below 0.7
X[X < 0.7] = 0
print("X = \n{}".format(X))
print("y = \n{}".format(y))

X = 
[[0.83253829 0.50398518 0.0931338  0.59153924 0.2563762 ]
 [0.09056077 0.42845796 0.65820855 0.14151847 0.86606923]
 [0.57335524 0.31312506 0.16328355 0.01007058 0.3900431 ]
 [0.85653543 0.20389465 0.22201192 0.70745912 0.46868355]
 [0.45783324 0.57638144 0.07202692 0.01713034 0.54638836]
 [0.89436925 0.65312998 0.28299846 0.58465384 0.52071888]
 [0.61164555 0.00313316 0.28995923 0.4243056  0.9690657 ]
 [0.02782185 0.49096337 0.79774603 0.05676063 0.8814525 ]
 [0.15854603 0.71335871 0.1118313  0.52119414 0.02201662]
 [0.10825882 0.11629826 0.92338134 0.61426038 0.87361918]
 [0.52808764 0.47773672 0.97026496 0.0853796  0.92004588]]
X = 
[[0.83253829 0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.86606923]
 [0.         0.         0.         0.         0.        ]
 [0.85653543 0.         0.         0.70745912 0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.89436925 0.         0.         0.         0.        ]
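 [0.         0.         0.         0.         0.9690657 ]
 [0.         0.         0.79774603 0.         0.8814525 ]
 [0.         0.71335871 0.         0.         0.        ]
 [0.         0.         0.92338134 0.         0.87361918]
 [0.         0.         0.97026496 0.         0.92004588]]
y = 
['M' 'M' 'F' 'F' 'M' 'F' 'M' 'M' 'F' 'F' 'F']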

## Training and Test Data

In [28]:
from sklearn.model_selection import train_test_split
# Split X and y into train/test subsets using the default test size;
# random_state=0 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=0)

## Preprocessing the Data

### Standardization

In [29]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only
scaler = StandardScaler().fit(X_train)
# Apply the same fitted scaler to both splits
standardized_X = scaler.transform(X_train)
standardized_X_test = scaler.transform(X_test)
print("standardized_X = \n{}".format(standardized_X))
print("standardized_X_test = \n{}".format(standardized_X_test))

standardized_X = 
[[-0.77405591 -0.37796447  1.9450897  -0.37796447  1.02199553]
 [-0.77405591 -0.37796447 -0.57371999 -0.37796447  1.1296252 ]
 [-0.77405591 -0.37796447 -0.57371999 -0.37796447  0.90348245]
 [-0.77405591 -0.37796447  1.49723022 -0.37796447  0.93725851]
 [-0.77405591  2.64575131 -0.57371999 -0.37796447 -0.99809042]
 [ 1.27903779 -0.37796447 -0.57371999  2.64575131 -0.99809042]
 [ 1.22151725 -0.37796447 -0.57371999 -0.37796447 -0.99809042]
 [ 1.3697245  -0.37796447 -0.57371999 -0.37796447 -0.99809042]]
standardized_X_test = 
[[-0.77405591 -0.37796447 -0.57371999 -0.37796447 -0.99809042]
 [-0.77405591 -0.37796447  1.82337973 -0.37796447  0.92005938]
 [-0.77405591 -0.37796447 -0.57371999 -0.37796447 -0.99809042]]


### Normalization

In [30]:
from sklearn.preprocessing import Normalizer
# NOTE: this rebinds `scaler`, replacing the StandardScaler created earlier
scaler = Normalizer().fit(X_train)
# Each sample (row) is rescaled independently; all-zero rows stay all-zero
normalized_X = scaler.transform(X_train)
normalized_X_test = scaler.transform(X_test)
print("normalized_X = \n{}".format(normalized_X))
print("normalized_X_test = \n{}".format(normalized_X_test))

normalized_X = 
[[0.         0.         0.72563616 0.         0.6880786 ]
 [0.         0.         0.         0.         1.        ]
 [0.         0.         0.         0.         1.        ]
 [0.         0.         0.67102496 0.         0.74143476]
 [0.         1.         0.         0.         0.        ]
 [0.77101197 0.         0.         0.63682065 0.        ]
 [1.         0.         0.         0.         0.        ]
 [1.         0.         0.         0.         0.        ]]
normalized_X_test = 
[[0.         0.         0.         0.         0.        ]
 [0.         0.         0.72640937 0.         0.68726227]
 [0.         0.         0.         0.         0.        ]]


### Binarization

In [31]:
from sklearn.preprocessing import Binarizer
# With threshold=0.0, entries greater than 0 become 1 and all others become 0
binarizer = Binarizer(threshold=0.0).fit(X)
binary_X = binarizer.transform(X)
print("binary_X = \n{}".format(binary_X))

binary_X = 
[[1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]
 [1. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 1.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 1.]
 [0. 0. 1. 0. 1.]]


### Encoding Categorical Features

In [32]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
# Replace the string labels with integer codes (here 'F' -> 0, 'M' -> 1).
# NOTE: only `y` is rebound; y_train / y_test from the earlier split are not re-encoded.
y = enc.fit_transform(y)
print("y = {}".format(y))

y = [1 1 0 0 1 0 1 1 0 0 0]


### Imputing Missing Values

In [33]:
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer
# (available since 0.20) replaces it and always imputes column-wise, which is
# what axis=0 selected before.
from sklearn.impute import SimpleImputer
# Treat zeros as missing and fill them with the mean of the column's other values
imp = SimpleImputer(missing_values=0, strategy='mean')
imp.fit_transform(X_train)

array([[0.86114765, 0.71335871, 0.97026496, 0.70745912, 0.92004588],
       [0.86114765, 0.71335871, 0.8840055 , 0.70745912, 0.9690657 ],
       [0.86114765, 0.71335871, 0.8840055 , 0.70745912, 0.86606923],
       [0.86114765, 0.71335871, 0.79774603, 0.70745912, 0.8814525 ],
       [0.86114765, 0.71335871, 0.8840055 , 0.70745912, 0.90915833],
       [0.85653543, 0.71335871, 0.8840055 , 0.70745912, 0.90915833],
       [0.83253829, 0.71335871, 0.8840055 , 0.70745912, 0.90915833],
       [0.89436925, 0.71335871, 0.8840055 , 0.70745912, 0.90915833]])

### Generating Polynomial Features

In [34]:
from sklearn.preprocessing import PolynomialFeatures
# Expand X with polynomial feature combinations up to degree 5
# (the leading column of ones in the output is the bias term)
poly = PolynomialFeatures(5)
poly.fit_transform(X)

array([[1.        , 0.83253829, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.48726258],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.        , 0.71335871, ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.50887462],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.65924588]])

## Create your Model

### Supervised Learning Estimators

In [39]:
# Linear Regression
from sklearn.linear_model import LinearRegression
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2; on those releases drop it (scale with StandardScaler instead
# if needed) -- confirm the targeted version.
lr = LinearRegression(normalize=True)
# Support Vector Machines (SVM)
from sklearn.svm import SVC
svc = SVC(kernel='linear')
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
# KNN
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=5)

# Printing an estimator shows the parameters it was constructed with
print("lr = {}".format(lr))
print("svc = {}".format(svc))
print("gnb = {}".format(gnb))
print("knn = {}".format(knn))

lr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)
svc = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
gnb = GaussianNB(priors=None)
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')


### Unsupervised Learning Estimators

In [40]:
# Principal Component Analysis (PCA)
from sklearn.decomposition import PCA
# A fractional n_components asks PCA to keep as many components as needed to
# explain that share of the variance (per the scikit-learn PCA documentation)
pca = PCA(n_components=0.95)
# K Means
from sklearn.cluster import KMeans
# 3 clusters; random_state=0 fixes the random initialisation
k_means = KMeans(n_clusters=3, random_state=0)

print("pca = {}".format(pca))
print("k_means = {}".format(k_means))

pca = PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
k_means = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)


## Model Fitting

In [41]:
# Supervised learning
# NOTE: lr is fitted on all of X with the integer-encoded y, whereas knn and svc
# are fitted on the training split, whose labels are still the original strings
lr.fit(X, y)                             # Fit the model to the data
knn.fit(X_train, y_train)
svc.fit(X_train, y_train)
# Unsupervised Learning
k_means.fit(X_train)                     # Fit the model to the data
pca_model = pca.fit_transform(X_train)   # Fit to data, then transform it

# pca_model is the transformed training data (an array), not a fitted estimator
print("lr = {}".format(lr))
print("svc = {}".format(svc))
print("knn = {}".format(knn))
print("k_means = {}".format(k_means))
print("pca_model = {}".format(pca_model))

lr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)
svc = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
k_means = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)
pca_model = [[ 0.83964032 -0.32430267 -0.2963779  -0.00320279]
 [ 0.4570015   0.18976501  0.43334823  0.01575435]
 [ 0.38781435  0.20498165  0.38079836  0.00970049]
 [ 0.73982508 -0.22590889 -0.19076596 -0.00261282]
 [-0.2295792   0.74396939 -0.38293279  0.06603886]
 [-0.81145423 -0.30361687  0.04071397  0.42316041]
 [-0.6 ...  (output truncated)

## Prediction

In [43]:
# Supervised Estimators
# NOTE: y_pred is overwritten by every call below; after this cell it holds
# the k-means cluster labels
y_pred = svc.predict(np.random.random((2,5))) # Predict labels for 2 random samples
print("svc.predict y_pred = {}".format(y_pred))
y_pred = lr.predict(X_test)                   # Predict continuous target values
print("lr.predict y_pred = {}".format(y_pred))
y_pred = knn.predict_proba(X_test)            # Estimate probability of a label
print("knn.predict_proba y_pred = {}".format(y_pred))
# Unsupervised Estimators
y_pred = k_means.predict(X_test)              # Predict labels in clustering algos
print("k_means.predict y_pred = {}".format(y_pred))


svc.predict y_pred = ['F' 'F']
lr.predict y_pred = [0.52566384 0.26923913 0.52566384]
knn.predict_proba y_pred = [[0.4 0.6]
 [0.4 0.6]
 [0.4 0.6]]
k_means.predict y_pred = [0 1 0]


## Evaluate your Model's Performance

### Classification Metrics

In [48]:
# Accuracy Score
knn.score(X_test, y_test)                         # Estimator score method
from sklearn.metrics import accuracy_score        # Metric scoring functions
# NOTE(review): y_pred still holds the k-means cluster labels from the
# Prediction cell while y_test holds string labels at this point, so this
# comparison is not meaningful -- presumably knn.predict(X_test) was
# intended; confirm before relying on the numbers below.
accuracy_score(y_test, y_pred)
# Encode the string test labels as integers so they can be compared with y_pred
enc = LabelEncoder()
y_test = enc.fit_transform(y_test)
# Classification Report
from sklearn.metrics import classification_report # Precision, recall, f1-score and support
print(classification_report(y_test, y_pred))
# Confusion Matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.50      0.50      0.50         2
          1       0.00      0.00      0.00         1

avg / total       0.33      0.33      0.33         3

[[1 1]
 [1 0]]


  score = y_true == y_pred


### Regression Metrics

In [54]:
# Results are stored under their own names (mae, mse, r2): assigning them to
# the imported function names would shadow those functions and make the cell
# fail when it is run a second time.

# Mean Absolute Error
from sklearn.metrics import mean_absolute_error
y_true = [3, -0.5, 2]
mae = mean_absolute_error(y_true, y_pred)
# Mean Squared Error
from sklearn.metrics import mean_squared_error
# NOTE(review): this metric is computed against y_test while the other two
# use y_true -- confirm which reference is intended.
mse = mean_squared_error(y_test, y_pred)
# R² Score
from sklearn.metrics import r2_score
r2 = r2_score(y_true, y_pred)

print("y_true = {}".format(y_true))
print("y_pred = {}".format(y_pred))
print("mean_absolute_error = {}".format(mae))
print("mean_squared_error = {}".format(mse))
print("r2_score = {}".format(r2))

y_true = [3, -0.5, 2]
y_pred = [0 1 0]
mean_absolute_error = 2.1666666666666665
mean_squared_error = 0.6666666666666666
r2_score = -1.3461538461538463


### Clustering Metrics

In [56]:
# Results are stored under their own names (ari, homogeneity, v_measure):
# assigning them to the imported function names would shadow those functions
# and make the cell fail when it is run a second time.
# y_true and y_pred are the values left over from the preceding cells.

# Adjusted Rand Index
from sklearn.metrics import adjusted_rand_score
ari = adjusted_rand_score(y_true, y_pred)
# Homogeneity
from sklearn.metrics import homogeneity_score
homogeneity = homogeneity_score(y_true, y_pred)
# V-measure
from sklearn.metrics import v_measure_score
v_measure = v_measure_score(y_true, y_pred)

print("y_true = {}".format(y_true))
print("y_pred = {}".format(y_pred))
print("adjusted_rand_score = {}".format(ari))
print("homogeneity_score = {}".format(homogeneity))
print("v_measure_score = {}".format(v_measure))

y_true = [3, -0.5, 2]
y_pred = [0 1 0]
adjusted_rand_score = 0.0
homogeneity_score = 0.5793801642856952
v_measure_score = 0.7336804366512111


### Cross-Validation

In [57]:
# cross_val_score lives in sklearn.model_selection (the module this document
# already uses for train_test_split); the old sklearn.cross_validation module
# was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import cross_val_score
# One score per fold: 4 folds for the classifier, 2 folds for the regression
print(cross_val_score(knn, X_train, y_train, cv=4))
print(cross_val_score(lr, X, y, cv=2))

[0.5 0.5 0.5 0.5]
[-269.65711954   -0.95080087]




## Tune Your Model

### Grid Search

In [58]:
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import GridSearchCV
# Candidates: n_neighbors in {1, 2} crossed with two distance metrics
params = {"n_neighbors": np.arange(1,3),
          "metric": ["euclidean", "cityblock"]}
# cv=3 is stated explicitly: it was the default of the old module, whereas
# scikit-learn 0.22+ defaults to 5 folds.
grid = GridSearchCV(estimator=knn,
                    param_grid=params,
                    cv=3)
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_estimator_.n_neighbors)

0.625
2




### Randomized Parameter Optimization

In [59]:
# RandomizedSearchCV lives in sklearn.model_selection; the old
# sklearn.grid_search module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import RandomizedSearchCV
params = {"n_neighbors": range(1,5),
          "weights": ["uniform", "distance"]}
# n_iter=8 equals the number of combinations (4 x 2), so every candidate is tried;
# random_state fixes the sampling order
rsearch = RandomizedSearchCV(estimator=knn,
                             param_distributions=params,
                             cv=4,
                             n_iter=8,
                             random_state=5)
rsearch.fit(X_train, y_train)
print(rsearch.best_score_)

0.625
