# Scikit-learn

This notebook is an introduction to [scikit-learn](https://scikit-learn.org/stable/), the main package for machine learning (excluding neural nets).

Most algorithms are available as objects with common methods like ``fit``, ``predict`` or ``transform``. 

The outline is the following:

**1. Supervised learning**

* classification
* cross-validation
* grid search
* feature selection
* soft classification
* regression

**2. Unsupervised learning**

* clustering
* anomaly detection
* dimensionality reduction

**3. Feature engineering**

* scaling
* categorical features
* text features

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, OrdinalEncoder, LabelBinarizer
from sklearn.decomposition import PCA
from sklearn.datasets import load_digits, load_wine
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer

## 1. Supervised learning

The objective of supervised learning is to map data ``X`` to target ``y``.

``algo.fit(X, y)``

## Classification

In [None]:
iris = sns.load_dataset('iris')

In [None]:
iris.info()

In [None]:
sns.pairplot(data=iris, hue="species", height=2);

In [None]:
# data: every column except the species label is a feature
features_df = iris.drop(columns=['species'])

# keep the column names, then switch to plain numpy arrays
feature_names = features_df.columns.to_numpy()
X = features_df.to_numpy()
y = iris['species'].to_numpy()

In [None]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
algo = KNeighborsClassifier(n_neighbors=3)

In [None]:
# fit to training data
algo.fit(X_train, y_train)

In [None]:
# predict on test data
y_pred = algo.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
labels = np.unique(y_test)

In [None]:
confusion = confusion_matrix(y_test, y_pred, labels=labels)

In [None]:
# rows are the true classes, columns the predicted ones
ax = sns.heatmap(
    confusion,
    annot=True,
    square=True,
    cmap='binary',
    xticklabels=labels,
    yticklabels=labels,
)
ax.set_ylabel('Truth')
ax.set_xlabel('Prediction');

## Cross-validation

In [None]:
# data
X = iris.drop(columns=['species']).to_numpy()
y = iris['species'].to_numpy()

In [None]:
algo = KNeighborsClassifier(n_neighbors=3)

In [None]:
# 5 splits of data
cross_val_score(algo, X, y, cv=5)

## Grid search

Search for the best hyperparameters of the model, scoring each candidate by cross-validation on the training data.

### Single algorithm

In [None]:
algo = KNeighborsClassifier()
parameters = {'n_neighbors': np.arange(1, 10)}
grid = GridSearchCV(algo, parameters, cv=5)

In [None]:
grid.fit(X_train, y_train)

In [None]:
y_pred = grid.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
grid.best_params_

### Multiple algorithms

In [None]:
algos = [KNeighborsClassifier(), 
         SVC(), 
         RandomForestClassifier()]

In [None]:
# corresponding parameters
parameters = [{'n_neighbors': np.arange(1, 10)}, 
              {'C': [0.5, 1, 1.5], 'kernel': ['linear', 'poly']}, 
              {}]

In [None]:
# test accuracy of each algorithm after a 5-fold grid search on the training set
scores = []
for algo, param in zip(algos, parameters):
    grid = GridSearchCV(estimator=algo, param_grid=param, cv=5).fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))

In [None]:
scores

## Feature selection

In [None]:
# select from a model
model = RandomForestClassifier()
algo = SelectFromModel(model)

In [None]:
algo.fit(X_train, y_train)

In [None]:
features = algo.get_support()

In [None]:
print(feature_names[features])

In [None]:
X_train_ = algo.transform(X_train)
X_test_ = algo.transform(X_test)

In [None]:
algo = KNeighborsClassifier()
parameters = {'n_neighbors': np.arange(1, 10)}
grid = GridSearchCV(algo, parameters, cv=5)

In [None]:
grid.fit(X_train_, y_train)

In [None]:
y_pred = grid.predict(X_test_)

In [None]:
accuracy_score(y_test, y_pred)

## Soft classification

In [None]:
algo = SVC(probability=True)

In [None]:
algo.fit(X_train, y_train)

In [None]:
probs = algo.predict_proba(X_test)

In [None]:
probs.shape

In [None]:
confidence = np.max(probs, axis=1)

In [None]:
sns.histplot(confidence, bins=10);

In [None]:
# scatter two features of the test set, one color per true class;
# the marker area grows with the confidence of the prediction
dims = [2, 3]
for label in labels:
    mask = y_test == label
    # the top class probability can be below 0.5 when there are more than
    # two classes; clip so that the marker area is never negative
    sizes = 200 * np.clip(confidence[mask] - 0.5, 0, None)
    plt.scatter(X_test[mask, dims[0]], X_test[mask, dims[1]], s=sizes)
plt.xlabel(iris.columns[dims[0]])
plt.ylabel(iris.columns[dims[1]])
plt.show()

## Regression

### Linear regression

In [None]:
# data: 20 noisy samples of the line y = 2x - 10 (Gaussian noise, std 2)
x = np.linspace(0, 10, num=20)
y = 2 * x - 10 + 2 * np.random.randn(x.size)

In [None]:
plt.scatter(x, y);

In [None]:
algo = LinearRegression()

In [None]:
# shape = (n_samples, n_features)
X = x.reshape(-1, 1)

In [None]:
algo.fit(X, y)

In [None]:
y_pred = algo.predict(X)

In [None]:
plt.scatter(x, y)
plt.plot(x, y_pred, color='k');

### Polynomial regression

In [None]:
# data: 50 noisy samples of sin(x) / x (Gaussian noise, std 0.1)
x = np.linspace(0, 10, num=50)
# nudge x = 0 to a tiny positive value to avoid dividing by zero
x = np.where(x == 0, 10**-6, x)
y = np.sin(x) / x + 0.1 * np.random.randn(x.size)

In [None]:
plt.scatter(x, y);

In [None]:
algo = make_pipeline(PolynomialFeatures(5), LinearRegression())

In [None]:
X = x.reshape(-1,1)

In [None]:
algo.fit (X, y)

In [None]:
y_pred = algo.predict(X)

In [None]:
plt.scatter(x, y)
plt.plot(x, y_pred, color='k');

## 2. Unsupervised learning

The objective of unsupervised learning is to understand the structure of the data ``X`` **without** a target ``y``.

``algo.fit(X)``

## Clustering

In [None]:
X = iris.drop(columns=['species']).to_numpy()
y = iris['species'].to_numpy()

In [None]:
n_clusters = 3
algo = KMeans(n_clusters)

In [None]:
y_pred = algo.fit_predict(X[:, features])

In [None]:
# plot the clusters along the first two selected features
# (column indices where the boolean mask `features` is True)
dims = np.where(features)[0]
for i in range(n_clusters):
    # samples assigned to cluster i; each scatter call gets the next default color
    mask = y_pred == i
    plt.scatter(X[mask, dims[0]], X[mask, dims[1]])
plt.xlabel(iris.columns[dims[0]])
plt.ylabel(iris.columns[dims[1]])
plt.show()

## Anomaly detection

In [None]:
algo = IsolationForest(contamination=0.05)

In [None]:
normal = algo.fit_predict(X[:, features])

In [None]:
# one color per species; samples with a negative prediction are drawn faded
colors = ['b', 'r', 'g']
dims = np.where(features)[0]
x_dim, y_dim = dims[0], dims[1]
positive, negative = normal > 0, normal < 0
for label, color in zip(labels, colors):
    in_class = y == label
    for subset, alpha in ((positive, None), (negative, 0.5)):
        mask = in_class & subset
        plt.scatter(X[mask, x_dim], X[mask, y_dim], color=color, alpha=alpha)
plt.xlabel(iris.columns[x_dim])
plt.ylabel(iris.columns[y_dim])
plt.show()

## Dimensionality reduction

In [None]:
digits = load_digits()

In [None]:
X = digits.data
y = digits.target

In [None]:
X.shape

In [None]:
def show_digits(X, y):
    """Display a mosaic of digit images, one row per class 0-9.

    For each class, up to the first 10 samples with that label are reshaped
    to 8x8 and placed in a 10x10-pixel cell (1-pixel margin) of a 100x100 image.

    X: array whose rows each hold the 64 pixel values of an image.
    y: array of labels, one per row of X.
    """
    canvas = np.zeros((100, 100))
    for digit in range(10):
        top = 10 * digit + 1
        samples = np.nonzero(y == digit)[0][:10]
        for col, sample in enumerate(samples):
            left = 10 * col + 1
            canvas[top:top + 8, left:left + 8] = X[sample].reshape((8, 8))
    plt.imshow(canvas, cmap='binary')
    plt.xticks([])
    # put one tick, labelled with the digit, at the middle of each row
    plt.yticks(5 + 10 * np.arange(10), np.arange(10))

In [None]:
show_digits(X, y)

In [None]:
algo = PCA(n_components=8)

In [None]:
X_reduced = algo.fit_transform(X)

In [None]:
X_reduced.shape

In [None]:
plt.bar(np.arange(8) + 1, algo.explained_variance_ratio_);

In [None]:
X_inv = algo.inverse_transform(X_reduced)

In [None]:
show_digits(X_inv, y)

## 3. Feature engineering

The objective of feature engineering is to transform data into a proper array of numerical values ``X``.

## Scaling

In [None]:
algo = StandardScaler()

In [None]:
wine = load_wine()

In [None]:
X = wine.data
y = wine.target

In [None]:
sns.catplot(data=pd.DataFrame(X));

In [None]:
X_scale = algo.fit_transform(X)

In [None]:
sns.catplot(data=pd.DataFrame(X_scale));

In [None]:
# application
algo = KNeighborsClassifier(n_neighbors=3)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

In [None]:
algo.fit(X_train, y_train)

In [None]:
algo.score(X_test, y_test)

In [None]:
# scaling
pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))

In [None]:
pipe.fit(X_train, y_train)

In [None]:
pipe.score(X_test, y_test)

## Categorical features

In [None]:
df = sns.load_dataset("titanic")

In [None]:
df = df.drop(columns=["deck", "embark_town", "class", "alive"])

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df["embarked"] = df["embarked"].fillna("Unknown")
df["age"] = df["age"].fillna(-1)

In [None]:
# take the target column out of the frame; the remaining columns are the features
y = df.pop("survived").to_numpy()

In [None]:
algo = OrdinalEncoder()

In [None]:
X = algo.fit_transform(df)

In [None]:
X.shape

In [None]:
# number of distinct encoded values in each column
for col, values in zip(df.columns, X.T):
    print(col, len(set(values)))

In [None]:
algo = OneHotEncoder()

In [None]:
X_bin = algo.fit_transform(df)

In [None]:
X_bin.shape

In [None]:
X_bin

In [None]:
categories = algo.categories_

In [None]:
{col: len(cat) for col, cat in zip(df.columns, categories)}

In [None]:
# specific columns: one-hot encode only 'embarked' and 'who', as a dense array
# (`sparse_output` replaces the `sparse` argument, renamed in scikit-learn 1.2
# and removed under its old name in 1.4)
algo = make_column_transformer((OneHotEncoder(sparse_output=False), ['embarked', 'who']))

In [None]:
X_bin = algo.fit_transform(df)

In [None]:
X_bin.shape

In [None]:
X_bin[0]

In [None]:
df.head()

In [None]:
# keep others: one-hot encode the categorical columns (a single column for
# binary ones) and pass the remaining columns through unchanged
# (`sparse_output` replaces the `sparse` argument, renamed in scikit-learn 1.2
# and removed under its old name in 1.4)
tf = make_column_transformer((OneHotEncoder(drop='if_binary', sparse_output=False),
                                ['sex', 'embarked', 'who', 'adult_male', 'alone']), remainder='passthrough')

In [None]:
X_bin = tf.fit_transform(df)

In [None]:
X_bin.shape

In [None]:
algo = RandomForestClassifier()

In [None]:
# split the one-hot encoded features (X_bin from the column transformer),
# not the ordinal-encoded X computed earlier in this section
X_train, X_test, y_train, y_test = train_test_split(X_bin, y, test_size=0.5)

In [None]:
algo.fit(X_train, y_train)

In [None]:
algo.score(X_test, y_test)

## Text features

In [None]:
algo = CountVectorizer()

In [None]:
X = algo.fit_transform([digits.DESCR, wine.DESCR])

In [None]:
X.shape

In [None]:
counts = X.toarray().sum(axis=0)

In [None]:
# vocabulary learned by the vectorizer, returned as an array of words
# (get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out())
features_names = algo.get_feature_names_out()

In [None]:
# top words
features_names[np.argsort(-counts)[:5]]