# Useful stuff for the upcoming project

In [None]:
# Add some basic lib
import numpy as np
import pandas as pd

## Pipeline preprocessor

First seen in *04. Logistic Regression / 07. Exercise - Heart disease diagnosis / 05 Solution - Heart disease.ipynb*



In [None]:
# Everything this cell needs from scikit-learn, grouped up front
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Columns routed through the one-hot encoder
onehot_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

# Numerical columns, handed over unchanged ('passthrough')
other_columns = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']

# Column-wise preprocessing: encode one group, pass the other one through.
# handle_unknown='ignore' keeps transform() from failing on categories
# that were not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), onehot_columns),
        ('other', 'passthrough', other_columns),
    ]
)

# Full k-NN estimator: preprocess -> standardize -> classify.
# Standardizing matters here because k-NN is distance based.
knn_estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier()),
    ]
)



## Get the proportion of each distinct value in a pandas Series

Calling `value_counts()` with the *normalize* parameter set to *True* returns the proportion (relative frequency) of each value instead of simply the number of occurrences.

In [None]:
# Toy vector of class labels
y = [1, 1, 1, 2, 2, 2, 3, 4, 5, 5]

# Proportion of samples in each class.
# The top-level pd.value_counts() helper is deprecated (pandas >= 2.1), so the
# list is wrapped in a Series and the Series method is used instead.
# normalize=True returns relative frequencies that sum to 1 rather than counts.
proportions = pd.Series(y).value_counts(normalize=True)
proportions

## Draw a decision tree

You can find how to do that in *05. Decision trees and SVMs/02. Decision trees/Learn.ipynb*



In [None]:
import graphviz
from sklearn.tree import export_graphviz

# Feature names shown in the tree nodes: every column of encoded_df except 'survived'
# NOTE(review): `dt` and `encoded_df` must already exist (fitted tree / encoded frame) - confirm
feature_columns = encoded_df.drop('survived', axis=1).columns

# Export the decision tree in DOT format; with out_file=None the DOT source
# is returned as a string instead of being written to a file
dot_data = export_graphviz(
    dt,
    out_file=None,
    feature_names=feature_columns,
    class_names=['died', 'survived'],
    filled=True,
    rounded=True,
    proportion=True,
)

# Render the DOT source; as the last expression of the cell it is displayed inline
graphviz.Source(dot_data)

## Make a cross-tabulation between two columns of a pandas DataFrame

You can find how to do that in *05. Decision trees and SVMs/02. Decision trees/Learn.ipynb*

In [None]:
# Cross-tabulate sex (rows) against survived (columns).
# normalize='index' divides each row by its total, so every row sums to 1.
crosstab = pd.crosstab(data_df['sex'], data_df['survived'], normalize='index')
crosstab

## Do not forget to scale values !!

In [None]:
from sklearn.preprocessing import scale

# Rescale the input data: all columns except 'y', as a plain NumPy array.
# scale() standardizes each column with its default settings.
X = scale(
    data_df.drop(columns='y').values
)

## Some default values for various classifiers 
> * Logistic regression
> * SVM with linear kernel
> * *k*-NN
> * Decision tree
> * Random forest
> * SVM with RBF kernel

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier

# One shared seed so every randomized estimator below is reproducible
RANDOM_STATE = 0

# Logistic regression (liblinear solver)
logreg = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)

# SVM with linear kernel
linear_svc = LinearSVC(random_state=RANDOM_STATE)

# k-NN with 50 neighbors (no random component, hence no seed)
knn = KNeighborsClassifier(n_neighbors=50)

# Decision tree, depth capped at 10
dt = DecisionTreeClassifier(max_depth=10, random_state=RANDOM_STATE)

# Random forest: 100 trees, unlimited depth
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=RANDOM_STATE,
)

# SVM with RBF kernel
rbf_svc = SVC(kernel='rbf', C=10, gamma=1, random_state=RANDOM_STATE)



## Draw a cluster found using k-means

The list iterated by the `for` loop depends on the number of clusters (centroids) you'd like to draw.

In [None]:
# Imported here so the cell also works on a fresh kernel: the notebook's only
# other matplotlib import lives in a later cell.
import matplotlib.pyplot as plt

# Cluster labels to draw - adapt to the number of centroids of the fitted model
# NOTE(review): `kmeans` (fitted) and `X` (2-D array with at least 4 columns)
# must already exist - confirm against the calling notebook
clusters_to_draw = [0, 1, 2]

# Plot clusters
for cluster in clusters_to_draw:
    # Boolean mask selecting the points assigned to this cluster
    idx = (kmeans.labels_ == cluster)

    # Plot points
    plt.scatter(
        X[idx, 2], # Third column: petal length
        X[idx, 3], # Fourth column: petal width
        label='cluster {}'.format(cluster)
    )

    # Plot centroid, using the same two columns as the points
    centroid = kmeans.cluster_centers_[cluster]
    plt.plot(centroid[2], centroid[3], marker='*', color='black', markersize=18)

# Without a legend the `label=` passed to scatter() is never displayed
plt.legend()
plt.show()

## Draw a pairplot

This kind of diagram is very useful when we'd like to find correlations between several features of a dataset.

The following example comes from *06. Clustering and dimensionality reduction / 05. Dimension reduction with PCA*, and is based on a wine dataset.

The *hue* parameter is used to define the color of the plots based on the value in that column

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

# Create pairplot
sns.pairplot(
    data_df,
    # Variables on the x-axes
    x_vars=['alcohol', 'phenols', 'color'],
    # Variables on the y-axes
    y_vars=['alcohol', 'phenols', 'color'],
    # Use a different color for each kind of wine
    hue='kind'
)
plt.show()