# Classification

## Load pandas and sklearn

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,balanced_accuracy_score,roc_curve,auc,f1_score,precision_recall_fscore_support,confusion_matrix,plot_roc_curve,plot_confusion_matrix
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

from patsy import dmatrices, dmatrix


## Loading the R magic along with the tidyverse

In [2]:
%load_ext rpy2.ipython
%R R.version.string
%R library(tidyverse)

R[write to console]: ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

R[write to console]: ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.3     ✔ dplyr   1.0.7
✔ tidyr   1.1.3     ✔ stringr 1.4.0
✔ readr   1.4.0     ✔ forcats 0.5.1

R[write to console]: ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()



0,1,2,3,4,5,6
'forcats','stringr','dplyr',...,'datasets','methods','base'




## Training and testing sets

* Load the data
* Define the model matrices
* Split into training and testing

In [3]:
data = pd.read_csv('data/iris_tidy.csv')

def get_formula(response_col, data):
    return ' ~ 0 + ' + ' + '.join(data.columns.difference([response_col]))

formula = get_formula('Species', data)
x = dmatrix(formula, data)

# lb = LabelBinarizer()
le = LabelEncoder()
y = le.fit_transform(data['Species'])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [4]:
# define model
models = [
    OneVsOneClassifier(LogisticRegression()),
    LogisticRegression(multi_class='ovr'),
    LogisticRegression(multi_class='multinomial')
]

def pipe(model):
  return Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', model)
  ])

scaled_models = [pipe(model) for model in models]

def metrics(y_test, y_pred):
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Balanced Accuracy:', balanced_accuracy_score(y_test, y_pred))
    # print('F1 Score:', f1_score(y_test, y_pred, average='weighted'))
    print('Precision/Recall/F1 Score:', precision_recall_fscore_support(y_test, y_pred, average='weighted')[:-1])
    # print('Precision/Recall/F1 per class:')
    # print(np.vstack(precision_recall_fscore_support(y_test, y_pred, average=None)[:-1]))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))

for model in scaled_models:
  print(model)
  model.fit(x_train, y_train)
  y_pred = model.predict(x_test)
  metrics(y_test, y_pred)

Pipeline(steps=[('scaler', StandardScaler()),
                ('clf', OneVsOneClassifier(estimator=LogisticRegression()))])
Accuracy: 1.0
Balanced Accuracy: 1.0
Precision/Recall/F1 Score: (1.0, 1.0, 1.0)
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Pipeline(steps=[('scaler', StandardScaler()),
                ('clf', LogisticRegression(multi_class='ovr'))])
Accuracy: 0.9666666666666667
Balanced Accuracy: 0.9629629629629629
Precision/Recall/F1 Score: (0.9694444444444444, 0.9666666666666667, 0.9664109121909632)
Confusion Matrix:
[[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]
Pipeline(steps=[('scaler', StandardScaler()),
                ('clf', LogisticRegression(multi_class='multinomial'))])
Accuracy: 1.0
Balanced Accuracy: 1.0
Precision/Recall/F1 Score: (1.0, 1.0, 1.0)
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [5]:
# Let's just check the multicolinearity
from statsmodels.stats.outliers_influence import \
    variance_inflation_factor as vif

print([vif(x_train, i) for i in range(x_train.shape[1])])

[183.15310137563065, 58.21959125863561, 265.16537796870114, 95.55847291989042]


In [6]:
from sklearn.model_selection import GridSearchCV, KFold

fold = KFold(n_splits=5, shuffle=True, random_state=42)
grid = {
  'clf__C': np.power(10.0, np.arange(-4, 3)),
  'clf__multi_class': ['multinomial'],
  'clf__solver': ['saga'],
  'clf__penalty': ['l1'],
  'clf__max_iter': [10000]
}

clf = pipe(LogisticRegression())
# clf.fit(x_train, y_train)
gs = GridSearchCV(clf, grid,scoring='f1_weighted',cv=fold, verbose=1)
gs.fit(x_train, y_train)
print ('gs.best_score_:', gs.best_score_)
print ('gs.best_params_:', gs.best_params_)
print('gs.best_estimator_["clf"].coef_:', gs.best_estimator_['clf'].coef_)
y_pred = gs.predict(x_test)
print(metrics(y_test, y_pred))

Fitting 5 folds for each of 7 candidates, totalling 35 fits
gs.best_score_: 0.9504093976771995
gs.best_params_: {'clf__C': 10.0, 'clf__max_iter': 10000, 'clf__multi_class': 'multinomial', 'clf__penalty': 'l1', 'clf__solver': 'saga'}
gs.best_estimator_["clf"].coef_: [[-4.06767085 -3.08136947  0.          1.51433645]
 [ 0.          0.          1.15988466  0.        ]
 [ 9.5200361   8.20253017  0.         -2.1390414 ]]
Accuracy: 1.0
Balanced Accuracy: 1.0
Precision/Recall/F1 Score: (1.0, 1.0, 1.0)
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
None


In [7]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(random_state=0)
forest.fit(x_train, y_train)
y_pred = forest.predict(x_test)
print(metrics(y_test, y_pred))

from sklearn.inspection import permutation_importance
result = permutation_importance(
    forest, x_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
print(result.importances_mean)

Accuracy: 1.0
Balanced Accuracy: 1.0
Precision/Recall/F1 Score: (1.0, 1.0, 1.0)
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
None
[0.17583333 0.45166667 0.01583333 0.01583333]
