### Feature Analysis: Interpretability of ML models
March 2022

**Import Libraries**

In [None]:
import eli5
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import string
import sys
import os
import warnings


from lightgbm.sklearn import LGBMClassifier
#from plotting_functions import *
from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import (

    cross_val_score,
    cross_validate,
    train_test_split,
)

from sklearn.preprocessing import (
    StandardScaler,
)
from utils import *
from xgboost import XGBClassifier

%matplotlib inline
warnings.filterwarnings('ignore')


In [None]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Summarise cross validation results as "mean (+/- std)" strings

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        passed through unchanged to ``cross_validate``
        (e.g. ``scoring``, ``return_train_score``)

    Returns
    ----------
        pandas Series of strings, one entry per key returned by
        ``cross_validate``, each formatted as "mean (+/- std)"
    """

    # Build the per-fold table once and derive both statistics from it.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))
    mean_scores = scores.mean()
    std_scores = scores.std()

    # Both Series are indexed by metric name, so pair their values by
    # iteration order instead of looking them up with integer keys
    # (integer keys on a label index rely on a deprecated pandas fallback).
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean_value, std_value)
        for mean_value, std_value in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

**Import and inspect Data**

In [None]:
# Display the kernel's current working directory; the data file path used
# when loading the workbook is built relative to it.
path = os.getcwd()
path

In [None]:
# Locate the workbook relative to the current working directory, then read
# the single sheet that holds the dataset.
path = os.getcwd()
workbook_path = os.path.join(path, "Feature Analysis/data/pumpkin.xlsx")
data = pd.read_excel(workbook_path, sheet_name="Pumpkin_Seeds_Dataset")

In [None]:
# Preview the first rows of the loaded sheet.
data.head()

In [None]:
# Summary of the columns in the loaded frame.
data.info()

In [None]:
# Lower-case every column label so later cells can use names such as
# "class" and "area".
data = data.rename(str.lower, axis="columns")

In [None]:
# Frequency of each distinct value in the `area` column.
data["area"].value_counts()

**Define column types for column transformation**

In [None]:
target_column = "class"
drop_column = ["area"]

# Every column except the first and the last one
# (presumably `area` and `class` respectively - confirm against data.head()).
numeric_columns = data.columns[1 : len(data.columns) - 1]

In [None]:
# Single-step scaling pipeline; the step name matches what make_pipeline
# would generate for a StandardScaler.
numeric_transformer = Pipeline(steps=[("standardscaler", StandardScaler())])

# (transformer, columns) pairs: drop `drop_column`, scale `numeric_columns`.
column_specs = [
    ("drop", drop_column),
    (numeric_transformer, numeric_columns),
]
preprocessor = make_column_transformer(*column_specs)

**Split Data**

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
train, test = train_test_split(data, random_state=2022, test_size=0.2)

# Separate the features from the target for each partition.
X_train, y_train = train.drop(columns=[target_column]), train[target_column]
X_test, y_test = test.drop(columns=[target_column]), test[target_column]

**Check for class Balance**

In [None]:
# Bar chart of how many training rows fall in each seed class.
class_counts = train["class"].value_counts()
class_counts.plot.bar(
    title="Count plot: CLass Balance",
    xlabel="Seed Class",
    ylabel="Count",
);

In [None]:
# not a perfect balance, but we can work with this

**Initialize Models**

In [None]:
scoring_metric = "accuracy"
results = {}

# One pipeline per estimator; each one starts with the `preprocessor` object.
pipe_lr = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=2000, random_state=2022),
)
pipe_rf = make_pipeline(
    preprocessor,
    RandomForestClassifier(random_state=2022),
)
pipe_xgb = make_pipeline(
    preprocessor,
    XGBClassifier(random_state=2022),
)
pipe_lgbm = make_pipeline(
    preprocessor,
    LGBMClassifier(random_state=2022),
)

# Display name -> pipeline, in the order the models are evaluated.
classifier_names = ["logistic regression", "random forest", "XGBoost", "LightGBM"]
classifier_pipes = [pipe_lr, pipe_rf, pipe_xgb, pipe_lgbm]
classifiers = dict(zip(classifier_names, classifier_pipes))

In [None]:
# Cross-validate every pipeline with the same options and collect the
# formatted "mean (+/- std)" summaries under the model's display name.
cv_options = dict(return_train_score=True, scoring=scoring_metric)
for name in classifiers:
    model = classifiers[name]
    results[name] = mean_std_cross_val_scores(model, X_train, y_train, **cv_options)

In [None]:
# One column per model, one row per cross-validation metric.
pd.DataFrame(results)

**Score Interpretation**

- Random Forest is the best model
- Logistic regression gives a better score than the gradient boosting models.
- XGBoost and LightGBM give good scores without any hyperparameter tuning.

#### Logistic Regression : Feature Importance In Linear Model

In [None]:
pipe_lr.fit(X_train, y_train)

# Flatten the fitted coefficient array into a plain list, one value per
# numeric feature.
lr_coefficients = pipe_lr.named_steps["logisticregression"].coef_.flatten().tolist()

lr_data = {
    "coefficient": lr_coefficients,
    "magnitude": np.absolute(lr_coefficients),
}

# Label rows with the feature names and list the largest coefficients first.
coef_lr = pd.DataFrame(lr_data, index=numeric_columns)
coef_lr = coef_lr.sort_values(by="coefficient", ascending=False)

In [None]:
# Coefficient table, sorted from the largest to the smallest coefficient.
coef_lr

In [None]:
# Class labels in the order used by the fitted pipeline.
pipe_lr.classes_ 

### Interpretation of Coefficients of the Logistic Regression Model
NOTE: For a binary problem, scikit-learn's logistic regression reports its coefficients with respect to the second entry of `classes_`:
- `Çerçevelik` is the class with index 0, i.e. the negative class (a negative coefficient pushes the prediction towards it), and
- `Ürgüp Sivrisi` is the class with index 1, i.e. the positive class (a positive coefficient pushes the prediction towards it).



### Increasing Compactness 

In [None]:
# Take one test row (position 255) and double its compactness.
example_1a = X_test.iloc[255:256].copy()
example_1a['compactness'] = example_1a['compactness'].mul(2)

prediction_1a = pipe_lr.predict(example_1a)[0]
print(f' The prediction when compactness is increased is: {prediction_1a}')
example_1a

### Decreasing Compactness 

In [None]:
# Take one test row (position 255) and halve its compactness.
example_1b = X_test.iloc[255:256].copy()
example_1b['compactness'] = example_1b['compactness'].div(2)

prediction_1b = pipe_lr.predict(example_1b)[0]
print(f' The prediction when compactness is decreased is: {prediction_1b}')
example_1b

### Increasing Convex Area

In [None]:
# Take one test row (position 255) and double its convex area.
example_2a = X_test.iloc[255:256].copy()
example_2a['convex_area'] = example_2a['convex_area'].mul(2)

prediction_2a = pipe_lr.predict(example_2a)[0]
print(f'The prediction when convex area is increased is: {prediction_2a}')
example_2a

### Decreasing Convex Area

In [None]:
# Take one test row (position 255) and halve its convex area.
example_2b = X_test.iloc[255:256].copy()
example_2b['convex_area'] = example_2b['convex_area'].div(2)

prediction_2b = pipe_lr.predict(example_2b)[0]
print(f' The prediction when convex area is decreased is: {prediction_2b}')
example_2b

The coefficients of the model are interpreted as follows:

- Increasing the `compactness` is likely to push the prediction towards `Çerçevelik`
- Increasing the `convex area` is likely to push the prediction towards `Ürgüp Sivrisi`
- Decreasing the `compactness` is likely to push the prediction towards `Ürgüp Sivrisi`
- Decreasing the `convex area` is likely to push the prediction towards `Çerçevelik`

### Random Forest: Feature Importance in Decision Tree Model

In [None]:
pipe_rf.fit(X_train, y_train)

# Use a dedicated name for the importances so the `data` DataFrame holding
# the dataset is not overwritten; earlier cells that read `data` then stay
# re-runnable after this cell has executed.
rf_importance = {
    "Importance": pipe_rf.named_steps["randomforestclassifier"].feature_importances_,
}

# Ten features with the highest importance, labelled by feature name.
pd.DataFrame(data=rf_importance, index=numeric_columns).sort_values(
    by="Importance", ascending=False
).head(10)


In [None]:
# feature importance

# Explain the fitted forest's weights, labelling them with the feature names.
rf_model = pipe_rf.named_steps["randomforestclassifier"]
eli5.explain_weights(rf_model, feature_names=list(numeric_columns))


 - The most important feature for prediction is the aspect ratio. To see how it affects the predictions, we will use the same test example.

### Decreasing Aspect Ratio

In [None]:

example_rnd_dcr = X_test[255:256].copy()
# Write the halved value back to the existing column; assigning it to a
# different key would add a new column and leave `aspect_ration` unchanged.
example_rnd_dcr['aspect_ration'] = example_rnd_dcr['aspect_ration'] / 2

# Predict with the random forest pipeline, since this section examines the
# feature importances of that model.
print(f'The prediction when aspect_ration is decreased is: {pipe_rf.predict(example_rnd_dcr)[0]}')

example_rnd_dcr


In [None]:


example_rnd_incr = X_test[255:256].copy()
example_rnd_incr['aspect_ration'] = example_rnd_incr['aspect_ration'] * 2

# Predict with the random forest pipeline, since this section examines the
# feature importances of that model.
print(f'The prediction when aspect_ration is increased is: {pipe_rf.predict(example_rnd_incr)[0]}')
example_rnd_incr



From the prediction examples above, we can see that increasing or decreasing `aspect_ratio` did not influence the predicted class.