Modeling Preference Learning using Bayesian Inference

Developer: Tanmoy Das <br>
Date: June 13, 2022

Outline: 
- Data processing <br>
- Modeling <br>
- Model Assessment and Validation <br>
- Model Selection

Content for the manuscript:
- Figure 6: Visualizing the performance of the full model; the multiclass ROC curves for MCR, CDU and ISB are shown in (a), (b) and (c), respectively.

# Data  

In [None]:
# Import required Python libraries
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

# Modeling
from itertools import cycle

from sklearn.naive_bayes import GaussianNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier 

# Metric
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix, label_ranking_average_precision_score
from sklearn import metrics

# Visualization
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc


# Saving model
import joblib

In [2]:
# Load the engineered modeling input; the 'Scene no.' column becomes the row index.
# (read_excel already returns a fresh DataFrame, so no extra .copy() is needed.)
data_engineered_PLeR_modeling = pd.read_excel(
    "Inputs/data_engineered_PLeR_Modeling input.xlsx",
    index_col='Scene no.',
)

# Rename the three *_DT_output columns to the target names used by every later cell.
data_engineered_PLeR_modeling = data_engineered_PLeR_modeling.rename(
    columns={
        'mcr_DT_output': 'MCR options',
        'cdu_DT_output': 'CDU options',
        'isb_DT_output': 'ISB options',
    }
)

display(data_engineered_PLeR_modeling)
# DataFrame.info() prints its summary itself and returns None; wrapping it in
# display() only appended a stray "None" to the cell output.
data_engineered_PLeR_modeling.info()
display(data_engineered_PLeR_modeling.columns)

FileNotFoundError: [Errno 2] No such file or directory: 'Inputs/data_engineered_PLeR_Modeling input.xlsx'

In [None]:
# Split the table into predictors (X) and the three option columns to be predicted (y).
target_columns = ['MCR options', 'CDU options', 'ISB options']
y = data_engineered_PLeR_modeling[target_columns]
X = data_engineered_PLeR_modeling.drop(columns=target_columns)

In [None]:
# Persist the feature and target tables as Excel workbooks.
for frame, workbook_path in ((X, 'Inputs/X.xlsx'), (y, 'Inputs/y.xlsx')):
    frame.to_excel(workbook_path)

## Model for classifying MCR

### Data (Training and Test)

In [None]:
# ROC curves and FPR/TPR-style metrics for a multiclass target are computed one
# class at a time, so the single MCR label column is expanded into one 0/1
# indicator column per class (the same reshaping that one-hot encoding does).
# Columns follow the order given in `classes`; for example, with
# classes=[10, 8, 2, -2] the labels [10, 2, 8] become
# [[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0]].
# (A binary target would not need this step.)
mcr_class_values = [10, 8, 2, -2]
y_m_b = label_binarize(y['MCR options'], classes=mcr_class_values)
n_classes = y_m_b.shape[1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_m_b,
    test_size=0.20,
    random_state=12,
)


### Model Training

In [None]:
# Gaussian Naive Bayes wrapped in one-vs-rest, fitted on the binarized MCR target.
model_GB_ins = OneVsRestClassifier(estimator=GaussianNB())
model_GB = model_GB_ins.fit(X_train, y_train)

# Alternative that was considered here: wrapping the model in MultiOutputClassifier.
# https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html

# Continuous scores for the test rows. ROC curves should be built from these
# (predict_proba or decision_function output), not from hard predictions:
# https://stackoverflow.com/questions/33208897/how-to-interpret-this-triangular-shape-roc-auc-curve/33218642#33218642
y_score = model_GB.predict_proba(X_test)

# Hard 0/1 predictions for the test rows.
y_pred = model_GB.predict(X_test)

### Model Assessment

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC has to be computed from continuous scores. The macro value used to be
# computed from the hard predictions (y_pred), which collapses each ROC curve to
# a single operating point and is inconsistent with the weighted value below.
# NOTE(review): y_test is a binarized indicator matrix here; scikit-learn
# documents `multi_class` as only used for multiclass targets, so "ovo" is
# presumably ignored for this input. The argument and the variable names are
# kept unchanged so later cells keep working.
macro_roc_auc_ovo = roc_auc_score(
    y_test, y_score, multi_class="ovo", average="macro"
)
weighted_roc_auc_ovo = roc_auc_score(
    y_test, y_score, multi_class="ovo", average="weighted"
)

# A notebook cell only echoes its last bare expression, so print both values.
print("macro ROC AUC:", macro_roc_auc_ovo)
print("weighted ROC AUC:", weighted_roc_auc_ovo)

#### ROC with class name

In [None]:
# Rebuild the binarized MCR target and the 80/20 split inside this cell.
y_m_b = label_binarize(y['MCR options'], classes=[10, 8, 2, -2])
n_classes = y_m_b.shape[1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y_m_b, test_size=0.20, random_state=12
)

# MCR: one-vs-rest Gaussian Naive Bayes, with scores and hard predictions on the test rows.
classifier = OneVsRestClassifier(GaussianNB())
classifier.fit(X_train, y_train)
y_score = classifier.predict_proba(X_test)
y_pred = classifier.predict(X_test)

fig2 = plt.figure()

# One ROC curve (and its area) per indicator column.
fpr, tpr, roc_auc = {}, {}, {}
for k in range(n_classes):
    fpr[k], tpr[k], _ = roc_curve(y_test[:, k], y_score[:, k])
    roc_auc[k] = auc(fpr[k], tpr[k])

# Per-class colour, legend name and line width, in the same order as `classes`.
color_c = ['green', 'lightgreen', 'blue', 'lightgray']
class_c = ['OK', 'Consider', 'Go next season', 'Unknown']
lw = [3, 3, 1, 3]

for k, curve_color, curve_name, curve_width in zip(range(n_classes), color_c, class_c, lw):
    plt.plot(fpr[k], tpr[k], color=curve_color, lw=curve_width, label=curve_name)

# Diagonal reference line.
plt.plot([0, 1], [0, 1], linestyle='--', color='lightgray', lw=1)
plt.xlim(-0.05, 1.0)
plt.ylim(0.0, 1.05)
# The x-axis label is left disabled in this panel: plt.xlabel('False Positive Rate')
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")

plt.show()
fig2.savefig('Outputs/ROC curve mcr, full, name.png', dpi=600)

####  ROC curve with AUC values

In [None]:
# Per-class ROC curve and area under it, keyed by indicator-column index.
fpr, tpr, roc_auc = {}, {}, {}
for k in range(n_classes):
    class_fpr, class_tpr, _ = roc_curve(y_test[:, k], y_score[:, k])
    fpr[k] = class_fpr
    tpr[k] = class_tpr
    roc_auc[k] = auc(class_fpr, class_tpr)

# Micro-average: flatten all (row, class) entries into one binary problem.
micro_fpr, micro_tpr, _ = roc_curve(y_test.ravel(), y_score.ravel())
fpr["micro"] = micro_fpr
tpr["micro"] = micro_tpr
roc_auc["micro"] = auc(micro_fpr, micro_tpr)

In [None]:
# Macro-average ROC: evaluate every per-class curve on the union of all FPR
# points, then average the interpolated TPR values over the classes.
class_ids = range(n_classes)
all_fpr = np.unique(np.concatenate([fpr[k] for k in class_ids]))

mean_tpr = np.zeros_like(all_fpr)
for k in class_ids:
    mean_tpr += np.interp(all_fpr, fpr[k], tpr[k])
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Draw the two averaged curves first, then one curve per class.
fig = plt.figure()
for avg_key, label_template, avg_color in (
    ("micro", "micro-avg (area = {0:0.2f})", "black"),
    ("macro", "macro-avg (area = {0:0.2f})", "navy"),
):
    plt.plot(
        fpr[avg_key],
        tpr[avg_key],
        label=label_template.format(roc_auc[avg_key]),
        color=avg_color,
        linestyle=":",
        linewidth=2,
    )

colors = cycle(['limegreen', 'lightgreen', 'gray', 'lightgray'])
for k, curve_color in zip(range(n_classes), colors):
    curve_label = "ROC curve of class {0} (area = {1:0.2f})".format(k, roc_auc[k])
    plt.plot(fpr[k], tpr[k], color=curve_color, lw=1, label=curve_label)

# Diagonal reference line, axes and legend.
plt.plot([0, 1], [0, 1], linestyle="--", color='lightgray')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("MCR")
plt.legend(loc="lower right")
plt.show()

#### Confusion matrix

In [None]:
# Multiclass confusion matrix for MCR (rows = true class, columns = predicted class).
# Class names are listed in the same order as the `classes` passed to label_binarize.
mcr_class_names = ['OK', 'Consider', 'Go next season', 'Unknown']

# The predicted class is the column with the highest score. Taking argmax of the
# hard 0/1 predictions instead is unsafe: the one-vs-rest model decides each
# column independently, so a row can be all zeros (argmax would then silently
# report class 0, 'OK') or contain several ones.
true_class = y_test.argmax(axis=1)
predicted_class = y_score.argmax(axis=1)

# Passing `labels` keeps the matrix 4x4 even when a class is missing from the test
# split, so it always matches the row/column names below.
cm = confusion_matrix(
    true_class,
    predicted_class,
    labels=list(range(len(mcr_class_names))),
)
cm_df = pd.DataFrame(cm, index=mcr_class_names, columns=mcr_class_names)
cm_df

#### LRAP

In [None]:
# Label ranking average precision of the MCR scores against the binarized test labels.
lrap_value = label_ranking_average_precision_score(y_test, y_score)
lrap_value

## Model for classifying CDU

In [None]:
# How many rows fall into each CDU option.
cdu_option_counts = y['CDU options'].value_counts()
display(cdu_option_counts)

### Data, Modeling and Assessment

In [None]:
# Binarize the CDU target (one indicator column per class) and make the 80/20 split.
y_c_b = label_binarize(y['CDU options'], classes=[10, 8, -2, -10])
n_classes = y_c_b.shape[1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y_c_b, test_size=0.20, random_state=12
)

# CDU: one-vs-rest Gaussian Naive Bayes, with scores and hard predictions on the test rows.
classifier = OneVsRestClassifier(GaussianNB())
classifier.fit(X_train, y_train)
y_score = classifier.predict_proba(X_test)
y_pred = classifier.predict(X_test)

fig2 = plt.figure()

# One ROC curve (and its area) per indicator column.
fpr, tpr, roc_auc = {}, {}, {}
for k in range(n_classes):
    fpr[k], tpr[k], _ = roc_curve(y_test[:, k], y_score[:, k])
    roc_auc[k] = auc(fpr[k], tpr[k])

# Per-class colour, legend name and line width, in the same order as `classes`.
color_c = ['green', 'blue', 'red', 'lightgray']
class_c = ['OK', 'Consider', 'Not recommended', 'Unknown']
lw = [5, 2, 2, 2]

for k, curve_color, curve_name, curve_width in zip(range(n_classes), color_c, class_c, lw):
    plt.plot(fpr[k], tpr[k], color=curve_color, lw=curve_width, label=curve_name)

# Diagonal reference line.
plt.plot([0, 1], [0, 1], linestyle='--', color='lightgray', lw=1)
plt.xlim(-0.05, 1.0)
plt.ylim(0.0, 1.05)
# The x-axis label is left disabled in this panel: plt.xlabel('False Positive Rate')
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")

plt.show()
fig2.savefig('Outputs/ROC curve cdu, full, name.png', dpi=600)

In [None]:
# Per-class ROC curve and area under it, keyed by indicator-column index.
fpr, tpr, roc_auc = {}, {}, {}
for k in range(n_classes):
    class_fpr, class_tpr, _ = roc_curve(y_test[:, k], y_score[:, k])
    fpr[k] = class_fpr
    tpr[k] = class_tpr
    roc_auc[k] = auc(class_fpr, class_tpr)

# Micro-average: flatten all (row, class) entries into one binary problem.
micro_fpr, micro_tpr, _ = roc_curve(y_test.ravel(), y_score.ravel())
fpr["micro"] = micro_fpr
tpr["micro"] = micro_tpr
roc_auc["micro"] = auc(micro_fpr, micro_tpr)

# Macro-average: interpolate every per-class curve onto the union of all FPR
# points and average the TPR values over the classes.
class_ids = range(n_classes)
all_fpr = np.unique(np.concatenate([fpr[k] for k in class_ids]))

mean_tpr = np.zeros_like(all_fpr)
for k in class_ids:
    mean_tpr += np.interp(all_fpr, fpr[k], tpr[k])
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Draw the two averaged curves first, then one curve per class.
fig3 = plt.figure()
for avg_key, label_template, avg_color in (
    ("micro", "micro-avg (area = {0:0.2f})", "black"),
    ("macro", "macro-avg (area = {0:0.2f})", "navy"),
):
    plt.plot(
        fpr[avg_key],
        tpr[avg_key],
        label=label_template.format(roc_auc[avg_key]),
        color=avg_color,
        linestyle=":",
        linewidth=2,
    )

colors = cycle(['limegreen', 'lightgreen', 'gray', 'lightgray'])
for k, curve_color in zip(range(n_classes), colors):
    curve_label = "ROC curve of class {0} (area = {1:0.2f})".format(k, roc_auc[k])
    plt.plot(fpr[k], tpr[k], color=curve_color, lw=3, label=curve_label)

# Diagonal reference line, axes and legend.
plt.plot([0, 1], [0, 1], linestyle="--", color='lightgray')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
# The x-axis label is left disabled in this panel: plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("CDU")
plt.legend(loc="lower right")
plt.show()

In [None]:
# One 2x2 confusion matrix per CDU indicator column, from the hard predictions.
cm_c = multilabel_confusion_matrix(y_true=y_test, y_pred=y_pred)
cm_c

In [None]:
# Multiclass confusion matrix for CDU (rows = true class, columns = predicted class).
# Class names are listed in the same order as the `classes` passed to
# label_binarize: [10, 8, -2, -10].
cdu_class_names = ['OK', 'Consider', 'Not recommended', 'Unknown']

# The predicted class is the column with the highest score. Taking argmax of the
# hard 0/1 predictions instead is unsafe: the one-vs-rest model decides each
# column independently, so a row can be all zeros (argmax would then silently
# report class 0, 'OK') or contain several ones.
true_class = y_test.argmax(axis=1)
predicted_class = y_score.argmax(axis=1)

# Previously only three names were supplied and no `labels` were passed, which
# works only while 'Consider' is absent from both the test labels and the
# predictions; otherwise the DataFrame construction fails on a shape mismatch.
# Fixing the label set keeps the matrix 4x4 and every row/column correctly named.
cm_c = confusion_matrix(
    true_class,
    predicted_class,
    labels=list(range(len(cdu_class_names))),
)
display(cm_c)

cm_df = pd.DataFrame(cm_c, index=cdu_class_names, columns=cdu_class_names)
cm_df


In [None]:
# Label ranking average precision of the CDU scores against the binarized test labels.
lrap_value = label_ranking_average_precision_score(y_test, y_score)
lrap_value

## Model for classifying ISB

In [None]:
# Inspect the ISB target: class frequencies first, then the raw column.
isb_options = y['ISB options']
display(isb_options.value_counts())
display(isb_options)


In [None]:
# Binarize the ISB target (one indicator column per class) and make the 80/20 split.
y_i_b = label_binarize(y['ISB options'], classes=[10, 8, -2])
n_classes = y_i_b.shape[1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y_i_b, test_size=0.20, random_state=12
)

# ISB: one-vs-rest Gaussian Naive Bayes, with scores and hard predictions on the test rows.
classifier = OneVsRestClassifier(GaussianNB())
classifier.fit(X_train, y_train)
y_score = classifier.predict_proba(X_test)
y_pred = classifier.predict(X_test)

fig4 = plt.figure()

# One ROC curve (and its area) per indicator column.
fpr, tpr, roc_auc = {}, {}, {}
for k in range(n_classes):
    fpr[k], tpr[k], _ = roc_curve(y_test[:, k], y_score[:, k])
    roc_auc[k] = auc(fpr[k], tpr[k])

# Per-class colour, legend name and line style, in the same order as `classes`.
color_i = ['green', 'limegreen', 'darkgray']
class_i = ['OK', 'Consider', 'Unknown']
linestyle = ['solid', 'solid', '--']

for k, curve_color, curve_name, curve_style in zip(range(n_classes), color_i, class_i, linestyle):
    plt.plot(fpr[k], tpr[k], color=curve_color, lw=3,
             label=curve_name, linestyle=curve_style)

# Diagonal reference line.
plt.plot([0, 1], [0, 1], linestyle='--', lw=1, color='lightgray')
plt.xlim(-0.05, 1.0)
plt.ylim(0.0, 1.05)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc="lower right")

plt.show()
fig4.savefig('Outputs/ROC curve isb, full, name.png', dpi=600)

In [None]:
# Per-class ROC curve and area under it, keyed by indicator-column index.
fpr, tpr, roc_auc = {}, {}, {}
for k in range(n_classes):
    class_fpr, class_tpr, _ = roc_curve(y_test[:, k], y_score[:, k])
    fpr[k] = class_fpr
    tpr[k] = class_tpr
    roc_auc[k] = auc(class_fpr, class_tpr)

# Micro-average: flatten all (row, class) entries into one binary problem.
micro_fpr, micro_tpr, _ = roc_curve(y_test.ravel(), y_score.ravel())
fpr["micro"] = micro_fpr
tpr["micro"] = micro_tpr
roc_auc["micro"] = auc(micro_fpr, micro_tpr)

# Macro-average: interpolate every per-class curve onto the union of all FPR
# points and average the TPR values over the classes.
class_ids = range(n_classes)
all_fpr = np.unique(np.concatenate([fpr[k] for k in class_ids]))

mean_tpr = np.zeros_like(all_fpr)
for k in class_ids:
    mean_tpr += np.interp(all_fpr, fpr[k], tpr[k])
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Only the per-class curves are drawn in this figure; the micro/macro average
# curves are switched off by keeping their plotting code inside a string literal.
fig5 = plt.figure()
"""
plt.plot(
    fpr["micro"],
    tpr["micro"],
    label="micro-avg (area = {0:0.2f})".format(roc_auc["micro"]),
    color="darkgray",
    linestyle=":",
    linewidth=2,
)

plt.plot(
    fpr["macro"],
    tpr["macro"],
    label="macro-avg (area = {0:0.2f})".format(roc_auc["macro"]),
    color="navy",
    linestyle=":",
    linewidth=2,
)
"""
colors = cycle(['limegreen', 'blue', 'gray'])
for k, curve_color in zip(range(n_classes), colors):
    curve_label = "class {0} (area = {1:0.2f})".format(k, roc_auc[k])
    plt.plot(fpr[k], tpr[k], color=curve_color, lw=3, label=curve_label)

# Diagonal reference line, axes and legend.
plt.plot([0, 1], [0, 1], linestyle="--", color='lightgray')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
# The x-axis label is left disabled in this panel: plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.show()

In [None]:
# Multiclass confusion matrix for ISB (rows = true class, columns = predicted class).
# Class names are listed in the same order as the `classes` passed to
# label_binarize: [10, 8, -2].
isb_class_names = ['OK', 'Consider', 'Unknown']

# The predicted class is the column with the highest score. Taking argmax of the
# hard 0/1 predictions instead is unsafe: the one-vs-rest model decides each
# column independently, so a row can be all zeros (argmax would then silently
# report class 0, 'OK') or contain several ones.
true_class = y_test.argmax(axis=1)
predicted_class = y_score.argmax(axis=1)

# Passing `labels` keeps the matrix 3x3 even when a class is missing from the test
# split, so it always matches the row/column names below.
cm = confusion_matrix(
    true_class,
    predicted_class,
    labels=list(range(len(isb_class_names))),
)
cm_df = pd.DataFrame(cm, index=isb_class_names, columns=isb_class_names)
cm_df

In [None]:
# One 2x2 confusion matrix per ISB indicator column, from the hard predictions.
cm_c = multilabel_confusion_matrix(y_true=y_test, y_pred=y_pred)
cm_c

## Miscellaneous

In [None]:
# Parameters of the most recently fitted one-vs-rest wrapper.
ovr_params = classifier.get_params()
ovr_params

In [None]:
# Parameters of the estimator wrapped by the one-vs-rest classifier.
base_estimator = classifier.estimator
base_estimator.get_params()

## Multioutput multiclass classification

In [None]:

# Rebuild features (X) and the three raw (non-binarized) targets (y) from the full table.
target_columns = ['MCR options', 'CDU options', 'ISB options']
X = data_engineered_PLeR_modeling.drop(columns=target_columns)
y = data_engineered_PLeR_modeling[target_columns]

# Remove the rows whose CDU option is 8 ('Consider'), a class with only a handful
# of records. The same boolean mask is applied to X and y so they always stay
# aligned. Previously X was trimmed with a hard-coded list of scene labels
# ('Scene 26', 'Scene 1247', 'Scene 1380', 'Scene 1655', 'Scene 2109'), which
# silently goes out of sync with y whenever the input data changes.
keep_rows = y['CDU options'] != 8
X = X.loc[keep_rows]
y = y.loc[keep_rows]

# Data split: 20% of the rows held out for testing, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=12)


In [None]:
# CDU class frequencies in the training part and then in the test part of the split.
for target_part in (y_train, y_test):
    display(target_part['CDU options'].value_counts())

In [None]:
# Save training and testing data
X_train.to_excel('Inputs/X_train.xlsx')
X_test.to_excel('Inputs/X_test.xlsx')
y_train.to_excel('Inputs/y_train.xlsx')
y_test.to_excel('Inputs/y_test.xlsx')

In [None]:
# modeling
model_GB_ins =  GaussianNB()

model_gnb_multioutput = MultiOutputClassifier(model_GB_ins).fit(X_train, y_train)
# https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html

#Predict
y_pred = model_gnb_multioutput.predict(X_test)
y_score = model_gnb_multioutput.predict_proba(X_test)

### Predicting for single cases

In [None]:
# Inspect the model's prediction for one training row.
# The training-set predictions get their own name: assigning them to `y_pred`
# overwrote the test-set predictions that the confusion-matrix and
# classification-report cells further down compare against y_test.
y_pred_train = model_gnb_multioutput.predict(X_train)
i = 2336  # positional index of the training row to inspect

print(y_pred_train[i])      # predicted MCR / CDU / ISB options
print(y_train.iloc[i, :])   # true options for the same row
print(X_train.iloc[i, :])   # feature values of that row
# Earlier note kept from the author: Table 4(a), predicting for i = 3 -> [10, -10, 10]
# NOTE(review): that note refers to i = 3 while this cell uses i = 2336 — confirm which row the table reports.

In [None]:
# Show the training feature table for inspection.
X_train

In [None]:
# Per-target evaluation: column 0 / 1 / 2 of y_test and y_pred are compared
# under the headings MCR / CDU / ISB.
output_names = ('MCR', 'CDU', 'ISB')

print('----------------------------Confusion Matrix--------------')
for column_index, output_name in enumerate(output_names):
    print(output_name)
    cm_c = metrics.confusion_matrix(y_test.iloc[:, column_index], y_pred[:, column_index])
    display(cm_c)

print('----------------------------Classification Report--------------')
for column_index, output_name in enumerate(output_names):
    print(output_name)
    report_text = metrics.classification_report(
        y_test.iloc[:, column_index], y_pred[:, column_index]
    )
    print(report_text)

In [None]:
# One-vs-one ROC AUC per target: y_score[k] is scored against column k of y_test.
print('----------------------------ROC AUC--------------')
for column_index, output_name in enumerate(('MCR', 'CDU', 'ISB')):
    print(output_name)
    auc_value = metrics.roc_auc_score(
        y_test.iloc[:, column_index],
        y_score[column_index],
        multi_class='ovo',
    )
    print(auc_value)

# Debugging aid for length mismatches: compare len(y_score[1]) with len(y_test.iloc[:, 1]).

## Saving data & model

In [None]:
# Persist the fitted multi-output model to disk with joblib.
# (An in-memory pickle string via pickle.dumps was an earlier idea and is not used.)
model_file = 'Inputs/full_model_gnb_multioutput.pkl'
joblib.dump(model_gnb_multioutput, model_file)

In [None]:
# Information from this Python Notebook will be used to build the Reduced Model