In [None]:
import pickle
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from mycolorpy import colorlist as mcp
import pandas as pd
from sklearn import model_selection
import numpy as np

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Saved evaluation results, one pickle per model
mlp_results = _load_pickle('mlp_results.pkl')
rf_results = _load_pickle('rf_results.pkl')
lr_results = _load_pickle('lr_results.pkl')
gn_results = _load_pickle('gn_results.pkl')

In [None]:
# Plot the ROC curve of every model on one figure.
# `colors` stays a module-level name because the bar-plot cell below reuses it.
colors = mcp.gen_color(cmap = 'Set3', n = 4)
roc_entries = [
    ("Logistic Regression", lr_results),
    ("Gaussian Naive Bayes", gn_results),
    ("Random Forest Classifier", rf_results),
    ("Multi-layer Perceptron", mlp_results),
]

plt.figure(figsize = (10, 10))
for (model_name, results), curve_color in zip(roc_entries, colors):
    plt.plot(
        results['fpr'],
        results['tpr'],
        label = model_name + ", AUC = " + str(round(results['roc_auc'], 3)),
        color = curve_color,
        linewidth = 3,
    )
# Label the axes and title the figure so it can be read on its own.
plt.xlabel('False positive rate', fontsize = 15)
plt.ylabel('True positive rate', fontsize = 15)
plt.title('ROC curves', fontsize = 15)
plt.legend(loc = 'lower right', fontsize = 15)
# Render explicitly so the cell does not end on the Legend object's repr.
plt.show()

In [None]:
# Side-by-side bar charts comparing accuracy and F1 score across the four models
fig, ax = plt.subplots(1, 2, figsize = (20, 10))
x = ['Logistic Regression', 'Gaussian Naive Bayes', 'Random Forest Classifier', 'Multi-layer Perceptron']
model_results = [lr_results, gn_results, rf_results, mlp_results]
acc = [res['acc'] for res in model_results]
# Only element 0 of each stored 'f1' entry is plotted.
f1 = [res['f1'][0] for res in model_results]

for panel, scores, metric_label in zip(ax, (acc, f1), ('Accuracy', 'F1 score')):
    panel.bar(x, scores, color = colors)
    panel.xaxis.set_tick_params(labelsize = 10)
    # Write each rounded value just above its bar.
    for position, score in enumerate(scores):
        panel.text(position, score + 0.005, round(score, 3), ha = 'center', fontsize = 13)
    panel.set_ylim(0, 1)
    panel.set_xlabel('Model', fontsize = 15)
    panel.set_ylabel(metric_label, fontsize = 15)

plt.show()

In [None]:
# Read the full sequence table; the CSV's first column becomes the row index.
sequences_csv = 'total_sequences.csv'
total_df = pd.read_csv(sequences_csv, index_col=0)

In [None]:
# Hold out 30% of the rows as the test set. The split is stratified on the
# 'label' column and uses a fixed seed so it is reproducible.
total_train, total_test = model_selection.train_test_split(
    total_df,
    test_size=0.3,
    stratify=total_df['label'],
    random_state=1,
)

In [None]:
# Separate features from targets: the last column is taken as the label and
# every column before it as a feature.
# NOTE(review): the train/test split stratifies on total_df['label'], so this
# presumably relies on 'label' being the last column — confirm.
X_train = total_train.iloc[:, :-1]
y_train = total_train.iloc[:, -1]
X_test = total_test.iloc[:, :-1]
y_test = total_test.iloc[:, -1]

In [None]:
# Sanity-check the train/test split.
# This cell previously repeated the feature/label assignment of the cell
# directly above verbatim; re-running it changed nothing, so it now checks
# the invariants of the split instead.
assert len(total_train) + len(total_test) == len(total_df), "train/test rows do not add up to the full table"
assert len(X_train) == len(y_train), "training features and labels differ in length"
assert len(X_test) == len(y_test), "test features and labels differ in length"
assert list(X_train.columns) == list(X_test.columns), "train and test feature columns differ"

In [None]:
# Refit a random forest on the training split to obtain its impurity-based
# feature importances.
# Hyper-parameters are taken from entry 44 of the saved search results.
# NOTE(review): presumably the selected/best candidate — confirm the index.
rf_params = rf_results['cv_results']['params'][44]
rf = RandomForestClassifier(random_state=1, n_jobs=5, **rf_params)
rf.fit(X_train, y_train)
importances = rf.feature_importances_
# Standard deviation of each feature's importance across the forest's trees
per_tree_importances = np.array([tree.feature_importances_ for tree in rf.estimators_])
std = per_tree_importances.std(axis=0)

In [None]:
# Bar chart of the forest's feature importances, with the per-tree standard
# deviation drawn as error bars
feature_names = range(-10, 11)   # integer labels -10 .. 10, one per feature
forest_importances = pd.Series(importances, index=feature_names)
fig, ax = plt.subplots(figsize = (10, 7))
forest_importances.plot.bar(yerr=std, ax=ax)

# Annotate every bar with its rounded importance, placed just above the error bar.
for position, (importance, spread) in enumerate(zip(forest_importances.to_numpy(), std)):
    ax.text(position, importance + spread + 0.005, round(importance, 2), ha = 'center', fontsize = 13)

ax.set_title("Feature importances using MDI", fontsize = 15)
ax.set_xlabel("Distance from Crosslinked base", fontsize = 15)
ax.set_ylabel("Mean decrease in impurity", fontsize = 15)
fig.tight_layout()