# Plots of RF, XGBoost and RNN results

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

In [None]:
# Genome display names; every per-genome score list in this notebook follows this order.
genomes = [
    'M. tuberculosis',
    'S. aureus',
    'S. enterica',
    'L. monocytogenes',
    'C. jejuni',
    'S. flexneri',
    'P. aeruginosa',
    'B. subtilis',
    'C. burnetii',
    'C. trachomatis',
    'E. coli str. Sakai',
    'E. coli str. K-12',
    'C. vibrioides',
    'A. pittii',
    'K. pneumoniae',
]

## Random Forest Results

### Dataset 1 vs Dataset 2 (w/ default RF-model)

In [None]:
# Default random-forest accuracy, one row per genome (row order follows `genomes`).
# Column order is 'Dataset 2' then 'Dataset 1', as in the original dict.
accuracy_default_rf = pd.DataFrame(
    {
        'Dataset 2': [
            0.9417, 0.9939, 0.7705, 0.8000, 0.7688,
            0.9505, 0.7852, 0.9524, 0.9312, 0.9458,
            0.9758, 0.9572, 0.9616, 0.9756, 0.7803,
        ],
        'Dataset 1': [
            0.9587, 0.9939, 0.7584, 0.8039, 0.7396,
            0.9312, 0.7836, 0.9480, 0.9498, 0.9443,
            0.9818, 0.9684, 0.9723, 0.9752, 0.7621,
        ],
    },
    index=genomes,
)

In [None]:
# Line plot: default-RF accuracy per genome, one line per dataset.
fig, ax = plt.subplots(figsize=(15, 5))

# The column name passed with data= also becomes the legend label.
for column, colour in (('Dataset 1', 'orange'), ('Dataset 2', 'red')):
    ax.plot(accuracy_default_rf.index, column, data=accuracy_default_rf,
            color=colour, linewidth=4)

ax.set_xlabel('Genome', fontsize=25)
ax.set_ylabel('Accuracy score', fontsize=25, labelpad=20)
plt.xticks(fontsize=25, rotation=90, style='italic')
plt.yticks(fontsize=25)
ax.legend(loc='lower right', fontsize=20)
ax.grid(linestyle='--', linewidth=0.5)

plt.show()

In [None]:
# Grouped bar chart: default-RF accuracy per genome, Dataset 1 next to Dataset 2.
plt.subplots(figsize=(15, 5))

x_axis = np.arange(len(genomes))
x_ticks = np.arange(len(genomes))  # one tick per genome, centred between the paired bars
y_ticks = [0.75, 0.8, 0.85, 0.9, 0.95, 1.0]

# Read the scores from accuracy_default_rf rather than re-typing them. The
# hand-copied lists held each dataset's values under the other dataset's name,
# so this chart's legend contradicted the line plot drawn from the same frame.
# NOTE(review): assumes the frame's column labels are the correct ones - confirm.
D1 = accuracy_default_rf['Dataset 1'].tolist()
D2 = accuracy_default_rf['Dataset 2'].tolist()

# Bars are 0.4 wide and shifted by +/-0.2 so each pair straddles its tick.
plt.bar(x_axis - 0.2, D1, 0.4, label='Dataset 1', color='orange')
plt.bar(x_axis + 0.2, D2, 0.4, label='Dataset 2', color='red')

plt.xlabel('Genome', fontsize=25)
plt.ylabel('Accuracy score', fontsize=25, labelpad=20)
plt.xticks(x_ticks, genomes, fontsize=25, rotation=40, ha='right', style='italic')
plt.yticks(y_ticks, fontsize=25)
# Legend is anchored outside the axes, to the right of the plot area.
plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left", fontsize=20)
plt.grid(linestyle='--', linewidth=0.5)
plt.ylim([0.70, 1])

plt.show()

In [None]:
# Bar positions, tick values and score arrays for the horizontal bar chart.
x_axis = np.arange(len(genomes))
x_ticks = np.arange(len(genomes))  # one tick per genome
y_ticks = [0.75, 0.8, 0.85, 0.9, 0.95, 1.0]

# Keep both arrays in `genomes` order. Sorting each array on its own (as the
# earlier np.sort calls did) detaches every score from its genome, while the
# plotting cell still labels the bars with `genomes` in the original order.
# The values come from accuracy_default_rf so the dataset names match the frame.
# NOTE(review): assumes the frame's column labels are the correct ones - confirm.
D1 = accuracy_default_rf['Dataset 1'].to_numpy()
D2 = accuracy_default_rf['Dataset 2'].to_numpy()

print(x_axis)

In [None]:
# Horizontal grouped bars of default-RF accuracy, ordered by Dataset 1 accuracy.
# Sorting the whole frame keeps every score attached to its genome and keeps the
# Dataset 1 / Dataset 2 bars of one genome next to each other.
ranked = accuracy_default_rf.sort_values('Dataset 1')
bar_rows = np.arange(len(ranked))
accuracy_ticks = [0.75, 0.8, 0.85, 0.9, 0.95, 1.0]

fig, ax = plt.subplots(figsize=(18.5, 11))
opt1 = ax.barh(y=bar_rows - 0.2, width=ranked['Dataset 1'], height=0.29,
               label='Dataset 1', color='orange')
opt2 = ax.barh(y=bar_rows + 0.2, width=ranked['Dataset 2'], height=0.29,
               label='Dataset 2', color='red')

# For horizontal bars the x-axis carries the score and the y-axis the genomes.
ax.set_xlabel('Accuracy score', fontsize=20)
ax.set_ylabel('Genome', fontsize=20, labelpad=20)
plt.yticks(bar_rows, ranked.index, fontsize=20, style='italic')
plt.xticks(accuracy_ticks, fontsize=20)
plt.legend(fontsize=15)
plt.grid(linestyle='--', linewidth=0.5)
# Upper limit slightly above 1 leaves room for the value labels on the longest bars.
plt.xlim([0.70, 1.02])

# Print each bar's value next to its end.
ax.bar_label(opt1, padding=2, fontsize=13)
ax.bar_label(opt2, padding=2, fontsize=13)

plt.show()

In [None]:
# Scratch example: vertical bars with italic, rotated, right-aligned tick labels.
demo_positions = list(range(1, 11))
demo_heights = [2, 1, 4, 2, 1, 4, 2, 1, 4, 5]
demo_species = [
    'L. sakei', 'G. salaris', 'C. major', 'C. jejuni', 'G. salaris',
    'C. major', 'L. sakei', 'G. salaris', 'C. major', 'L. snipen',
]

plt.bar(demo_positions, demo_heights)
plt.xticks(demo_positions, demo_species, style="italic", rotation=45, ha='right')
plt.show()


In [None]:

# Scratch example: horizontal bars with italic, unrotated tick labels.
demo_rows = list(range(1, 11))
demo_widths = [2, 1, 4, 2, 1, 4, 2, 1, 4, 5]
demo_names = [
    'L. sakei', 'G. salaris', 'C. major', 'C. jejuni', 'G. salaris',
    'C. major', 'L. sakei', 'G. salaris', 'C. major', 'L. snipen',
]

plt.barh(demo_rows, demo_widths)
plt.yticks(demo_rows, demo_names, style="italic", rotation=0)
plt.show()

### Dataset 1 vs Dataset 2 - Accuracy, Precision, Recall (w/tuned RF model)

In [None]:
# Tuned random-forest accuracy / precision / recall on Dataset 2, one row per genome.
d2_rf_tuned = pd.DataFrame(
    {
        'Accuracy': [
            0.9634, 0.9957, 0.7979, 0.8431, 0.7775,
            0.9640, 0.8236, 0.9661, 0.9424, 0.9559,
            0.9789, 0.9700, 0.9795, 0.9833, 0.8104,
        ],
        'Precision': [
            0.9512, 0.9936, 0.8278, 0.8355, 0.7686,
            0.9558, 0.8390, 0.9508, 0.9395, 0.9394,
            0.9663, 0.9530, 0.9721, 0.9761, 0.8375,
        ],
        'Recall': [
            0.9768, 0.9979, 0.7733, 0.8482, 0.7921,
            0.9730, 0.8076, 0.9843, 0.9306, 0.9794,
            0.9929, 0.9902, 0.9879, 0.9919, 0.7782,
        ],
    },
    index=genomes,
)

# Same three metrics for Dataset 1.
d1_rf_tuned = pd.DataFrame(
    {
        'Accuracy': [
            0.9681, 0.9957, 0.7754, 0.8297, 0.7470,
            0.9638, 0.8096, 0.9688, 0.9519, 0.9574,
            0.9824, 0.9822, 0.9740, 0.9871, 0.7877,
        ],
        'Precision': [
            0.9659, 0.9936, 0.7148, 0.7429, 0.6514,
            0.9574, 0.7567, 0.9407, 0.9559, 0.9478,
            0.9732, 0.9648, 0.9576, 0.9737, 0.7809,
        ],
        'Recall': [
            0.9340, 0.9979, 0.6475, 0.6987, 0.5112,
            0.8491, 0.6657, 0.9432, 0.7647, 0.8898,
            0.9913, 0.9705, 0.9818, 0.9737, 0.6643,
        ],
    },
    index=genomes,
)

In [None]:
# Two stacked panels (Dataset 1 on top, Dataset 2 below) showing tuned-RF
# accuracy, precision and recall per genome on a shared set of y ticks.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 8))

y = [0.5, 0.6, 0.7, 0.8, 0.9, 1]

# Per-series keyword arguments. 'Accuracy' gets no explicit line colour, so it
# is drawn in the first colour of the default cycle.
metric_styles = (
    ('Accuracy', {'markerfacecolor': 'blue'}),
    ('Precision', {'color': 'olive'}),
    ('Recall', {'color': 'orange'}),
)

for panel, frame, dataset_name in ((ax1, d1_rf_tuned, 'Dataset 1'),
                                   (ax2, d2_rf_tuned, 'Dataset 2')):
    for metric, style in metric_styles:
        panel.plot(frame.index, metric, data=frame, marker='', linewidth=2, **style)

    # Pin one tick per genome so tick labels can be assigned explicitly below.
    panel.set_xticks(np.arange(len(frame.index)))
    # Tick locations and label text are set in separate calls: set_yticks only
    # accepts text properties such as fontsize together with labels.
    panel.set_yticks(y)
    panel.set_yticklabels(y, fontsize=25)
    panel.set_ylabel(dataset_name, fontsize=25, labelpad=20)
    # Legend sits outside the axes, to the right of the plot area.
    panel.legend(bbox_to_anchor=(1.04, 1), loc="upper left", fontsize=20)
    panel.grid(linestyle='--', linewidth=0.5)

# Only the bottom panel shows genome names; the top panel keeps unlabeled ticks.
ax1.set_xticklabels([])
ax2.set_xticklabels(d2_rf_tuned.index, fontsize=25, rotation=90, style='italic')
ax2.set_xlabel('Genome', fontsize=25)

fig.savefig('RF_comp_metrics_opt1.png', dpi=300, bbox_inches='tight')

# plt.show() instead of fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()

In [None]:
# Line plot: tuned-RF accuracy, precision and recall per genome on Dataset 1.
fig, ax = plt.subplots(figsize=(15, 5))

# 'Accuracy' gets no explicit line colour, so it uses the first colour of the
# default cycle; the column name passed with data= becomes the legend label.
for metric, style in (('Accuracy', {'markerfacecolor': 'blue'}),
                      ('Precision', {'color': 'olive'}),
                      ('Recall', {'color': 'orange'})):
    ax.plot(d1_rf_tuned.index, metric, data=d1_rf_tuned, marker='', linewidth=2, **style)

ax.set_title('Dataset 1', fontsize=25)
ax.set_xlabel('Genome', fontsize=25)
# Three different metrics share this axis, so it is not labelled 'Accuracy score'.
ax.set_ylabel('Evaluation metrics', fontsize=25, labelpad=25)
plt.xticks(fontsize=25, rotation=90, style='italic')
plt.yticks(fontsize=20)
ax.legend(loc='lower right', fontsize=20)
ax.grid(linestyle='--', linewidth=0.5)

plt.show()

In [None]:
# Line plot: tuned-RF accuracy, precision and recall per genome on Dataset 2.
fig, ax = plt.subplots(figsize=(15, 5))

# 'Accuracy' gets no explicit line colour, so it uses the first colour of the
# default cycle; the column name passed with data= becomes the legend label.
for metric, style in (('Accuracy', {'markerfacecolor': 'blue'}),
                      ('Precision', {'color': 'olive'}),
                      ('Recall', {'color': 'orange'})):
    ax.plot(d2_rf_tuned.index, metric, data=d2_rf_tuned, marker='', linewidth=2, **style)

ax.set_title('Dataset 2', fontsize=25)
ax.set_xlabel('Genome', fontsize=25)
# Three different metrics share this axis, so it is not labelled 'Accuracy score'.
ax.set_ylabel('Evaluation metrics', fontsize=25, labelpad=25)
plt.xticks(fontsize=25, rotation=90, style='italic')
plt.yticks(fontsize=20)
ax.legend(loc='lower right', fontsize=20)
ax.grid(linestyle='--', linewidth=0.5)

plt.show()

### Feature selection - manually

In [None]:
# Accuracy, precision and recall recorded for each feature count tried during
# manual feature selection; the frame is indexed by the number of features.
num_features = [100, 200, 300, 400, 500, 600, 700, 800, 1000, 1200, 1400]
accuracy = {
    'Accuracy': [
        0.9602, 0.9635, 0.9635, 0.9598, 0.9616, 0.9565,
        0.9588, 0.9598, 0.9574, 0.9548, 0.9565,
    ],
    'Precision': [
        0.9509, 0.9512, 0.9512, 0.9451, 0.9461, 0.9377,
        0.9411, 0.9428, 0.9393, 0.9336, 0.9346,
    ],
    'Recall': [
        0.9703, 0.9768, 0.9768, 0.9759, 0.9786, 0.9777,
        0.9786, 0.9786, 0.9777, 0.9786, 0.9814,
    ],
}
acc_feature = pd.DataFrame(accuracy, index=num_features)

acc_feature

In [None]:
# Line plot: evaluation metrics against the number of manually selected features.
fig, ax = plt.subplots(figsize=(15, 7))

# 'Accuracy' gets no explicit line colour (default colour cycle applies).
for metric, style in (('Accuracy', {'markerfacecolor': 'blue'}),
                      ('Precision', {'color': 'olive'}),
                      ('Recall', {'color': 'orange'})):
    ax.plot(acc_feature.index, metric, data=acc_feature, marker='', linewidth=4, **style)

ax.set_xlabel('Number of features', fontsize=25, labelpad=20)
ax.set_ylabel('Evaluation metrics', fontsize=25, labelpad=20)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
ax.legend(fontsize=20)
ax.grid(linestyle='--', linewidth=0.5)

fig.savefig('RF_manual_feat_select.png', dpi=300, bbox_inches='tight')

plt.show()

### Manual feature selection vs Scikit-Learn feature selection

In [None]:
# Per-genome accuracy with manually selected vs. scikit-learn selected features.
acc_feat_select = pd.DataFrame(
    {
        'Manual': [
            0.9634, 0.9957, 0.8210, 0.8431, 0.7819,
            0.9640, 0.8550, 0.9661, 0.9424, 0.9559,
            0.9789, 0.9700, 0.9795, 0.9833, 0.8501,
        ],
        'Scikit-learn': [
            0.9463, 0.9928, 0.7585, 0.8070, 0.7655,
            0.9482, 0.7801, 0.9529, 0.9382, 0.9436,
            0.9753, 0.9606, 0.9657, 0.9833, 0.7803,
        ],
    },
    index=genomes,
)

acc_feat_select

In [None]:
# Line plot: accuracy per genome, manual vs. scikit-learn feature selection.
fig, ax = plt.subplots(figsize=(15, 5))

for column, colour in (('Manual', 'pink'), ('Scikit-learn', 'purple')):
    ax.plot(acc_feat_select.index, column, data=acc_feat_select,
            marker='', color=colour, linewidth=4)

ax.set_xlabel('Genome', fontsize=25)
ax.set_ylabel('Accuracy score', fontsize=25, labelpad=20)
plt.xticks(fontsize=25, rotation=90, style='italic')
plt.yticks(fontsize=25)
ax.legend(loc='lower right', fontsize=20)
ax.grid(linestyle='--', linewidth=0.5)

fig.savefig('RF_manual_scikitlearn.png', dpi=300, bbox_inches='tight')

plt.show()

### All features vs Selected Features (w/ tuned RF-model and feature importance)

In [None]:
# Tuned-RF accuracy per genome using all features vs. the selected features.
accuracy_tuned_rf = pd.DataFrame(
    {
        'All features': [
            0.9472, 0.9950, 0.7890, 0.8110, 0.7872,
            0.9527, 0.8211, 0.9529, 0.9410, 0.9501,
            0.9768, 0.9600, 0.9683, 0.9791, 0.8211,
        ],
        'Selected features': [
            0.9634, 0.9957, 0.8275, 0.8431, 0.7819,
            0.9640, 0.8550, 0.9661, 0.9424, 0.9559,
            0.9789, 0.9700, 0.9795, 0.9833, 0.8501,
        ],
    },
    index=genomes,
)

In [None]:
# Line plot: tuned-RF accuracy per genome, all features vs. selected features.
fig, ax = plt.subplots(figsize=(15, 5))

for column, colour in (('All features', 'red'), ('Selected features', 'green')):
    ax.plot(accuracy_tuned_rf.index, column, data=accuracy_tuned_rf,
            marker='', color=colour, linewidth=4)

ax.set_xlabel('Genome', fontsize=25)
ax.set_ylabel('Accuracy score', fontsize=25, labelpad=20)
plt.xticks(fontsize=25, rotation=90, style='italic')
plt.yticks(fontsize=25)
ax.legend(loc='lower right', fontsize=20)
ax.grid(linestyle='--', linewidth=0.5)

fig.savefig('RF_all_feat_vs_selected.png', dpi=300, bbox_inches='tight')

plt.show()

### Tuned RF model with and without the feature "Length"

In [None]:
# Tuned-RF accuracy per genome with and without the "Length" feature.
acc_tuned_length = pd.DataFrame(
    {
        'With length': [
            0.9472, 0.9950, 0.7890, 0.8110, 0.7872,
            0.9527, 0.8211, 0.9529, 0.9410, 0.9501,
            0.9768, 0.9600, 0.9683, 0.9791, 0.8211,
        ],
        'Without length': [
            0.9463, 0.9928, 0.7585, 0.8070, 0.7655,
            0.9482, 0.7801, 0.9529, 0.9382, 0.9436,
            0.9753, 0.9606, 0.9657, 0.9833, 0.7803,
        ],
    },
    index=genomes,
)

acc_tuned_length

In [None]:
# Line plot: tuned-RF accuracy per genome, with vs. without the "Length" feature.
fig, ax = plt.subplots(figsize=(15, 5))

# 'With length' gets no explicit line colour (default colour cycle applies).
for column, style in (('With length', {'markerfacecolor': 'blue'}),
                      ('Without length', {'color': 'olive'})):
    ax.plot(acc_tuned_length.index, column, data=acc_tuned_length,
            marker='', linewidth=4, **style)

ax.set_xlabel('Genome', fontsize=25)
ax.set_ylabel('Accuracy score', fontsize=25, labelpad=20)
plt.xticks(fontsize=25, rotation=90, style='italic')
plt.yticks(fontsize=25)
ax.legend(loc='lower right', fontsize=20)
ax.grid(linestyle='--', linewidth=0.5)

fig.savefig('RF_without_length.png', dpi=300, bbox_inches='tight')

plt.show()

### Default vs tuned before feature selection

In [None]:
# Random-forest accuracy per genome: default vs. tuned hyper-parameters,
# before feature selection.
accuracy_rf = pd.DataFrame(
    {
        'Default': [
            0.9417, 0.9939, 0.7705, 0.8000, 0.7688,
            0.9505, 0.7852, 0.9524, 0.9312, 0.9458,
            0.9758, 0.9572, 0.9616, 0.9756, 0.7803,
        ],
        'Tuned': [
            0.9472, 0.9950, 0.7890, 0.8110, 0.7872,
            0.9527, 0.8211, 0.9529, 0.9410, 0.9501,
            0.9768, 0.9600, 0.9683, 0.9791, 0.8211,
        ],
    },
    index=genomes,
)

In [None]:
# Line plot: RF accuracy per genome, default vs. tuned (before feature selection).
fig, ax = plt.subplots(figsize=(15, 5))

for column, colour in (('Default', 'purple'), ('Tuned', 'orange')):
    ax.plot(accuracy_rf.index, column, data=accuracy_rf,
            marker='', color=colour, linewidth=4)

ax.set_xlabel('Genome', fontsize=25)
ax.set_ylabel('Accuracy score', fontsize=25, labelpad=20)
plt.xticks(fontsize=25, rotation=90, style='italic')
plt.yticks(fontsize=25)
ax.legend(loc='lower right', fontsize=20)
ax.grid(linestyle='--', linewidth=0.5)

fig.savefig('RF_default_tuned.png', dpi=300, bbox_inches='tight')

plt.show()

### Default vs tuned w/ selected features (best RF model)

In [None]:
# Random-forest accuracy per genome: default model vs. tuned model with
# selected features.
accuracy_rf_best = pd.DataFrame(
    {
        'Default': [
            0.9417, 0.9939, 0.7705, 0.8000, 0.7688,
            0.9505, 0.7852, 0.9524, 0.9312, 0.9458,
            0.9758, 0.9572, 0.9616, 0.9756, 0.7803,
        ],
        'Tuned': [
            0.9634, 0.9957, 0.8275, 0.8431, 0.7819,
            0.9640, 0.8550, 0.9661, 0.9424, 0.9559,
            0.9789, 0.9700, 0.9795, 0.9833, 0.8501,
        ],
    },
    index=genomes,
)

In [None]:
# Line plot: RF accuracy per genome, default vs. tuned with selected features.
fig, ax = plt.subplots(figsize=(15, 5))

# 'Tuned' gets no explicit line colour (default colour cycle applies).
for column, style in (('Default', {'color': 'purple'}),
                      ('Tuned', {'markerfacecolor': 'blue'})):
    ax.plot(accuracy_rf_best.index, column, data=accuracy_rf_best,
            marker='', linewidth=4, **style)

ax.set_xlabel('Genome', fontsize=25)
ax.set_ylabel('Accuracy score', fontsize=25, labelpad=20)
plt.xticks(fontsize=25, rotation=90, style='italic')
plt.yticks(fontsize=25)
ax.legend(loc='lower right', fontsize=20)
ax.grid(linestyle='--', linewidth=0.5)

fig.savefig('RF_default_best.png', dpi=300, bbox_inches='tight')

plt.show()

## XGBoost results

### Default RF vs. default XGB model

In [None]:
# Default-model accuracy per genome: random forest vs. XGBoost.
accuracy_default = pd.DataFrame(
    {
        'Random Forest': [
            0.9417, 0.9939, 0.7705, 0.8000, 0.7688,
            0.9505, 0.7852, 0.9524, 0.9312, 0.9458,
            0.9758, 0.9572, 0.9616, 0.9756, 0.7803,
        ],
        'XGBoost': [
            0.9648, 0.9932, 0.8006, 0.8382, 0.7895,
            0.9775, 0.8224, 0.9783, 0.9564, 0.9653,
            0.9810, 0.9822, 0.9806, 0.9923, 0.8083,
        ],
    },
    index=genomes,
)

In [None]:
# Line plot: default-model accuracy per genome, random forest vs. XGBoost.
fig, ax = plt.subplots(figsize=(15, 5))

# 'Random Forest' gets no explicit line colour (default colour cycle applies).
for column, style in (('Random Forest', {'markerfacecolor': 'blue'}),
                      ('XGBoost', {'color': 'pink'})):
    ax.plot(accuracy_default.index, column, data=accuracy_default,
            marker='', linewidth=4, **style)

ax.set_xlabel('Genome', fontsize=25)
ax.set_ylabel('Accuracy score', fontsize=25, labelpad=20)
plt.xticks(fontsize=25, rotation=90, style='italic')
plt.yticks(fontsize=25)
ax.legend(loc='lower right', fontsize=20)
ax.grid(linestyle='--', linewidth=0.5)

fig.savefig('Default_rf_vs_xgb.png', dpi=300, bbox_inches='tight')

plt.show()

### All features vs 200 selected best features (w/ tuned RF-model and feature importance)

In [None]:
# XGBoost accuracy per genome using all features vs. the selected features.
accuracy_selected_xgb = pd.DataFrame(
    {
        'All features': [
            0.9648, 0.9932, 0.8006, 0.8382, 0.7895,
            0.9775, 0.8224, 0.9783, 0.9564, 0.9653,
            0.9810, 0.9822, 0.9806, 0.9923, 0.8083,
        ],
        'Selected features': [
            0.9653, 0.9939, 0.7867, 0.8400, 0.7917,
            0.9730, 0.8181, 0.9769, 0.9537, 0.9675,
            0.9799, 0.9833, 0.9785, 0.9909, 0.7965,
        ],
    },
    index=genomes,
)

In [None]:
# Line plot: XGBoost accuracy per genome, all features vs. selected features.
fig, ax = plt.subplots(figsize=(15, 5))

for column, colour in (('All features', 'blue'), ('Selected features', 'red')):
    ax.plot(accuracy_selected_xgb.index, column, data=accuracy_selected_xgb,
            marker='', color=colour, linewidth=4)

ax.set_xlabel('Genome', fontsize=25)
ax.set_ylabel('Accuracy score', fontsize=25, labelpad=20)
plt.xticks(fontsize=25, rotation=90, style='italic')
plt.yticks(fontsize=25)
ax.legend(loc='lower right', fontsize=20)
ax.grid(linestyle='--', linewidth=0.5)

fig.savefig('all_vs_sel_xgb.png', dpi=300, bbox_inches='tight')

plt.show()

## RNN results

### Dataset 1 vs. Dataset 2

In [None]:
# RNN accuracy per genome on both datasets (column order: 'Dataset 2', 'Dataset 1').
acc_best_model = pd.DataFrame(
    {
        'Dataset 2': [
            0.9405, 0.9646, 0.8157, 0.8852, 0.8229,
            0.9466, 0.8507, 0.9599, 0.9333, 0.9341,
            0.9505, 0.9493, 0.9296, 0.9790, 0.8240,
        ],
        'Dataset 1': [
            0.9225, 0.9686, 0.7977, 0.8691, 0.7286,
            0.9095, 0.8351, 0.9584, 0.9295, 0.9154,
            0.9579, 0.9453, 0.9299, 0.9703, 0.8179,
        ],
    },
    index=genomes,
)

acc_best_model

In [None]:
# Line plot: RNN accuracy per genome, Dataset 1 vs. Dataset 2.
fig, ax = plt.subplots(figsize=(15, 5))

for column, colour in (('Dataset 1', 'olive'), ('Dataset 2', 'orange')):
    ax.plot(acc_best_model.index, column, data=acc_best_model,
            marker='', color=colour, linewidth=4)

ax.set_xlabel('Genome', fontsize=25)
ax.set_ylabel('Accuracy score', fontsize=25, labelpad=20)
plt.xticks(fontsize=25, rotation=90, style='italic')
plt.yticks(fontsize=25)
ax.legend(loc='lower right', fontsize=20)
ax.grid(linestyle='--', linewidth=0.5)

fig.savefig('RNN_d1_vs_d2.png', dpi=300, bbox_inches='tight')

plt.show()

### Random Forest vs. XGBoost vs. RNN - best model

In [None]:
# Accuracy per genome for the three model families compared in this notebook.
comparing_all_models = pd.DataFrame(
    {
        'Random Forest': [
            0.9634, 0.9957, 0.7979, 0.8725, 0.7819,
            0.9640, 0.8236, 0.9661, 0.9424, 0.9559,
            0.9789, 0.9700, 0.9795, 0.9833, 0.8104,
        ],
        'XGBoost': [
            0.9648, 0.9932, 0.8006, 0.8382, 0.7895,
            0.9775, 0.8224, 0.9783, 0.9564, 0.9653,
            0.9810, 0.9822, 0.9806, 0.9923, 0.8083,
        ],
        'RNN': [
            0.9405, 0.9646, 0.8157, 0.8852, 0.8229,
            0.9466, 0.8507, 0.9599, 0.9333, 0.9341,
            0.9505, 0.9493, 0.9296, 0.9790, 0.8240,
        ],
    },
    index=genomes,
)

In [None]:
# Line plot: accuracy per genome for random forest, XGBoost and RNN.
fig, ax = plt.subplots(figsize=(15, 5))

for column, colour in (('Random Forest', 'orange'),
                       ('XGBoost', 'olive'),
                       ('RNN', 'purple')):
    ax.plot(comparing_all_models.index, column, data=comparing_all_models,
            marker='', color=colour, linewidth=4)

ax.set_xlabel('Genome', fontsize=25)
ax.set_ylabel('Accuracy score', fontsize=25, labelpad=25)
plt.xticks(fontsize=25, rotation=90, style='italic')
plt.yticks(fontsize=25)
ax.legend(loc='lower right', fontsize=20)
ax.grid(linestyle='--', linewidth=0.5)

fig.savefig('Comparing_all_models.png', dpi=300, bbox_inches='tight')

plt.show()

### Combined genomes

In [None]:
# Scores for the four combined-genome groups, in x_labels order.
x_labels = ['10 genomes', '5 best', '5 genomes', '5 worst']

# Plotted as 'Random Forest', 'XGBoost' and 'RNN'.
randf = [0.9164, 0.9741, 0.9064, 0.8509]
xgb = [0.9212, 0.9809, 0.8863, 0.8298]
rnn = [0.8921, 0.9547, 0.8780, 0.8340]

# Plotted as the matching '... Mean' series.
randf_g = [0.9095, 0.9776, 0.8823, 0.8315]
xgb_g = [0.9086, 0.9845, 0.8773, 0.8118]
rnn_g = [0.9054, 0.9601, 0.8858, 0.8397]

In [None]:
# Grouped bar chart for the combined-genome experiments: per group, a filled
# bar for each model and an outlined bar for its '... Mean' counterpart.
# NOTE(review): the '... Mean' series presumably hold the mean of the
# single-genome scores - confirm against the experiment notes.
x = np.arange(len(x_labels))  # the label locations
width = 0.05  # the width of the bars

fig, ax = plt.subplots(figsize=(15, 5))

# Offsets are multiples of the bar width: each model's two bars sit close
# together, with a wider gap between the three models.
rects1 = ax.bar(x - width*6, randf, width, label='Random Forest',
                color='orange', edgecolor='orange', linewidth=2)
rects1_1 = ax.bar(x - width*4, randf_g, width, label='Random Forest Mean',
                  color='white', edgecolor='orange', linewidth=2)
rects2 = ax.bar(x - width, xgb, width, label='XGBoost',
                color='olive', edgecolor='olive', linewidth=2)
rects2_1 = ax.bar(x + width, xgb_g, width, label='XGBoost Mean',
                  color='white', edgecolor='olive', linewidth=2)
rects3 = ax.bar(x + width*4, rnn, width, label='RNN',
                color='purple', edgecolor='purple', linewidth=2)
rects3_1 = ax.bar(x + width*6, rnn_g, width, label='RNN Mean',
                  color='white', edgecolor='purple', linewidth=2)

y = [0.8, 0.85, 0.90, 0.95, 1.0]
ax.set_ylabel('Accuracy score', fontsize=25, labelpad=20)
ax.set_xticks(x, x_labels, fontsize=25)
# Tick locations and label text are set in separate calls: set_yticks only
# accepts text properties such as fontsize together with labels.
ax.set_yticks(y)
ax.set_yticklabels(y, fontsize=25)
# Legend sits outside the axes, to the right of the plot area.
ax.legend(bbox_to_anchor=(1.04, 1), loc="upper left", fontsize=20)
ax.grid(linestyle='--', linewidth=0.5)
ax.set_ylim([0.80, 1])

fig.tight_layout()

fig.savefig('comp_multiple_gen.png', dpi=300, bbox_inches='tight')
plt.show()