# Random Forest Model - combined genomes

## Imports

In [None]:
# Preprocessing and encoding variables
import pandas as pd
import numpy as np

# Using scikit-learn to split data into training and testing sets
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Visualising feature importance and making plots
import matplotlib.pyplot as plt
import seaborn as sns

# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# Confusion matrix
from sklearn.metrics import plot_confusion_matrix, precision_score, recall_score, roc_auc_score

In [None]:
def _load_features(csv_path):
    """Read one genome feature table and drop its first column."""
    table = pd.read_csv(csv_path)
    return table.iloc[:, 1:]


# Feature tables of genomes G1-G5 from the 'genomes-part1' input directory
G1 = _load_features('../input/genomes-part1/G1.features.csv')
G2 = _load_features('../input/genomes-part1/G2.features.csv')
G3 = _load_features('../input/genomes-part1/G3.features.csv')
G4 = _load_features('../input/genomes-part1/G4.features.csv')
G5 = _load_features('../input/genomes-part1/G5.features.csv')

# Genomes G6-G15 ('../input/genomes/G<n>_translated.csv') are not loaded in this run.

In [None]:
# Down-sample genomes G1-G4 to a fixed number of rows each.
# A fixed seed makes the draw repeatable between runs, matching the seeded
# train/test split and classifiers used later in this notebook.
# NOTE: DataFrame.sample raises ValueError when a frame has fewer rows than
# the requested sample size (sampling is done without replacement).
SAMPLE_SIZE = 4000
SAMPLE_SEED = 42

G1 = G1.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
G2 = G2.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
G3 = G3.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
G4 = G4.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
# G5 is not down-sampled.
#G5 = G5.sample(4000)

In [None]:
frames = [G1, G2, G3, G4, G5]
G = pd.concat(frames)

#frames = [G1, G2, G3, G4, G5, G6, G7, G8, G9, G10]
#G = pd.concat(frames)

In [None]:
G = G.dropna()

In [None]:
G.tail(10)

In [None]:
print('The shape of our dataframe is:', G.shape)
print('Rows:', G.shape[0])
print('Columns:', G.shape[1])

## Pre-processing / Data preparation

1. Encode the categorical target variable as integers (label encoding)
2. Split data into features and labels
3. Convert to arrays
4. Split data into training and testing sets

### Encoding target values

In [None]:
def encode_feature(array):
    """Map the categories in an array to integer codes.

    A fresh ``LabelEncoder`` is fitted on ``array`` and applied to that
    same array; the encoder itself is not kept.

    :param array: array of categorical values to encode
    :return: array of integer codes, one per input element
    """
    return preprocessing.LabelEncoder().fit_transform(array)

In [None]:
class_names = ['CDS', 'LORF']
targets = G["Type"].values
print(targets)

In [None]:
targets = encode_feature(targets)
print(targets)

In [None]:
print('The shape of our dataframe is:', G.shape)
print('Rows:', G.shape[0])
print('Columns:', G.shape[1])

### Selecting features and targets and converting data to arrays

In [None]:
# Labels are the values we want to predict
labels = targets

# Remove the labels from the features -> axis 1 refers to the columns
features = G.drop(['Type','Genome', 'Dataset'], axis = 1)

# Saving feature names as list for later use
feature_names = list(features.columns)

# Convert to numpy array
features = np.array(features)

In [None]:
print('The shape of our features are:', features.shape)

### Split into training and testing sets

In [None]:
# Split the data into training and testing sets -> x = features and y = labels/targets
train_x, test_x, train_y, test_y = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [None]:
print('Training Features Shape:', train_x.shape)
print('Training Labels Shape:', train_y.shape)
print('Testing Features Shape:', test_x.shape)
print('Testing Labels Shape:', test_y.shape)

## Train model - best params RandomSearch

In [None]:
# Create a random forest classifier (hyperparameters are the "best params"
# of the random search, per the section heading).
# max_features='sqrt' is exactly what 'auto' meant for classifiers; 'auto' was
# deprecated in scikit-learn 1.1 and removed in 1.3, so the explicit spelling
# keeps this cell working on current releases without changing the model.
clf_rand = RandomForestClassifier(n_estimators=200, min_samples_split=2, min_samples_leaf=1, max_features='sqrt',
                             max_depth=20, bootstrap=False, random_state = 42)

# Train the model using the training sets
clf_rand.fit(train_x, train_y)

## Make predictions on test set

In [None]:
# Predict the class of every test sample with the full-feature model
pred_y = clf_rand.predict(test_x)

# Accuracy: fraction of test samples that were classified correctly
accuracy = metrics.accuracy_score(test_y, pred_y)
print("Accuracy score:", accuracy)
# Precision, recall and ROC AUC are disabled for this model
#print("Precision score:", metrics.precision_score(test_y, pred_y))
#print("Recall score:", metrics.recall_score(test_y, pred_y))
#print("ROC_AUC score:", metrics.roc_auc_score(test_y, pred_y))

## Confusion matrix

In [None]:
# Draw the confusion matrix of the full-feature model on the test set
plot_confusion_matrix(estimator=clf_rand, X=test_x, y_true=test_y)
plt.show()

# NOTE: if the export below is re-enabled, move it above plt.show(); a figure
# saved after show() comes out blank under the notebook inline backend.
#plt.savefig('confmatrix_g1_uneven.png', dpi=300, bbox_inches='tight', transparent=True)

## Finding important features

1. Create a random forests model.
2. Use the feature importance variable to see feature importance scores.
3. Visualize these scores using the seaborn library.

In [None]:
feature_imp = pd.Series(clf_rand.feature_importances_, index = feature_names).sort_values(ascending=False)

In [None]:
feature_imp

In [None]:
features = feature_imp.to_frame()
features.columns = ['Feature importance']

In [None]:
features

In [None]:
def _rows_matching(tag):
    """Return the rows of ``features`` whose index label contains ``tag``."""
    mask = features.index.map(lambda label: tag in label)
    return features[mask]


# Nucleotide k-mer groups
dimers = _rows_matching("2_mer_")
trimers = _rows_matching("3_mer_")
tetramers = _rows_matching("4_mer_")
pentamers = _rows_matching("5_mer_")
hexamers = _rows_matching("6_mer_")

# Amino-acid k-mer groups
single_aa = _rows_matching("1_aa_mer_")
double_aa = _rows_matching("2_aa_mer_")
triple_aa = _rows_matching("3_aa_mer_")

# c_weight group
c_weight = _rows_matching("c_weight")

#### Mean feature importances of all k-mers, aa-mers and c_weights

In [None]:
# Importance tables of the k-mer, aa-mer and c_weight groups, keyed by row label
grouped = {
    'dimers': dimers,
    'trimers': trimers,
    'tetramers': tetramers,
    'pentamers': pentamers,
    'hexamers': hexamers,
    'single_aa': single_aa,
    'double_aa': double_aa,
    'triple_aa': triple_aa,
    'c_weight': c_weight,
}

# Mean importance of each group, rounded to six decimals
data = {'Feature importance': [round(table.mean().iloc[0], 6) for table in grouped.values()]}

features_cond = pd.DataFrame(data, index=list(grouped))

#### Sum of all feature importances of k-mers, aa-mers and c_weights

In [None]:
data = {'Feature importance': [round(dimers.sum().iloc[0], 6), round(trimers.sum().iloc[0], 6), 
                               round(tetramers.sum().iloc[0], 6), round(pentamers.sum().iloc[0], 6), 
                               round(hexamers.sum().iloc[0], 6), round(single_aa.sum().iloc[0], 6), 
                               round(double_aa.sum().iloc[0], 6), round(triple_aa.sum().iloc[0], 6),
                               round(c_weight.sum().iloc[0], 6)]}

features_cond2 = pd.DataFrame(data, index = ['dimers', 'trimers', 'tetramers', 'pentamers', 'hexamers', 
                                            'single_aa', 'double_aa', 'triple_aa', 'c_weight'])

In [None]:
features_cond

In [None]:
ind_list = ['GC_content', 'GC1_content', 'GC2_content', 'GC3_content', 
            'Start_ATG', 'Start_GTG', 'Start_TTG', 'Length']

features_condensed = features.loc[ind_list]

In [None]:
features_condensed

In [None]:
features_red = pd.concat([features_condensed, features_cond])
features_red = features_red.squeeze()

In [None]:
features_red = features_red.sort_values(ascending=False)
features_red

In [None]:
features_red2 = pd.concat([features_condensed, features_cond2])
features_red2 = features_red2.squeeze()

In [None]:
features_red2 = features_red2.sort_values(ascending=False)
features_red2

## Generating the model on selected features

After removing the least important features, the accuracy may increase, because misleading data and noise are removed. Fewer features also reduce the training time.

Extra features can decrease performance because they may “confuse” the model by giving it irrelevant data that prevents it from learning the actual relationships. The random forest performs implicit feature selection because it splits nodes on the most important variables, but other machine learning models do not. One approach to improve other models is therefore to use the random forest feature importances to reduce the number of variables in the problem. In our case, we will use the feature importances to decrease the number of features for our random forest model, because, in addition to potentially increasing performance, reducing the number of features will shorten the run time of the model. 

Often with feature reduction, there will be a minor decrease in performance that must be weighed against the decrease in run-time. Machine learning is a game of making trade-offs, and run-time versus performance is usually one of the critical decisions.

In [None]:
feature_imp = pd.Series(clf_rand.feature_importances_, index = feature_names).sort_values(ascending=False)

In [None]:
selected = feature_imp.iloc[0:300]

In [None]:
selected_feat = list(selected.index)

In [None]:
important_indices = [feature_names.index(x) for x in selected_feat]

In [None]:
# Create a random forest classifier for the reduced feature set.
# max_features='sqrt' is exactly what 'auto' meant for classifiers; 'auto' was
# deprecated in scikit-learn 1.1 and removed in 1.3, so the explicit spelling
# keeps this cell working on current releases without changing the model.
# NOTE(review): max_depth is 10 here but 20 for clf_rand, so the two models
# differ in more than their feature set - confirm this is intended.
clf_imp = RandomForestClassifier(n_estimators=200, min_samples_split=2, min_samples_leaf=1, max_features='sqrt',
                             max_depth=10, bootstrap=False, random_state = 42)

# Column positions of the selected features in the full feature matrix
important_indices = [feature_names.index(x) for x in selected_feat]

# Restrict the training and test data to those columns
train_important = train_x[:, important_indices]
test_important = test_x[:, important_indices]

In [None]:
# Train the random forest
clf_imp.fit(train_important, train_y)

In [None]:
# Make predictions and determine the error
pred_y = clf_imp.predict(test_important)

# Model Accuracy, how often is the classifier correct?
print("Accuracy score:", metrics.accuracy_score(test_y, pred_y))
# Precision, Recall and Roc_AUC score
print("Precision score:", metrics.precision_score(test_y, pred_y))
print("Recall score:", metrics.recall_score(test_y, pred_y))
#print("ROC_AUC score:", metrics.roc_auc_score(test_y, pred_y))

#### Confusion matrix

In [None]:
# Draw the confusion matrix of the reduced-feature model on the test set
plot_confusion_matrix(clf_imp, test_important, test_y)

# Save before showing: under the notebook inline backend plt.show() closes the
# current figure, so a savefig() issued afterwards would write a blank image.
plt.savefig('confmatrix_g5.png', dpi=300, bbox_inches='tight', transparent=True)
plt.show()

After feature selection based on the most important features, the metrics improved by 2%. In the confusion matrix we see that the clf_imp model generates far fewer false positives and slightly more false negatives. This means that the accuracy, precision and ROC_AUC scores improve after feature selection, while recall decreases slightly.