# Random Forest Model - Feature Selection

BEST MODEL G1 (high default score):
- Random Search with CV - BUT
- n_estimators: 200
- n_features: 200

BEST MODEL G2 (low default score):
- Random search with CV
- n_estimators: 1000
- max_features: 'auto'
- max_depth: 10
- bootstrap: False

## Import packages and choose data

In [1]:
# Preprocessing and encoding variables
import pandas as pd
import numpy as np
from random import sample
from time import time

# Using scikit-learn to split data into training and testing sets
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Visualising feature importance and making plots
import matplotlib.pyplot as plt
import seaborn as sns

# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# Confusion matrix
from sklearn.metrics import plot_confusion_matrix, precision_score, recall_score, roc_auc_score

# Selecting best features
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SequentialFeatureSelector

## Single genome

In [2]:
# Load the G1 feature table. The first CSV column is discarded
# (presumably a saved row index — TODO confirm against the file).
g1_features_csv = '~/Documents/NMBU/Semester 12/Data Science Master/data-science-thesis/data_expo/G1.features.csv'
G = pd.read_csv(g1_features_csv)
G = G.iloc[:, 1:]

## Pre-processing / Data preparation

1. One-hot encoded categorical variables
2. Split data into features and labels
3. Convert to arrays
4. Split data into training and testing sets

In [3]:
# Report (rows, columns) of the frame as loaded, before any cleaning.
print('The shape of our dataframe is:', G.shape)

The shape of our dataframe is: (8649, 14791)


In [4]:
# Drop every row that contains at least one missing value; G is rebound to the cleaned frame.
G = G.dropna()

In [5]:
# Report the shape again so the effect of dropna() on the row count is visible.
print('The shape of our dataframe is:', G.shape)

The shape of our dataframe is: (8648, 14791)


### Encoding target values

In [6]:
def encode_feature(array):
    """Map categorical labels to integer codes.

    A fresh ``LabelEncoder`` is fitted on ``array`` and applied to that same
    array in a single step.

    :param array: array-like of categorical values to encode
    :return: numpy array holding one integer code per element of ``array``
    """
    return preprocessing.LabelEncoder().fit_transform(array)

In [7]:
# Display names for the target classes
# (presumably the values that occur in G["Type"] — TODO confirm).
class_names = ['CDS', 'LORF']
# Raw string labels, one per row of G.
targets = G["Type"].values
print(targets)

['CDS' 'CDS' 'CDS' ... 'CDS' 'CDS' 'CDS']


In [8]:
# Replace the string labels with integer codes produced by encode_feature.
targets = encode_feature(targets)
print(targets)

[0 0 0 ... 0 0 0]


In [9]:
# Report the frame's dimensions: the shape tuple first, then rows and columns separately.
n_rows, n_cols = G.shape
print('The shape of our dataframe is:', G.shape)
print('Rows:', n_rows)
print('Columns:', n_cols)

The shape of our dataframe is: (8648, 14791)
Rows: 8648
Columns: 14791


### Selecting features and targets and converting data to arrays

In [10]:
# Target vector: the encoded class labels.
labels = targets

# Predictor table: everything except the label column and the two
# identifier-like columns ('Genome', 'Dataset').
features = G.drop(columns=['Type', 'Genome', 'Dataset'])

# Keep the column names; they are needed later to name the selected features.
feature_names = features.columns.tolist()

# scikit-learn is fed a plain numpy array from here on.
features = np.array(features)

In [11]:
# Shape of the predictor matrix after removing the three non-feature columns.
print('The shape of our features are:', features.shape)

The shape of our features are: (8648, 14788)


### Split into training and testing sets

In [12]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the
# split repeatable. x = features, y = labels/targets.
train_x, test_x, train_y, test_y = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [13]:
# Print the shape of each piece of the train/test split, in a fixed order.
split_report = (
    ('Training Features Shape:', train_x),
    ('Training Labels Shape:', train_y),
    ('Testing Features Shape:', test_x),
    ('Testing Labels Shape:', test_y),
)
for caption, split_part in split_report:
    print(caption, split_part.shape)

Training Features Shape: (6486, 14788)
Training Labels Shape: (6486,)
Testing Features Shape: (2162, 14788)
Testing Labels Shape: (2162,)


### Selecting features based on feature importance

In [14]:
# Training matrix dimensions before feature selection.
train_x.shape

(6486, 14788)

In [15]:
# Test matrix dimensions before feature selection.
test_x.shape

(2162, 14788)

In [16]:
# Turn the name list into a numpy array so it can be indexed with the boolean
# masks returned by the selectors' get_support().
feature_names = np.array(feature_names)
feature_names

array(['Length', 'GC_content', 'GC1_content', ..., 'c_weight_TTC',
       'c_weight_TTG', 'c_weight_TTT'], dtype='<U12')

In [17]:
# Rank features with a random forest and keep the ones SelectFromModel retains.
# No explicit threshold is passed, so the selector's default cut-off applies.
#
# max_features='sqrt' is used instead of 'auto': for a classifier both mean
# sqrt(n_features), but 'auto' is deprecated since scikit-learn 1.1 and
# rejected from 1.3 onwards.
tic = time()
sel = SelectFromModel(
    RandomForestClassifier(n_estimators=200, max_features='sqrt',
                           max_depth=10, bootstrap=False, random_state=42)
)
# Fitted on the training split only, so the test split does not influence selection.
sel.fit(train_x, train_y)
toc = time()

print(f"Features selected by SelectFromModel: {feature_names[sel.get_support()]}")
print(f"Done in {toc - tic:.3f}s")

Features selected by SelectFromModel: ['Length' 'GC_content' 'GC1_content' ... 'c_weight_TTC' 'c_weight_TTG'
 'c_weight_TTT']
Done in 69.573s


In [18]:
# Reduce both splits to the columns kept by the fitted selector.
train_imp = sel.transform(train_x)
test_imp = sel.transform(test_x)

In [19]:
# Training matrix dimensions after SelectFromModel.
train_imp.shape

(6486, 1181)

In [20]:
# Test matrix dimensions after SelectFromModel.
test_imp.shape

(2162, 1181)

#### Fit new model with only selected features

In [29]:
# Create the random forest classifier that will be trained on the selected features.
# max_features='sqrt' is the classifier equivalent of the former 'auto' setting,
# which is deprecated since scikit-learn 1.1 and rejected from 1.3 onwards.
clf_imp = RandomForestClassifier(n_estimators=200, max_features='sqrt',
                                 max_depth=10, bootstrap=False, random_state=42)

In [30]:
# Train the random forest on the SelectFromModel-reduced training features.
clf_imp.fit(train_imp, train_y)

RandomForestClassifier(bootstrap=False, max_depth=20, n_estimators=200,
                       random_state=42)

#### Make prediction with new model

In [31]:
# Predict the held-out test rows with the reduced-feature forest.
pred_y = clf_imp.predict(test_imp)

# Each scorer is called as scorer(test_y, pred_y) and printed under its caption.
# NOTE(review): ROC AUC is computed from hard class predictions here, not from
# predicted probabilities — confirm this is the intended definition.
test_scorers = (
    ("Accuracy score:", metrics.accuracy_score),
    ("Precision score:", metrics.precision_score),
    ("Recall score:", metrics.recall_score),
    ("ROC_AUC score:", metrics.roc_auc_score),
)
for caption, scorer in test_scorers:
    print(caption, scorer(test_y, pred_y))

Accuracy score: 0.956059204440333
Precision score: 0.9376114081996435
Recall score: 0.9767873723305478
ROC_AUC score: 0.956135621649145


In [24]:
# One row per original feature: the selector's keep/drop mask (column 0)
# alongside the feature's name.
selected_feat = pd.DataFrame(sel.get_support())
selected_feat['Feat_name'] = feature_names

In [25]:
# Give the mask column (auto-named 0) a readable name.
selected_feat = selected_feat.rename(columns={0:'Bool'})

In [26]:
# Display the mask/name table for all features.
selected_feat

Unnamed: 0,Bool,Feat_name
0,True,Length
1,True,GC_content
2,True,GC1_content
3,True,GC2_content
4,True,GC3_content
...,...,...
14783,True,c_weight_TGT
14784,True,c_weight_TTA
14785,True,c_weight_TTC
14786,True,c_weight_TTG


In [27]:
# Keep only the rows whose mask is True, i.e. the features the selector retained.
# 'Bool' holds the boolean mask from get_support(), so it can index directly.
kept_mask = selected_feat['Bool']
important_feat = selected_feat.loc[kept_mask]

In [28]:
# Display the retained features only.
important_feat

Unnamed: 0,Bool,Feat_name
0,True,Length
1,True,GC_content
2,True,GC1_content
3,True,GC2_content
4,True,GC3_content
...,...,...
14783,True,c_weight_TGT
14784,True,c_weight_TTA
14785,True,c_weight_TTC
14786,True,c_weight_TTG


### Selecting features with Sequential Feature Selection

In [34]:
# Base estimator handed to the sequential feature selectors.
# max_features='sqrt' is the classifier equivalent of the former 'auto' setting,
# which is deprecated since scikit-learn 1.1 and rejected from 1.3 onwards.
clf = RandomForestClassifier(n_estimators=200, max_features='sqrt',
                             max_depth=10, bootstrap=False, random_state=42)

In [35]:
# Sequential feature selection down to 50 features, once growing the set
# (forward) and once shrinking it (backward), each timed separately.
# NOTE(review): train_x has 14788 columns and every selection step refits `clf`
# for each candidate feature, so both searches (backward especially) are
# extremely expensive — presumably why the recorded run was interrupted.
sfs_forward = SequentialFeatureSelector(clf, n_features_to_select=50, direction="forward")
sfs_backward = SequentialFeatureSelector(clf, n_features_to_select=50, direction="backward")

tic_fwd = time()
sfs_forward.fit(train_x, train_y)
toc_fwd = time()

tic_bwd = time()
sfs_backward.fit(train_x, train_y)
toc_bwd = time()

forward_names = feature_names[sfs_forward.get_support()]
backward_names = feature_names[sfs_backward.get_support()]

print(f"Features selected by forward sequential selection: {forward_names}")
print(f"Done in {toc_fwd - tic_fwd:.3f}s")
print(f"Features selected by backward sequential selection: {backward_names}")
print(f"Done in {toc_bwd - tic_bwd:.3f}s")

KeyboardInterrupt: 

In [None]:
# Reduce both splits to the 50 features chosen by forward selection.
# The test split must go through the same fitted selector (sfs_forward) as the
# training split; `sel_forward` is not defined anywhere in this notebook.
train_imp_forward = sfs_forward.transform(train_x)
test_imp_forward = sfs_forward.transform(test_x)

In [None]:
# Reduce both splits to the 50 features chosen by backward selection.
# The test split must go through the same fitted selector (sfs_backward) as the
# training split; `sel_backward` is not defined anywhere in this notebook.
train_imp_backward = sfs_backward.transform(train_x)
test_imp_backward = sfs_backward.transform(test_x)

#### Train new model on forward selected features

In [None]:
# Create the random forest classifier for the sequentially selected features.
# max_features='sqrt' is the classifier equivalent of the former 'auto' setting,
# which is deprecated since scikit-learn 1.1 and rejected from 1.3 onwards.
clf_imp = RandomForestClassifier(n_estimators=200, max_features='sqrt',
                                 max_depth=10, bootstrap=False, random_state=42)

# Train the random forest on the forward-selected features only.
clf_imp.fit(train_imp_forward, train_y)

#### Train new model on backward selected features

In [None]:
# Train the random forest on the backward-selected features only.
# NOTE(review): clf_imp is the same estimator object that was fitted on the
# forward-selected features in the previous section, so this call replaces
# that fitted model.
clf_imp.fit(train_imp_backward, train_y)

#### Make prediction for forward selected feature model

In [None]:
# clf_imp is shared by the forward and backward experiments and was most
# recently fitted on the backward-selected features. Refit it on the
# forward-selected training features so the model matches the feature subset
# being evaluated in this cell.
clf_imp.fit(train_imp_forward, train_y)

# Make predictions on the forward-selected test features
pred_y = clf_imp.predict(test_imp_forward)

# Model Accuracy, how often is the classifier correct?
print("Accuracy score:", metrics.accuracy_score(test_y, pred_y))

# Precision, Recall and Roc_AUC score
# NOTE(review): ROC AUC is computed from hard class predictions here, not from
# predicted probabilities — confirm this is the intended definition.
print("Precision score:", metrics.precision_score(test_y, pred_y))
print("Recall score:", metrics.recall_score(test_y, pred_y))
print("ROC_AUC score:", metrics.roc_auc_score(test_y, pred_y))

#### Make prediction for backward selected feature model

In [None]:
# Evaluate the model built on the backward-selected features.
# clf_imp is shared by the forward and backward experiments, so fit it on the
# backward-selected training features here; this guarantees the model matches
# the feature subset being evaluated, whichever cell fitted it last.
clf_imp.fit(train_imp_backward, train_y)

# Make predictions on the backward-selected test features
pred_y = clf_imp.predict(test_imp_backward)

# Model Accuracy, how often is the classifier correct?
print("Accuracy score:", metrics.accuracy_score(test_y, pred_y))

# Precision, Recall and Roc_AUC score
# NOTE(review): ROC AUC is computed from hard class predictions here, not from
# predicted probabilities — confirm this is the intended definition.
print("Precision score:", metrics.precision_score(test_y, pred_y))
print("Recall score:", metrics.recall_score(test_y, pred_y))
print("ROC_AUC score:", metrics.roc_auc_score(test_y, pred_y))