# Random Forest Model - Feature Selection

BEST MODEL G1 (high default score):
- Random Search with CV - BUT
- `n_estimators`: 200
- `n_features`: 200

BEST MODEL G2 (low default score):
- Random search with CV
- `n_estimators`: 1000
- `max_features`: 'auto'
- `max_depth`: 10
- `bootstrap`: False

## Import packages and choose data

In [None]:
# Preprocessing and encoding variables
import pandas as pd
import numpy as np
from random import sample
from time import time

# Using scikit-learn to split data into training and testing sets
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Visualising feature importance and making plots
import matplotlib.pyplot as plt
import seaborn as sns

# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# Confusion matrix
from sklearn.metrics import plot_confusion_matrix, precision_score, recall_score, roc_auc_score

# Selecting best features
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel

## Single genome

In [None]:
# Load the feature table for a single genome (file G15). `.iloc[:, 1:]` discards
# the first CSV column (presumably a saved row index — TODO confirm).
G = pd.read_csv('../input/genomespart2/G15.features.csv').iloc[:, 1:]

## Pre-processing / Data preparation

1. Encode the categorical target labels as integers (label encoding)
2. Split data into features and labels
3. Convert to arrays
4. Split data into training and testing sets

In [None]:
# Report (rows, columns) of the raw table, before rows with missing values are dropped.
print('The shape of our dataframe is:', G.shape)

In [None]:
# Remove every row that contains at least one missing value (dropna defaults).
G = G.dropna()

In [None]:
# Report the shape again so the number of rows removed by dropna can be seen.
print('The shape of our dataframe is:', G.shape)

### Encoding target values

In [None]:
def encode_feature(array):
    """Map categorical values to integer codes.

    A fresh ``LabelEncoder`` is fitted on ``array`` and then used to
    transform that same array.

    :param array: array-like of categorical values to be encoded
    :return: numerical array with one integer code per input element
    """
    fitted = preprocessing.LabelEncoder().fit(array)
    return fitted.transform(array)

In [None]:
# Display names for the two classes.
# NOTE(review): presumably listed in the same (sorted) order LabelEncoder
# assigns codes, i.e. CDS -> 0, LORF -> 1 — confirm against G["Type"].
class_names = ['CDS', 'LORF']
# Raw (string) class label of every row, taken from the "Type" column.
targets = G["Type"].values
print(targets)

In [None]:
# Replace the raw class labels with integer codes using encode_feature.
targets = encode_feature(targets)
print(targets)

In [None]:
# Summarise the cleaned table: overall shape, then row and column counts.
print('The shape of our dataframe is:', G.shape)
print('Rows:', len(G))
print('Columns:', len(G.columns))

### Selecting features and targets and converting data to arrays

In [None]:
# The encoded targets are what the model has to predict
labels = targets

# Everything except the label column and the two identifier columns is a feature
features = G.drop(columns=['Type', 'Genome', 'Dataset'])

# Keep the column names; the numpy array below no longer carries them
feature_names = features.columns.tolist()

# Plain numpy array for scikit-learn
features = np.array(features)

In [None]:
# Shape of the feature matrix after the label/identifier columns were removed.
print('The shape of our features are:', features.shape)

### Split into training and testing sets

In [None]:
# Split the data into training and testing sets -> x = features and y = labels/targets
# Hold out 25% of the rows for testing; x = features, y = labels/targets.
# The fixed random_state makes the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Sanity check: feature and label arrays of each split must have the same number of rows.
print('Training Features Shape:', train_x.shape)
print('Training Labels Shape:', train_y.shape)
print('Testing Features Shape:', test_x.shape)
print('Testing Labels Shape:', test_y.shape)

### Selecting features based on feature importance

In [None]:
# Turn the list of column names into a numpy array of names.
feature_names = np.array(feature_names)
# Bare expression: displayed as the cell output in the notebook.
feature_names

In [None]:
# Fit a random forest on the training data and wrap it in SelectFromModel.
# With no explicit threshold, SelectFromModel keeps the features whose
# importance is at least the mean importance.
tic = time()
sel = SelectFromModel(
    RandomForestClassifier(
        n_estimators=200,
        # 'sqrt' is what 'auto' meant for classifiers. 'auto' was deprecated in
        # scikit-learn 1.1 and removed in 1.3, where passing it raises an error.
        max_features='sqrt',
        max_depth=10,
        bootstrap=False,
        random_state=42,
    )
)
sel.fit(train_x, train_y)

# Wall-clock time spent constructing and fitting the selector
toc = time()
print(f"Done in {toc - tic:.3f}s")

In [None]:
# Reduce both splits to the columns chosen by the fitted selector. The selector
# was fitted on the training split only, so the test split does not influence
# which features are kept.
train_imp = sel.transform(train_x)
test_imp = sel.transform(test_x)

#### Fit new model with only selected features

In [None]:
# Create a random forest classifier with the same hyperparameters as the
# estimator used for feature selection.
# max_features='sqrt' is the explicit equivalent of the former 'auto' option,
# which was deprecated in scikit-learn 1.1 and removed in 1.3.
clf_imp = RandomForestClassifier(
    n_estimators=200,
    max_features='sqrt',
    max_depth=10,
    bootstrap=False,
    random_state=42,
)

In [None]:
# Train the random forest with only selected features
clf_imp.fit(train_imp, train_y)

#### Make prediction with new model

In [None]:
# Predict hard class labels for the held-out test split
pred_y = clf_imp.predict(test_imp)

# Probability of the positive class (second column = label 1), needed for ROC AUC
proba_y = clf_imp.predict_proba(test_imp)[:, 1]

# Model Accuracy, how often is the classifier correct?
print("Accuracy score:", metrics.accuracy_score(test_y, pred_y))

# Precision and recall are computed from the hard predictions
print("Precision score:", metrics.precision_score(test_y, pred_y))
print("Recall score:", metrics.recall_score(test_y, pred_y))

# ROC AUC is a ranking metric, so it must be given continuous scores.
# Passing hard 0/1 predictions collapses the curve to a single threshold
# and reports balanced accuracy rather than the true AUC.
print("ROC_AUC score:", metrics.roc_auc_score(test_y, proba_y))

In [None]:
# One row per original feature: whether the selector kept it, and its name.
selected_feat = pd.DataFrame({
    'Bool': sel.get_support(),
    'Feat_name': feature_names,
})
selected_feat

In [None]:
# Keep only the rows of features the selector retained ('Bool' is the boolean mask).
kept_mask = selected_feat['Bool']
important_feat = selected_feat[kept_mask]
important_feat