# Random Forest Classifier

In [10]:
# Import libraries for RandomForestClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

In [11]:
path = '/content/drive/MyDrive/Guvi Files/Final Project2-V2/'

train_tweets = pd.read_pickle(path + 'data/train_tweets_clean.pkl')
y = train_tweets['label']
X = train_tweets['lem_tweet']

In [12]:
train_tweets.head()

Unnamed: 0,label,tweet,clean_tweet,tokens,lem_tweet
0,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...,"[father, dysfunctional, selfish, drags, kids, ...",father dysfunctional selfish drag kid dysfunct...
1,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i cant use cause they...,"[thanks, lyft, credit, cant, use, cause, dont,...",thanks lyft credit cant use cause dont offer w...
2,0,bihday your majesty,bihday your majesty,"[bihday, majesty]",bihday majesty
3,0,#model i love u take with u all the time in ...,model i love u take with u all the time in u...,"[model, love, u, take, u, time, ur]",model love u take u time ur
4,0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, motivation]",factsguide society motivation


## Vectorize

In [13]:
# Vectorize using TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# TF-IDF feature matrix
tfidf = tfidf_vectorizer.fit_transform(X)

In [14]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(tfidf, y, test_size=0.2, random_state=42)

In [15]:
# Save vector as pickle for test prediction later
import pickle
pickle.dump(tfidf_vectorizer, open(path + 'data/rfc_tfidf.pkl', 'wb'))

## Compile RFC Model

In [30]:
rfc_model = RandomForestClassifier(n_estimators=500, random_state=0)
rfc_model.fit(X_train, y_train)

In [31]:
# Predict
predictions = rfc_model.predict(X_test)

## Model Evaluation

In [32]:
# get accuracy, precision, recall & F1 score
rfc_accuracy = accuracy_score(y_test, predictions)
rfc_precision = precision_score(y_test, predictions)
rfc_recall = recall_score(y_test, predictions)
rfc_f1 = f1_score(y_test, predictions)

print('Accuracy: ', rfc_accuracy)
print('Precision: ', rfc_precision)
print('Recall: ', rfc_recall)
print('F1 Score: ', rfc_f1)

Accuracy:  0.9483576024381984
Precision:  0.7191489361702128
Recall:  0.41421568627450983
F1 Score:  0.5256609642301711


In [33]:
# keep the metrics in a dict
rfc_metrics = {}
rfc_metrics['RFC - Base'] = {
    'accuracy': rfc_accuracy,
    'precision': rfc_precision,
    'recall': rfc_recall,
    'f1_score': rfc_f1
}

## Hyperparameter tuning

In [25]:
# hyperparameter tuning using GridSearchCV
rfc_param = {
    'n_estimators': [200,300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
rfc_gs = GridSearchCV(rfc_model, rfc_param, cv=5)
rfc_gs.fit(X_train, y_train)

In [26]:
# generate score with .best_score_ and hyperparemeters with .best_params_
print('Best Score: ', rfc_gs.best_score_)
print('Best Params: ', rfc_gs.best_params_)
print('Best Estimator: ', rfc_gs.best_estimator_)

Best Score:  0.9321875999623671
Best Params:  {'criterion': 'gini', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 300}
Best Estimator:  RandomForestClassifier(max_depth=8, n_estimators=300, random_state=0)


In [34]:
# Build rfc model with tuned parameters
rfc_model_tuned = RandomForestClassifier(max_depth=8, n_estimators=300, random_state=0
                                         ,criterion='gini', max_features='sqrt')
rfc_model_tuned.fit(X_train, y_train)

In [39]:
# Predict
predictions_tuned = rfc_model_tuned.predict(X_test)

In [40]:
# Evaluate model
rfc_accuracy_tuned = accuracy_score(y_test, predictions_tuned)
rfc_precision_tuned = precision_score(y_test, predictions_tuned)
rfc_recall_tuned = recall_score(y_test, predictions_tuned)
rfc_f1_tuned = f1_score(y_test, predictions_tuned)

print('Accuracy: ', rfc_accuracy_tuned)
print('Precision: ', rfc_precision_tuned)
print('Recall: ', rfc_recall_tuned)
print('F1 Score: ', rfc_f1_tuned)

Accuracy:  0.9309177108025737
Precision:  0.0
Recall:  0.0
F1 Score:  0.0


## Conclusion
1. Base model performance is poorer than SVM
2. Tuned model has precision & recall as 0, making the model useless

In [41]:
# store the metrics in dict
rfc_metrics['RFC - Tuned'] = {
    'accuracy': rfc_accuracy_tuned,
    'precision': rfc_precision_tuned,
    'recall': rfc_recall_tuned,
    'f1_score': rfc_f1_tuned
}

#Write the metrics to an excel file
with pd.ExcelWriter(path + 'models/RFC_model_metrics.xlsx') as writer:
    df = pd.DataFrame.from_dict(rfc_metrics, orient = 'index')
    df.to_excel(writer, sheet_name='RFC')