# Classification w/ Ensemble Models

In [138]:
import numpy as np
import pandas as pd
import pickle
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

from clean_eng_funcs import preprocess_for_bow
from topic_model import topics_in_doc

In [5]:
# Get LDA Mallet model
file = open('mallet.p', 'rb')
mallet_model = pickle.load(file)
file.close()

# Get DataFrame of articles
file = open('df4cluster.p', 'rb')
df = pickle.load(file)
file.close()

## Multiclass RandomForest Classification
- Multiclass classification predicts slightly better than simply predicting the dominant class

In [12]:
# Only use article metrics
X = df.drop(columns=['headline', 'body', 'url', 'date', 'source'])
y = df['source'].astype('category')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [15]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [104]:
rf = RandomForestClassifier()
params = {
    'n_estimators': [50, 75, 100, 125, 150, 200],
    'criterion': ['gini'],
    'max_depth': [4, 5, 6],
    'class_weight': ['balanced', 'balanced_subsample', None],
}

gs_rf = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    cv=5,
)

In [105]:
gs_rf.fit(X_train, y_train)
rf_model = gs_rf.best_estimator_
print(30*'-')
print('Best params:')
print(gs_rf.best_params_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:   18.7s finished


------------------------------
Best params:
{'class_weight': None, 'criterion': 'gini', 'max_depth': 4, 'n_estimators': 150}


In [106]:
def get_scores(model, test_data, test_labels):
    if len(test_data) != len(test_labels):
        return 'Data shapes incorrect'

    preds = model.predict(test_data)
    _f1 = round(f1_score(test_labels, preds, average='macro'), 4)
    _acc = round(accuracy_score(test_labels, preds), 4)
    _pre = round(precision_score(test_labels, preds, average='macro'), 4)
    _rec = round(recall_score(test_labels, preds, average='macro'), 4)
    print('F1 Score:', _f1)
    print('Accuracy:', _acc)
    print('Precision:', _pre)
    print('Recall:', _rec)
    print('--------------')
    print(confusion_matrix(test_labels, preds))

In [107]:
get_scores(clf, X_test, y_test)
preds = clf.predict(X_test)

F1 Score: 0.402
Accuracy: 0.6125
Precision: 0.4321
Recall: 0.4125
--------------
[[ 6  3  0 18]
 [ 0 16  0 11]
 [ 0  3  0 12]
 [ 5  8  2 76]]


In [108]:
get_scores(gs_rf, X_test, y_test)
gs_preds = gs_rf.predict(X_test)

F1 Score: 0.4316
Accuracy: 0.6562
Precision: 0.5364
Recall: 0.4317
--------------
[[ 6  1  0 20]
 [ 0 16  0 11]
 [ 0  1  0 14]
 [ 1  7  0 83]]



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [109]:
rf_df = pd.DataFrame()
gs_df = pd.DataFrame()

rf_df['true_label'] = y_test
rf_df['pred_label'] = preds
rf_df['correct'] = np.where(rf_df['true_label'] == rf_df['pred_label'], 1, 0)
gs_df['true_label'] = y_test
gs_df['pred_label'] = gs_preds
gs_df['correct'] = np.where(gs_df['true_label'] == gs_df['pred_label'], 1, 0)

In [137]:
fig = px.density_heatmap(gs_df, x='true_label', y='pred_label')
fig.update_layout(title=dict(text='RF w/ GridSearchCV'))
fig.show()

fig = px.density_heatmap(rf_df, x='true_label', y='pred_label')
fig.update_layout(title=dict(text='Standard RF'))
fig.show()

In [145]:
y_cctv = np.where(df['source'] == 'CCTV', 1, 0)

In [146]:
X_train, X_test, y_train, y_test = train_test_split(X, y_cctv, test_size=0.2, random_state=123)

In [147]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [148]:
get_scores(clf, X_test, y_test)
preds = clf.predict(X_test)

F1 Score: 0.7631
Accuracy: 0.8812
Precision: 0.8036
Recall: 0.7367
--------------
[[127   6]
 [ 13  14]]


### Notes:
- RandomForest w/ GridSearchCV vs Standard RandomForest:
    - Predicted 0 CNN articles
    - Reduced Reuters articles misclassified as ABC
    - Overall both fairly poor
- Binary Classifier:
    - CCTV vs. Not CCTV performed worse than multiclassification with respect to CCTV

## One vs. Rest Classifier
- Performance approximately the same as RF, RF w/GS

In [142]:
ovr = OneVsRestClassifier(SVC()).fit(X_train, y_train)
get_scores(ovr, X_test, y_test)

F1 Score: 0.4173
Accuracy: 0.6375
Precision: 0.5446
Recall: 0.4235
--------------
[[ 6  1  0 20]
 [ 0 16  0 11]
 [ 0  2  0 13]
 [ 0 11  0 80]]



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

