### Install Required Packages

In [1]:
! pip install numpy pandas scikit-learn wordcloud matplotlib



### Imports

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

### Read data

In [3]:
# Load the training CSV and relabel its columns in a single expression;
# set_axis raises if the file does not have exactly four columns.
train_df = (
    pd.read_csv(
        '../assist_material/datasets/extracted/datasets2020/datasets/q1/train_small.csv',
        sep=',',
    )
    .set_axis(['id', 'title', 'content', 'label'], axis=1)
)

### Benchmark Models

For benchmarking we use the following combinations: SVM with TF-IDF, Random Forest with TF-IDF,
SVM with TF-IDF + SVD, and Random Forest with TF-IDF + SVD.


In [4]:
# Unfitted building blocks shared by the four benchmark pipelines.
# random_state is fixed on the two randomised steps (truncated SVD and the
# random forest) so that the benchmark scores are reproducible across runs.
vectorizer = TfidfVectorizer()
svd = TruncatedSVD(n_components=300, random_state=42)

svm = SVC(kernel='linear')
random_forest = RandomForestClassifier(n_estimators=1000, max_features='sqrt', random_state=42)

# TF-IDF features passed straight to each classifier.
svm_tfidf = make_pipeline(vectorizer, svm)
random_forest_tfidf = make_pipeline(vectorizer, random_forest)

# TF-IDF features reduced to 300 SVD components before classification.
svm_tfidf_svd = make_pipeline(vectorizer, svd, svm)
random_forest_tfidf_svd = make_pipeline(vectorizer, svd, random_forest)


### Initialize data with labels in order to feed the classifiers

In [5]:
# Each document is the title followed by the content, separated by a space.
# Missing values are replaced with '' first: NaN + str stays NaN, and a NaN
# document makes the TF-IDF vectorizer fail.
X = train_df['title'].fillna('') + ' ' + train_df['content'].fillna('')
# Target class of each document.
y = train_df['label']

### SVM with TF-IDF

In [6]:
# 5-fold cross-validation of the SVM + TF-IDF pipeline, folds run in parallel
# (n_jobs=-1); only test-fold scores are collected.
scores_svm_tfidf = cross_validate(
    estimator=svm_tfidf,
    X=X,
    y=y,
    cv=5,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
    return_train_score=False,
    n_jobs=-1,
)
print('SVM + tfidf', scores_svm_tfidf)

### Random Forest with TF-IDF

In [7]:
# 5-fold cross-validation of the Random Forest + TF-IDF pipeline, folds run in
# parallel (n_jobs=-1); only test-fold scores are collected.
scores_random_forest_tfidf = cross_validate(
    estimator=random_forest_tfidf,
    X=X,
    y=y,
    cv=5,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
    return_train_score=False,
    n_jobs=-1,
)
print('Random Forest + tfidf', scores_random_forest_tfidf)

### SVM with SVD

In [8]:
# 5-fold cross-validation of the SVM + TF-IDF + SVD pipeline, folds run in
# parallel (n_jobs=-1); only test-fold scores are collected.
scores_svm_tfidf_svd = cross_validate(
    estimator=svm_tfidf_svd,
    X=X,
    y=y,
    cv=5,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
    return_train_score=False,
    n_jobs=-1,
)
print('SVM + tfidf + SVD', scores_svm_tfidf_svd)

### Random Forest with SVD

In [9]:
# 5-fold cross-validation of the Random Forest + TF-IDF + SVD pipeline, folds
# run in parallel (n_jobs=-1); only test-fold scores are collected.
scores_random_forest_tfidf_svd = cross_validate(
    estimator=random_forest_tfidf_svd,
    X=X,
    y=y,
    cv=5,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
    return_train_score=False,
    n_jobs=-1,
)
print('Random Forest + tfidf + SVD', scores_random_forest_tfidf_svd)

Random Forest + tfidf + SVD {'fit_time': array([1480.78302121, 1501.0908289 , 1498.23283124, 1493.84269404,
       1516.39122272]), 'score_time': array([8.45600486, 8.6007936 , 8.73060846, 8.27172494, 7.3048346 ]), 'test_accuracy': array([0.94411426, 0.93843697, 0.93825956, 0.94118169, 0.93656849]), 'test_precision_macro': array([0.94663006, 0.93985527, 0.94171644, 0.94517633, 0.93867251]), 'test_recall_macro': array([0.93017714, 0.9219483 , 0.92018643, 0.92458141, 0.92278172]), 'test_f1_macro': array([0.93791101, 0.9302755 , 0.93008724, 0.93408672, 0.93026265])}


## Beat the Benchmark classifier

To achieve the best performance in terms of both accuracy and execution time, the best choice is Random Forest with
SVD. By tuning this model further we can easily achieve 96% accuracy. The hyper-parameters below were obtained through
the tuning phase, which was a time-consuming process: approximately 20 different combinations were evaluated.
Regarding preprocessing: the input text is first stripped of stop words and converted to lower case,
and is finally vectorized with TF-IDF.

In [None]:
# scikit-learn's built-in English stop words extended with a few extra tokens.
stop_words = ENGLISH_STOP_WORDS.union(['will', 's', 't', 'one', 'new', 'said', 'say', 'says'])

# `stop_words` is a frozenset, but recent scikit-learn releases validate the
# vectorizer's stop_words parameter and accept only 'english', a list or None,
# so a (sorted, hence deterministic) list is passed instead.
vectorizer_tuned = TfidfVectorizer(lowercase=True, stop_words=sorted(stop_words), ngram_range=(1, 1), max_features=45000)
# random_state is fixed on the randomised steps so the reported scores are reproducible.
svd_tuned = TruncatedSVD(n_components=1000, random_state=42)

# Parallelism lives in the forest itself (n_jobs=-1), so cross_validate below
# runs the folds sequentially.
random_forest_tuned = RandomForestClassifier(n_estimators=1000, max_features='sqrt', n_jobs=-1, random_state=42)

random_forest_tfidf_svd_tuned = make_pipeline(vectorizer_tuned, svd_tuned, random_forest_tuned)

# 5-fold cross-validation with the same metrics as the benchmark pipelines.
scores_random_forest_tfidf_svd_tuned = cross_validate(random_forest_tfidf_svd_tuned, X, y,
                        scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
                        cv=5,
                        return_train_score=False)
print('Random Forest + tfidf + SVD', scores_random_forest_tfidf_svd_tuned)

### Generating stats table

In [10]:
# Cross-validation results, one dict per table column (same order as colLabels below).
score_sets = [
    scores_svm_tfidf,
    scores_random_forest_tfidf,
    scores_svm_tfidf_svd,
    scores_random_forest_tfidf_svd,
    scores_random_forest_tfidf_svd_tuned,
]
# Keys of the score dicts, one per table row (same order as rowLabels below).
metric_keys = ['test_accuracy', 'test_precision_macro', 'test_recall_macro', 'test_f1_macro']

# data_table[row][col] is the mean over the folds of one metric for one model.
data_table = [
    [np.mean(scores[metric], dtype='float64') for scores in score_sets]
    for metric in metric_keys
]

# The same grid formatted as strings with five decimals.
cell_text = [[f'{value:1.5f}' for value in means] for means in data_table]

# The figure only hosts the table: both axes are hidden and the frame is toggled.
fig = plt.figure(dpi=150)
ax = plt.gca()
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
plt.box(on=None)
plt.subplots_adjust(left=0.2, bottom=0.2)

the_table = plt.table(
    cellText=cell_text,
    colLabels=['SVM (BoW)', 'Random Forest (BoW)', 'SVM (SVD)', 'Random Forest (SVD)', 'My Method'],
    rowLabels=['Accuracy', 'Precision', 'Recall', 'F1-Score'],
    colColours=['lightsteelblue'] * 5,
    rowColours=['lightsteelblue'] * 4,
    loc='center',
)
# Stretch the rows vertically (x1.5) and leave the column widths unchanged.
the_table.scale(1, 1.5)
plt.show()