# Fake News Document Classification
### Baseline Methods

In [13]:
import nltk
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [7]:
# Human-readable names for the zero-based class ids used throughout the notebook.
cls_names = {0: "satire", 1: "hoax", 2: "propaganda", 3: "reliable"}

# Training split: the CSV has no header row, so the two columns are named here.
train_data = pd.read_csv('../data/augmented_train.csv', header=None)
train_data.columns = ['cls', 'text']

# Shift the labels down by one so they line up with the keys of cls_names.
train_data = train_data.assign(cls=train_data['cls'].sub(1))

# Per-class row counts of the training split.
train_data['cls'].map(cls_names).value_counts()

propaganda    26182
satire        22454
reliable      15926
hoax          11094
Name: cls, dtype: int64

In [6]:
# Validation split: headerless CSV with the label first and the text second.
val_data = pd.read_csv('../data/validation.csv', header=None)
val_data.columns = ['cls', 'text']

# Shift the labels down by one so they line up with the keys of cls_names.
val_data = val_data.assign(cls=val_data['cls'].sub(1))

# Per-class row counts of the validation split.
val_data['cls'].map(cls_names).value_counts()

propaganda    3273
satire        2806
reliable      1991
hoax          1387
Name: cls, dtype: int64

In [8]:
# Test split: headerless CSV with the label first and the text second.
test_data = pd.read_csv('../data/balancedtest.csv', header=None)
test_data.columns = ['cls', 'text']

# Shift the labels down by one so they line up with the keys of cls_names.
test_data = test_data.assign(cls=test_data['cls'].sub(1))

# Per-class row counts of the test split.
test_data['cls'].map(cls_names).value_counts()

satire        750
hoax          750
propaganda    750
reliable      750
Name: cls, dtype: int64

In [14]:
def train(model, pca=False, n_components=32):
    """Fit ``model`` on TF-IDF features and print validation/test metrics.

    Reads the module-level ``train_data``, ``val_data`` and ``test_data``
    frames (columns ``cls`` and ``text``). The TF-IDF vectorizer, and the
    optional SVD projection, are fitted on the training split only and then
    applied unchanged to the validation and test splits.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    pca : bool, default False
        When true, project the TF-IDF features with ``TruncatedSVD`` before
        fitting the model.
    n_components : int, default 32
        Output dimensionality of the SVD projection; ignored unless ``pca``
        is true.

    Returns
    -------
    None
        All results are printed: the validation classification report, the
        test classification report, the test micro-F1 and accuracy, and the
        test report again as a dict.
    """
    X_train, y_train = train_data['text'], train_data['cls']
    X_val, y_val = val_data['text'], val_data['cls']

    tfidf = TfidfVectorizer(
                ngram_range=(1, 2),
                stop_words=stopwords.words('english'),
                max_df=0.8,
                min_df=10,
                max_features=5096
                )

    # Features are densified, as in the original code, so that estimators
    # which reject sparse input (e.g. GaussianNB) can still be passed in.
    X_train = tfidf.fit_transform(X_train).toarray()
    X_val = tfidf.transform(X_val).toarray()

    # Keep a handle on the fitted reducer so every split goes through the
    # exact same projection; None means "no dimensionality reduction".
    svd = None
    if pca:
        svd = TruncatedSVD(n_components=n_components)
        X_train = svd.fit_transform(X_train)
        # The validation features must be projected too, otherwise their
        # width no longer matches what the model was fitted on.
        X_val = svd.transform(X_val)

    model.fit(X_train, y_train)

    y_val_pred = model.predict(X_val)

    print('Validation Performance\n')
    print(classification_report(y_val, y_val_pred))
    
    x_test = tfidf.transform(test_data['text']).toarray()
    if svd is not None:
        # Use the fitted SVD object, not the boolean ``pca`` flag.
        x_test = svd.transform(x_test)
    
    y_pred = model.predict(x_test)
    
    print('Test Performance\n')
    print(classification_report(test_data['cls'], y_pred))
    
    print('Test Set Micro F1 Score')
    print(f1_score(test_data['cls'], y_pred, average='micro'))
    
    print('Test Set Accuracy Score')
    print(accuracy_score(test_data['cls'], y_pred))
    
    print('\n')
    print(classification_report(test_data['cls'], y_pred, output_dict=True))

### Baseline - Logistic Regression

In [15]:
# Baseline: logistic regression with default settings on the TF-IDF features.
# NOTE(review): the recorded output for this run starts with a solver warning
# that the iteration limit was reached, so the fit may not have converged;
# consider passing a larger max_iter and re-running.
train(LogisticRegression())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation Performance

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      2806
           1       0.97      0.96      0.97      1387
           2       0.98      0.98      0.98      3273
           3       0.96      0.94      0.95      1991

    accuracy                           0.97      9457
   macro avg       0.97      0.96      0.96      9457
weighted avg       0.97      0.97      0.97      9457

Test Performance

              precision    recall  f1-score   support

           0       0.84      0.76      0.80       750
           1       0.79      0.44      0.57       750
           2       0.60      0.82      0.69       750
           3       0.78      0.91      0.84       750

    accuracy                           0.73      3000
   macro avg       0.75      0.73      0.72      3000
weighted avg       0.75      0.73      0.72      3000

Test Set Micro F1 Score
0.732
Test Set Accuracy Score
0.732


{'0': {'precision': 0.8370

In [7]:
# Baseline: gradient-boosted trees (XGBoost defaults) on the same TF-IDF features.
# NOTE(review): the recorded output for this cell reports a validation support
# of 9771 rows, while the validation split loaded above has 9457, and the
# execution count is out of order; the output looks like it comes from an
# earlier run on different data. Re-run after Restart Kernel -> Run All.
train(XGBClassifier())

Validation Performance

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      2793
           1       0.96      0.94      0.95      1371
           2       0.95      0.98      0.97      3587
           3       0.96      0.91      0.94      2020

    accuracy                           0.95      9771
   macro avg       0.95      0.95      0.95      9771
weighted avg       0.95      0.95      0.95      9771

Test Performance

              precision    recall  f1-score   support

           0       0.76      0.67      0.72       750
           1       0.65      0.33      0.44       750
           2       0.55      0.75      0.63       750
           3       0.71      0.89      0.79       750

    accuracy                           0.66      3000
   macro avg       0.67      0.66      0.64      3000
weighted avg       0.67      0.66      0.64      3000

Test Set Micro F1 Score
0.66


{'0': {'precision': 0.7628398791540786, 'recall': 0.67333