# Fake News Document Classification
### Baseline Methods

In [1]:
import nltk
import spacy
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, f1_score

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('./raw_data/fulltrain.csv', header=None)
data.columns = ['cls', 'text']
data.head()

Unnamed: 0,cls,text
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [4]:
cls_names = { 0 : "satire", 1 : "hoax", 2 : "propaganda", 3 : "reliable"}
data['cls'] = data['cls'] - 1
data['cls'].map(cls_names).value_counts()

hoax          17870
propaganda     9995
satire         6942
Name: cls, dtype: int64

In [5]:
test_data = pd.read_csv('./raw_data/balancedtest.csv', header=None)
test_data.columns = ['cls', 'text']
test_data['cls'] = test_data['cls'] - 1
test_data['cls'].map(cls_names).value_counts()

satire        750
hoax          750
propaganda    750
reliable      750
Name: cls, dtype: int64

In [5]:
def train(model, pca=False):
    X_train, X_val, y_train, y_val = train_test_split(data['text'], data['cls'], test_size=0.2, random_state=42)

    tfidf = TfidfVectorizer(
                ngram_range=(1, 2),
                stop_words=stopwords.words('english'),
                max_df=0.8,
                min_df=10,
                max_features=5096
                )

    X_train = tfidf.fit_transform(X_train).toarray()
    X_val = tfidf.transform(X_val).toarray()

    if pca:
        svd = TruncatedSVD(n_components=32)
        X_train = svd.fit_transform(X_train)
        X_test = svd.transform(X_test)

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    print('Validation Performance\n')
    print(classification_report(y_val, y_val_pred))
    
    x_test = tfidf.transform(test_data['text']).toarray()
    x_test = pca.transform(x_test) if pca else x_test
    
    y_pred = model.predict(x_test)
    
    print('Test Performance\n')
    print(classification_report(test_data['cls'], y_pred))
    
    print('Test Set Micro F1 Score')
    print(f1_score(test_data['cls'], y_pred, average='micro'))
    
    print('\n')
    print(classification_report(test_data['cls'], y_pred, output_dict=True))

Baseline - Logistic Regression

In [38]:
# DummyClassifier(strategy='stratified')
# GaussianNB()
# LogisticRegression()
# XGBClassifier()

train(LogisticRegression())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation Performance

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      2793
           1       0.97      0.93      0.95      1371
           2       0.95      0.98      0.97      3587
           3       0.96      0.92      0.94      2020

    accuracy                           0.95      9771
   macro avg       0.96      0.95      0.95      9771
weighted avg       0.95      0.95      0.95      9771

Test Performance

              precision    recall  f1-score   support

           0       0.84      0.75      0.79       750
           1       0.80      0.41      0.55       750
           2       0.57      0.84      0.68       750
           3       0.80      0.89      0.84       750

    accuracy                           0.72      3000
   macro avg       0.75      0.72      0.72      3000
weighted avg       0.75      0.72      0.72      3000

Test Set Micro F1 Score
0.7243333333333333


{'0': {'precision': 0.8397626112759644, 're

In [7]:
train(XGBClassifier())

Validation Performance

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      2793
           1       0.96      0.94      0.95      1371
           2       0.95      0.98      0.97      3587
           3       0.96      0.91      0.94      2020

    accuracy                           0.95      9771
   macro avg       0.95      0.95      0.95      9771
weighted avg       0.95      0.95      0.95      9771

Test Performance

              precision    recall  f1-score   support

           0       0.76      0.67      0.72       750
           1       0.65      0.33      0.44       750
           2       0.55      0.75      0.63       750
           3       0.71      0.89      0.79       750

    accuracy                           0.66      3000
   macro avg       0.67      0.66      0.64      3000
weighted avg       0.67      0.66      0.64      3000

Test Set Micro F1 Score
0.66


{'0': {'precision': 0.7628398791540786, 'recall': 0.67333

In [1]:
from transformers import AutoModel

model = AutoModel.from_pretrained('xlnet-base-cased')



  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# for name, param in model.named_parameters():
#     if name.startswith('layer.11.ff'):
#         param.requires_grad = False

for n in range(3):
    print(model.layer[-(n+1)])

XLNetLayer(
  (rel_attn): XLNetRelativeAttention(
    (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (ff): XLNetFeedForward(
    (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (layer_1): Linear(in_features=768, out_features=3072, bias=True)
    (layer_2): Linear(in_features=3072, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (activation_function): GELUActivation()
  )
  (dropout): Dropout(p=0.1, inplace=False)
)
XLNetLayer(
  (rel_attn): XLNetRelativeAttention(
    (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (ff): XLNetFeedForward(
    (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (layer_1): Linear(in_features=768, out_features=3072, bias=True)
    (layer_2): Linear(in_features=3072, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
   

In [7]:
bert = AutoModel.from_pretrained('bert-base-uncased')
for name, param in bert.named_parameters():
    print(name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.token_type_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
encoder.layer.0.attention.self.query.weight
encoder.layer.0.attention.self.query.bias
encoder.layer.0.attention.self.key.weight
encoder.layer.0.attention.self.key.bias
encoder.layer.0.attention.self.value.weight
encoder.layer.0.attention.self.value.bias
encoder.layer.0.attention.output.dense.weight
encoder.layer.0.attention.output.dense.bias
encoder.layer.0.attention.output.LayerNorm.weight
encoder.layer.0.attention.output.LayerNorm.bias
encoder.layer.0.intermediate.dense.weight
encoder.layer.0.intermediate.dense.bias
encoder.layer.0.output.dense.weight
encoder.layer.0.output.dense.bias
encoder.layer.0.output.LayerNorm.weight
encoder.layer.0.output.LayerNorm.bias
encoder.layer.1.attention.self.query.weight
encoder.layer.1.attention.self.query.bias
encoder.layer.1.attention.self.key.weight
encoder.layer.1.attention.self.key

In [8]:
match = [
    'encoder.layer.11', 'encoder.rel_embeddings.weight', 'encoder.LayerNorm', 'pooler'
]

debert = AutoModel.from_pretrained('microsoft/deberta-v3-base')
for name, param in debert.named_parameters():
    if any(name.startswith(m) for m in match):
        print(name)
        continue



Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


encoder.layer.11.attention.self.query_proj.weight
encoder.layer.11.attention.self.query_proj.bias
encoder.layer.11.attention.self.key_proj.weight
encoder.layer.11.attention.self.key_proj.bias
encoder.layer.11.attention.self.value_proj.weight
encoder.layer.11.attention.self.value_proj.bias
encoder.layer.11.attention.output.dense.weight
encoder.layer.11.attention.output.dense.bias
encoder.layer.11.attention.output.LayerNorm.weight
encoder.layer.11.attention.output.LayerNorm.bias
encoder.layer.11.intermediate.dense.weight
encoder.layer.11.intermediate.dense.bias
encoder.layer.11.output.dense.weight
encoder.layer.11.output.dense.bias
encoder.layer.11.output.LayerNorm.weight
encoder.layer.11.output.LayerNorm.bias
encoder.rel_embeddings.weight
encoder.LayerNorm.weight
encoder.LayerNorm.bias
