# Regression analysis - Sharechat annotated claims dataset

Research - https://arxiv.org/abs/2010.13387

In [156]:
import sys
sys.path.append("/working-files")
import os
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import confusion_matrix
import numpy as np
import re
import string
from pprint import pprint
import json
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import random
import pandas as pd
import logging
from datetime import datetime, timedelta
import time
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
pd.options.mode.chained_assignment = None
import xgboost
from xgboost import XGBClassifier, DMatrix
from xgboost import to_graphviz
from sklearn.tree import export_graphviz
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import joblib

## Load data

In [27]:
df = pd.read_csv("working-files/sharechat_annotated_transformed.csv", index_col=None)

In [28]:
# Drop columns that won't be required
drop_cols = [
        "tag_name", "tag_translation", "timestamp", 
        "caption", "text", "filename",
        "datetime", "extracted_text", "named_entities", "combined_text"]

## Preprocessing

In [104]:
# Train test split
X = df.drop("claim", axis=1)
y = df["claim"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [105]:
def prepare_data(df):
    # Drop columns that won't be used 
    df.drop(drop_cols, axis=1, inplace=True)
    return df

In [106]:
X_train = prepare_data(X_train)

In [107]:
# Standardize numeric variables
scaler = StandardScaler()
num = ["n_words", "n_hashtags", "likes", "external_shares", "entities_count"]
num_cols = pd.DataFrame(scaler.fit_transform(X_train[num]))
num_cols.columns = num

In [108]:
# Convert categorical columns
cat = [
    "media_type", "contains_video", "contains_image", 
    "contains_relevant_meme", "bucket_name", "hour"]
X_train[cat] = X_train[cat].apply(pd.Categorical)

In [109]:
# Dummify categorical columns
cat_cols = pd.get_dummies(X_train[cat], drop_first = True)

In [110]:
X_train = pd.concat([cat_cols, num_cols.set_index(cat_cols.index)], axis=1)

In [111]:
# Encode target variable
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

In [112]:
# Apply preprocessing to test set
X_test = prepare_data(X_test)

num_cols = pd.DataFrame(scaler.transform(X_test[num]))
num_cols.columns = num

X_test[cat] = X_test[cat].apply(pd.Categorical)

cat_cols = pd.get_dummies(X_test[cat], drop_first = True)
X_test = pd.concat([cat_cols, num_cols.set_index(cat_cols.index)], axis=1)

y_test = label_encoder.transform(y_test)

In [113]:
# Defining a function to fix column difference between train and test sets

# This will go inside the main function
def add_missing_dummy_columns(test_set, train_columns):
    # d = test set, columns = train set columns 
    missing_cols = set(train_columns) - set(test_set.columns)
    for c in missing_cols:
        test_set[c] = 0 # add missing columns to test set with empty column values
        
# This is the main function     
def fix_columns(test_set, train_columns):  

    add_missing_dummy_columns(test_set, train_columns)

    # make sure we have all the columns we need
    assert(set(train_columns) - set(test_set.columns) == set())

    extra_cols = set(test_set.columns) - set(train_columns) # these are the extra cols in the test set
    if extra_cols:
        print ("extra columns:", extra_cols)

    test_set = test_set[train_columns] # keep only columns that are in the train set 
    return test_set


In [114]:
X_test = fix_columns(X_test, X_train.columns)

extra columns: {'bucket_name_राजनीति '}


In [115]:
# Sanity check
assert(set(X_train.columns) - set(X_test.columns) == set())

## Model building

In [119]:
clf = XGBClassifier(random_state = 0)
clf.fit(X_train, y_train)
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test) 
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.9852272727272727
0.7613636363636364


In [195]:
xgb = XGBClassifier(random_state = 0)
params = {
    "max_depth": list(range(2,9)),
    "min_child_weight": list(range(1,11)),
    "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "eta": [0.05, 0.1, 0.2, 0.3],
    "gamma": list(range(0, 11))
    
}
clf = RandomizedSearchCV(estimator=xgb, param_distributions=params, n_iter=999, scoring="accuracy", n_jobs=-1, random_state=0)

In [196]:
clf.fit(X_train, y_train)

RandomizedSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100, n_job...
                                           subsample=None, tree_method=None,
                                           validate_parameters=None,
                                           verbosity=N

In [198]:
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test) 
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.8886363636363637
0.7272727272727273


In [199]:
joblib.dump(clf, 'working-files/randomsearch-xgb.pkl')
joblib.dump(clf, 'working-files/randomsearch-xgb.joblib')
joblib.dump(clf.best_estimator_, 'working-files/randomsearch-xgb-best.pkl')
joblib.dump(clf.best_estimator_, 'working-files/randomsearch-xgb-best.joblib')

['working-files/randomsearch-xgb-best.joblib']

In [200]:
# clf = joblib.load('working-files/gridsearch-xgb.pkl')

In [201]:
dtrain = DMatrix(X_train, y_train)
dtest = DMatrix(X_test, y_test)

In [202]:
# Default parameters
params = {
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    "eval_metric": "auc",
}

In [130]:
clf = xgboost.train(params=params, dtrain=dtrain, num_boost_round=999, evals = [(dtest, "Test")], early_stopping_rounds=10)

[0]	Test-auc:0.73007
Will train until Test-auc hasn't improved in 10 rounds.
[1]	Test-auc:0.75932
[2]	Test-auc:0.77986
[3]	Test-auc:0.78844
[4]	Test-auc:0.79142
[5]	Test-auc:0.78563
[6]	Test-auc:0.78745
[7]	Test-auc:0.78921
[8]	Test-auc:0.79183
[9]	Test-auc:0.79345
[10]	Test-auc:0.79427
[11]	Test-auc:0.79322
[12]	Test-auc:0.79210
[13]	Test-auc:0.79379
[14]	Test-auc:0.79321
[15]	Test-auc:0.79346
[16]	Test-auc:0.79707
[17]	Test-auc:0.80260
[18]	Test-auc:0.80469
[19]	Test-auc:0.80458
[20]	Test-auc:0.80360
[21]	Test-auc:0.80550
[22]	Test-auc:0.80490
[23]	Test-auc:0.80663
[24]	Test-auc:0.80170
[25]	Test-auc:0.80091
[26]	Test-auc:0.80273
[27]	Test-auc:0.80070
[28]	Test-auc:0.80036
[29]	Test-auc:0.80162
[30]	Test-auc:0.80409
[31]	Test-auc:0.80484
[32]	Test-auc:0.80561
[33]	Test-auc:0.80663
Stopping. Best iteration:
[23]	Test-auc:0.80663



In [131]:
cv_results = xgboost.cv(
    params,
    dtrain,
    num_boost_round=999,
    seed=42,
    nfold=5,
    metrics={'auc'},
    early_stopping_rounds=10
)
cv_results

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.843357,0.005258,0.738208,0.014341
1,0.881911,0.006186,0.764922,0.018025
2,0.895542,0.006422,0.774323,0.020917
3,0.908203,0.00903,0.777084,0.019125
4,0.919838,0.008381,0.783305,0.018396
5,0.929269,0.007519,0.780366,0.018157
6,0.938783,0.008875,0.78164,0.016003
7,0.945691,0.007295,0.78453,0.016583
8,0.949663,0.00685,0.785485,0.015185
9,0.953918,0.007806,0.786649,0.017125


In [122]:
# Count true / false positives / negatives
tn, fp, fn, tp = confusion_matrix(y_train, pred_train).ravel()
print(tn, fp, fn, tp)

1077 4 22 657


In [123]:
tn, fp, fn, tp = confusion_matrix(y_test, pred_test).ravel()
print(tn, fp, fn, tp)

220 39 66 115


In [120]:
# Calculate accuracy metrics
print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1081
           1       0.99      0.97      0.98       679

    accuracy                           0.99      1760
   macro avg       0.99      0.98      0.98      1760
weighted avg       0.99      0.99      0.99      1760



In [121]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.77      0.85      0.81       259
           1       0.75      0.64      0.69       181

    accuracy                           0.76       440
   macro avg       0.76      0.74      0.75       440
weighted avg       0.76      0.76      0.76       440

