In [6]:
import nltk
import spacy
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from nltk.corpus import stopwords
from nltk import wordpunct_tokenize

from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, f1_score

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [3]:
data = pd.read_csv('./raw_data/fulltrain.csv', header=None)
data.columns = ['cls', 'text']
data.head()

Unnamed: 0,cls,text
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [4]:
cls_names = { 0 : "satire", 1 : "hoax", 2 : "propaganda", 3 : "reliable"}
data['cls'] = data['cls'] - 1
data['cls'].map(cls_names).value_counts()

propaganda    17870
satire        14047
reliable       9995
hoax           6942
Name: cls, dtype: int64

In [14]:
test_data = pd.read_csv('./raw_data/balancedtest.csv', header=None)
test_data.columns = ['cls', 'text']
test_data['cls'] = test_data['cls'] - 1
test_data['cls'].map(cls_names).value_counts()

satire        750
hoax          750
propaganda    750
reliable      750
Name: cls, dtype: int64

In [11]:
data['num_tokens'] = data['text'].apply(lambda x: len(wordpunct_tokenize(x)))

2    1506
3      41
0      14
1       8
Name: cls, dtype: int64

In [12]:
data[data['num_tokens'] < 10]['cls'].value_counts()

2    1506
3      41
0      14
1       8
Name: cls, dtype: int64

In [16]:
test_data['num_tokens'] = test_data['text'].apply(lambda x: len(wordpunct_tokenize(x)))
test_data[test_data['num_tokens'] < 10]['cls'].value_counts()

1    5
Name: cls, dtype: int64

N-Gram Distribution Analysis

In [15]:
top_keywords = {}
top_counts = {}

for y, name in cls_names.items():
    
    cls_docs = data[data.cls == y]['text'].tolist()
    
    cvect = CountVectorizer(
        ngram_range=(1, 2),
        stop_words=stopwords.words('english'),
        max_df=0.8,
        min_df=10,
        max_features=500
    )
    
    tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        stop_words=stopwords.words('english'),
        max_df=0.8,
        min_df=10,
        max_features=500
    )
    
    tfidf_vect = tfidf.fit_transform(cls_docs)
    feature_array = np.array(tfidf.get_feature_names_out())
    tfidf_sorting = np.argsort(tfidf_vect.toarray()).flatten()[::-1]
    top_n = feature_array[tfidf_sorting].tolist()
    
    sum_words = cvect.fit_transform(cls_docs).sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in cvect.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    
    print('Class', name)
    print('Top by Count', words_freq[:20])
    print('Top by TFIDF', top_n[:20])
    
    top_keywords[name] = top_n[:500]
    top_counts[name] = words_freq[:500]

Class satire
Top by Count [('one', 11426), ('time', 11092), ('would', 9736), ('like', 9278), ('new', 7558), ('year', 7120), ('even', 6508), ('added', 6302), ('get', 6298), ('could', 6163), ('people', 5741), ('amp', 5241), ('amp amp', 5196), ('first', 5175), ('also', 5140), ('old', 5132), ('monday', 5079), ('day', 4879), ('really', 4872), ('years', 4834)]
Top by TFIDF ['finally', 'say', 'people', 'thats', 'lead', 'hit', 'else', 'fans', 'wanted', 'coming', '30', 'behind', 'top', 'however', 'pretty', 'went', 'might', 'away', 'us', 'something']
Class hoax
Top by Count [('obama', 7069), ('think', 6185), ('trump', 4915), ('one', 3612), ('president', 3583), ('according', 3194), ('video', 3052), ('people', 2878), ('would', 2770), ('reports', 2768), ('time', 2583), ('told', 2311), ('country', 2214), ('clinton', 2181), ('also', 2178), ('like', 2171), ('american', 2148), ('recent', 2128), ('new', 2127), ('us', 2114)]
Top by TFIDF ['white', 'black', 'people', 'president', 'say', 'racist', 'local',

In [17]:
for cls1 in cls_names.values():
    for cls2 in cls_names.values():
        if cls1 == cls2:
            continue
        print(f'Intersection between {cls1} & {cls2}:', len(set(top_keywords[cls1]).intersection(top_keywords[cls2])))

Intersection between satire & hoax: 306
Intersection between satire & propaganda: 295
Intersection between satire & reliable: 314
Intersection between hoax & satire: 306
Intersection between hoax & propaganda: 296
Intersection between hoax & reliable: 269
Intersection between propaganda & satire: 295
Intersection between propaganda & hoax: 296
Intersection between propaganda & reliable: 312
Intersection between reliable & satire: 314
Intersection between reliable & hoax: 269
Intersection between reliable & propaganda: 312


In [27]:
for cls1 in cls_names.values():
    for cls2 in cls_names.values():
        
        if cls1 == cls2:
            continue
        
        cls1_words = [w for w, c in top_counts[cls1]]
        cls2_words = [w for w, c in top_counts[cls2]]
        cls1_count_dict = dict(top_counts[cls1])
        cls2_count_dict = dict(top_counts[cls2])
        
        intersect_words = set(cls1_words).intersection(cls2_words)
        print(f'Intersection between {cls1} & {cls2}:', len(intersect_words))
        
        top_intersecting_words = [(w, cls1_count_dict[w], cls2_count_dict[w]) for w in intersect_words]
        
        print(sorted(top_intersecting_words, key=lambda x: (x[1], x[2]), reverse=True)[:10])

Intersection between satire & hoax: 306
[('one', 11426, 3612), ('time', 11092, 2583), ('would', 9736, 2770), ('like', 9278, 2171), ('new', 7558, 2127), ('year', 7120, 1717), ('even', 6508, 1318), ('get', 6298, 1422), ('could', 6163, 1458), ('people', 5741, 2878)]
Intersection between satire & propaganda: 295
[('one', 11426, 39459), ('time', 11092, 23596), ('would', 9736, 39135), ('like', 9278, 28146), ('new', 7558, 24965), ('year', 7120, 13522), ('even', 6508, 24487), ('get', 6298, 17275), ('could', 6163, 19254), ('people', 5741, 42190)]
Intersection between satire & reliable: 314
[('one', 11426, 9925), ('time', 11092, 5966), ('would', 9736, 10266), ('like', 9278, 4889), ('new', 7558, 8952), ('year', 7120, 10367), ('even', 6508, 3619), ('added', 6302, 2215), ('get', 6298, 3473), ('could', 6163, 5299)]
Intersection between hoax & satire: 306
[('obama', 7069, 1583), ('think', 6185, 3378), ('one', 3612, 11426), ('president', 3583, 3915), ('according', 3194, 4366), ('people', 2878, 5741), 