In [33]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from joblib import dump

In [2]:
# Read all the datasets
df1 = pd.read_csv("./dataset/gossipcop_real.csv", usecols = ['title'])
df2 = pd.read_csv("./dataset/politifact_real.csv", usecols = ['title'])
df3 = pd.read_csv("./dataset/gossipcop_fake.csv", usecols = ['title'])
df4 = pd.read_csv("./dataset/politifact_fake.csv", usecols = ['title'])

In [3]:
# Combine real and fake data into separate datasets
df_real = pd.concat([df1, df2], axis=0)
df_fake = pd.concat([df3, df4], axis=0)

In [4]:
# Check for empty and NA values
df_real[df_real.loc[:,:] == ' '] = np.NaN
print(df_real.isna().sum())
df_fake[df_fake.loc[:,:] == ' '] = np.NaN
print(df_fake.isna().sum())

title    0
dtype: int64
title    0
dtype: int64


In [5]:
# Print the length of both the datasets
print(len(df_real))
print(len(df_fake))

17441
5755


In [6]:
# Remove sentences having less than 5 words
df_real = df_real[~df_real.title.str.count('\s+').lt(4)]

# Add label column for real news
df_real['label'] = 0
df_real

Unnamed: 0,title,label
0,Teen Mom Star Jenelle Evans' Wedding Dress Is ...,0
1,Kylie Jenner refusing to discuss Tyga on Life ...,0
3,I Tried Kim Kardashian's Butt Workout & Am For...,0
4,Celine Dion donates concert proceeds to Vegas ...,0
5,"Chris Evans, Millie Bobby Brown, Snoop Dogg an...",0
...,...,...
618,"Trump asking Congress, not Mexico, to pay for ...",0
619,Flake: “Religious tests should have no place i...,0
620,Change We Can Believe In,0
621,deputy director of national health statistics ...,0


In [7]:
# Remove sentences having less than 5 words
df_fake = df_fake[~df_fake.title.str.count('\s+').lt(4)]

# Add label column for fake news
df_fake['label'] = 1
df_fake

Unnamed: 0,title,label
0,Did Miley Cyrus and Liam Hemsworth secretly ge...,1
1,Paris Jackson & Cara Delevingne Enjoy Night Ou...,1
2,Celebrities Join Tax March in Protest of Donal...,1
3,Cindy Crawford's daughter Kaia Gerber wears a ...,1
4,Full List of 2018 Oscar Nominations – Variety,1
...,...,...
426,BUSTED: Russian Mansions Obama Seized Were Mea...,1
427,Who is affected by the government shutdown?,1
428,Lindsey Graham Threatens To Convert To Democra...,1
429,ELECTORAL COLLEGE ELECTOR COMMITS SUICIDE TO A...,1


In [8]:
# TODO: Deal with class imbalance

In [9]:
# Combine both real and fake news dataset
df = pd.concat([df_real, df_fake])

#Shuffle the final dataset
df = df.sample(frac=1)
df

Unnamed: 0,title,label
4202,"Rosie Huntington-Whiteley Is Pregnant in ""Mad ...",1
16321,Margot Robbie Leads the Best Dressed Stars of ...,0
245,Kanye West Admits He Was 'Hurt' When JAY-Z and...,1
3469,Meghan Markle arrives fashionably late in Card...,0
6449,Kim Kardashian Glitters in Gold as She Joins K...,0
...,...,...
7995,Oprah Winfrey Reveals The Adorable Message She...,0
243,Ellen DeGeneres beams as wife Portia de Rossi ...,1
552,The Democratic Debate in Cleveland,0
2123,Justin Bieber's Mom Defends Him in Light of Di...,1


In [11]:
df.label.value_counts()

0    16460
1     5433
Name: label, dtype: int64

In [19]:
train, test = train_test_split(df, test_size = 0.2, stratify = df['label'], random_state = 42)

In [20]:
train.shape, test.shape

((17514, 2), (4379, 2))

In [21]:
tfidf_vec = TfidfVectorizer(lowercase= True, max_features=1000, stop_words=ENGLISH_STOP_WORDS)

train_df = tfidf_vec.fit_transform(train.title)
test_df  = tfidf_vec.transform(test.title)

In [22]:
# create the object of LinearRegression Model
model = LogisticRegression()

# fit the model with the training data
model.fit(train_df, train.label)

# predict the label on the traning data
predict_train = model.predict(train_df)

# predict the model on the test data
predict_test = model.predict(test_df)

In [24]:
# f1 score on train data
f1_score(y_true = train.label, y_pred = predict_train)

0.613681521430599

In [25]:
# f1 score on test data
f1_score(y_true = test.label, y_pred = predict_test)

0.5674740484429066

In [27]:
# define the stages of the pipeline
pipeline = Pipeline(steps= [('tfidf', TfidfVectorizer(lowercase=True,
                                                      max_features=1000,
                                                      stop_words= ENGLISH_STOP_WORDS)),
                            ('model', LogisticRegression())])

# fit the pipeline model with the training data                            
pipeline.fit(train.title, train.label)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=1000,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterwards', 'again',
                                                       'against', 'all',
                                                       'almost', 'alone',
                                                       'along', 'already',
                                                       'also', 'although',
                                                       'always', 'am', 'among',
                                                       'amongst', 'amoungst',
                                                       'amount', 'an', 'and',
                                                       'another', 'any',
                                                       'anyhow', 'anyone',
           

In [32]:
# predict the label using the pipeline
pipeline.predict(['BREAKING: FIFA Declares Bankruptcy Over Kneeling Thugs'])

array([1], dtype=int64)

In [35]:
dump(pipeline, filename = "fake_news_classification.joblib")

['fake_news_classification.joblib']