In [70]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

from joblib import dump

In [3]:
# Load only the 'title' column of every CSV: real sets first, then fake sets
df1, df2, df3, df4 = (
    pd.read_csv(csv_path, usecols=['title'])
    for csv_path in (
        "./dataset/gossipcop_real.csv",
        "./dataset/politifact_real.csv",
        "./dataset/gossipcop_fake.csv",
        "./dataset/politifact_fake.csv",
    )
)

In [4]:
# Stack the source frames row-wise: one frame for real titles, one for fake.
# The original row indices are kept (ignore_index is not set).
df_real, df_fake = pd.concat([df1, df2]), pd.concat([df3, df4])

In [5]:
# Check for empty and NA values.
# Titles consisting of a single space are treated as missing before counting.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_real[df_real == ' '] = np.nan
print(df_real.isna().sum())
df_fake[df_fake == ' '] = np.nan
print(df_fake.isna().sum())

title    0
dtype: int64
title    0
dtype: int64


In [6]:
# Row counts: real titles on the first line, fake titles on the second
print(len(df_real), len(df_fake), sep='\n')

17441
5755


In [7]:
# Remove titles having fewer than 5 words (i.e. fewer than 4 whitespace runs)
# and add the label column for real news (0).
# - The raw string avoids the invalid "\s" escape warning on newer Python versions.
# - .assign() builds a new frame instead of writing a column into a filtered
#   slice, which would raise pandas' SettingWithCopyWarning.
df_real = df_real[~df_real.title.str.count(r'\s+').lt(4)].assign(label=0)
df_real

Unnamed: 0,title,label
0,Teen Mom Star Jenelle Evans' Wedding Dress Is ...,0
1,Kylie Jenner refusing to discuss Tyga on Life ...,0
3,I Tried Kim Kardashian's Butt Workout & Am For...,0
4,Celine Dion donates concert proceeds to Vegas ...,0
5,"Chris Evans, Millie Bobby Brown, Snoop Dogg an...",0
...,...,...
618,"Trump asking Congress, not Mexico, to pay for ...",0
619,Flake: “Religious tests should have no place i...,0
620,Change We Can Believe In,0
621,deputy director of national health statistics ...,0


In [8]:
# Remove titles having fewer than 5 words (i.e. fewer than 4 whitespace runs)
# and add the label column for fake news (1).
# - The raw string avoids the invalid "\s" escape warning on newer Python versions.
# - .assign() builds a new frame instead of writing a column into a filtered
#   slice, which would raise pandas' SettingWithCopyWarning.
df_fake = df_fake[~df_fake.title.str.count(r'\s+').lt(4)].assign(label=1)
df_fake

Unnamed: 0,title,label
0,Did Miley Cyrus and Liam Hemsworth secretly ge...,1
1,Paris Jackson & Cara Delevingne Enjoy Night Ou...,1
2,Celebrities Join Tax March in Protest of Donal...,1
3,Cindy Crawford's daughter Kaia Gerber wears a ...,1
4,Full List of 2018 Oscar Nominations – Variety,1
...,...,...
426,BUSTED: Russian Mansions Obama Seized Were Mea...,1
427,Who is affected by the government shutdown?,1
428,Lindsey Graham Threatens To Convert To Democra...,1
429,ELECTORAL COLLEGE ELECTOR COMMITS SUICIDE TO A...,1


In [9]:
# TODO: Deal with class imbalance

In [10]:
# Combine both the real and fake news dataset
df = pd.concat([df_real, df_fake])

# Shuffle the final dataset.
# A fixed seed makes the row order (and therefore the seeded train/test split
# further down) identical on every Restart & Run All.
df = df.sample(frac=1, random_state=42)
df

Unnamed: 0,title,label
4654,DWTS Season 27 Results: Week 8 - The Semi-Finals,0
4879,Marvel v DC: Dawn of Rivalry,0
13422,Jessica Alba Uses This Product to Fix Creases ...,0
6975,Why Did NeNe Leakes Return To 'The Real Housew...,0
609,Sienna Miller “Cozying Up” To Ben Affleck For ...,1
...,...,...
8457,Justin Bieber Wants to Be With Selena Gomez Bu...,0
153,Trump on Revamping the Military: We’re Bringin...,1
12369,"Hugh Hefner, the Pajama Man",0
12646,Demi Lovato Shares Details of Final Interventi...,0


In [11]:
# Number of rows per class (0 = real, 1 = fake)
df['label'].value_counts()

0    16460
1     5433
Name: label, dtype: int64

In [None]:
# Download the NLTK stop-word corpus; preprocessing() reads it through
# stopwords.words("english")
nltk.download('stopwords')

In [None]:
# Download WordNet (lexical database for the English language); the
# WordNetLemmatizer used in preprocessing() looks words up in it
nltk.download('wordnet')

In [17]:
# Text preprocessing
def preprocessing(tweet):
  """Clean a raw title/tweet string so it can be fed to a text vectorizer.

  Steps, in order: strip HTML markup, remove URLs, replace every non-letter
  character with a space, lowercase and tokenize on whitespace, drop English
  stop words, and lemmatize the remaining tokens.

  Args:
    tweet: Raw text, possibly containing HTML tags and URLs.

  Returns:
    The cleaned tokens joined by single spaces (an empty string if nothing
    survives the cleaning).
  """
  # Name the parser explicitly: otherwise bs4 picks whichever parser happens
  # to be installed (and warns), so results can differ between machines.
  text = BeautifulSoup(tweet, "html.parser").get_text() # Remove HTML tags

  # URLs must be removed BEFORE punctuation is stripped; afterwards "://" and
  # "." no longer exist and the pattern cannot match. \S+ (non-whitespace)
  # replaces the former [^s]+, which matched "anything but the letter s" and
  # could swallow ordinary words following "www".
  text = re.sub(r'(www\.\S+)|(https?://\S+)', ' ', text) # Remove URLs
  text = re.sub("[^a-zA-Z]", " ", text) # Remove special characters
  tokens = text.lower().split() # Convert to lowercase and split each word

  stop_w = set(stopwords.words("english")) # Use a set instead of list for faster searching
  lemmatizer = WordNetLemmatizer() # Build once per call instead of once per word
  tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_w]

  return " ".join(tokens) # Return the words after joining each word separated by space

In [18]:
# Run every title through preprocessing() and preview the first cleaned rows
df['title'] = df['title'].apply(preprocessing)
df.title.head()

4654                    dwts season result week semi final
4879                              marvel v dc dawn rivalry
13422         jessica alba us product fix crease concealer
6975     nene leakes return real housewife atlanta seas...
609           sienna miller cozying ben affleck movie role
Name: title, dtype: object

In [19]:
# 80/20 split, stratified on the label so both parts keep the class ratio
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df.label,
)

In [20]:
# (rows, columns) of the training and test frames
(train.shape, test.shape)

((17514, 2), (4379, 2))

In [49]:
# TF-IDF over 1- to 4-grams, capped at 10000 features
tfidf_vec = TfidfVectorizer(ngram_range=(1, 4), max_features=10000)

# Fit on the training titles only, then reuse that fitted vectorizer for the test titles
train_df = tfidf_vec.fit_transform(train['title'])
test_df = tfidf_vec.transform(test['title'])

In [64]:
# Model 1 - Logistic Regressor (fit() returns the fitted estimator itself)
logis_reg = LogisticRegression().fit(train_df, train['label'])

# Evaluation metrics on the held-out test set
y_pred = logis_reg.predict(test_df)
report = classification_report(test['label'], y_pred, target_names=['Negative', 'Positive'])
print(report)

              precision    recall  f1-score   support

    Negative       0.85      0.96      0.91      3292
    Positive       0.82      0.50      0.62      1087

    accuracy                           0.85      4379
   macro avg       0.84      0.73      0.76      4379
weighted avg       0.84      0.85      0.83      4379



In [65]:
# Model 2 - Linear Support Vector Classifier
# random_state pins the solver's internal data shuffling so the reported
# metrics are reproducible; 42 matches the seed used for the train/test split.
linear_svc = LinearSVC(random_state=42)
linear_svc.fit(train_df, train.label)
y_pred = linear_svc.predict(test_df)

# Evaluation metrics
print(classification_report(test.label, y_pred, target_names=['Negative', 'Positive']))

              precision    recall  f1-score   support

    Negative       0.88      0.92      0.90      3292
    Positive       0.72      0.60      0.66      1087

    accuracy                           0.84      4379
   macro avg       0.80      0.76      0.78      4379
weighted avg       0.84      0.84      0.84      4379



In [66]:
# Define the stages of the pipeline: TF-IDF features followed by a linear SVM.
# random_state pins the solver's internal data shuffling so the exported model
# is reproducible; 42 matches the seed used for the train/test split.
pipeline = Pipeline(steps= [('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 4))),
                            ('model', LinearSVC(random_state=42))])

# Fit the pipeline model with the (already preprocessed) training titles
pipeline.fit(train.title, train.label)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=10000, ngram_range=(1, 4))),
                ('model', LinearSVC())])

In [67]:
# Predict the label using the pipeline.
# The pipeline was fitted on titles cleaned by preprocessing(), so new text
# must go through the same cleaning to match the features seen in training.
pipeline.predict([preprocessing('BREAKING: FIFA Declares Bankruptcy Over Kneeling Thugs')])

array([1], dtype=int64)

In [68]:
# Persist the fitted pipeline to disk with joblib
dump(pipeline, "fake_news_classification.joblib")

['fake_news_classification.joblib']