In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline

from joblib import dump

In [2]:
# Read all the datasets
df1 = pd.read_csv("./dataset/gossipcop_real.csv", usecols = ['title'])
df2 = pd.read_csv("./dataset/politifact_real.csv", usecols = ['title'])
df3 = pd.read_csv("./dataset/gossipcop_fake.csv", usecols = ['title'])
df4 = pd.read_csv("./dataset/politifact_fake.csv", usecols = ['title'])

In [3]:
# Combine real and fake data into separate datasets
df_real = pd.concat([df1, df2], axis=0)
df_fake = pd.concat([df3, df4], axis=0)

In [4]:
# Check for empty and NA values
df_real[df_real.loc[:,:] == ' '] = np.NaN
print(df_real.isna().sum())
df_fake[df_fake.loc[:,:] == ' '] = np.NaN
print(df_fake.isna().sum())

title    0
dtype: int64
title    0
dtype: int64


In [5]:
# Remove sentences having less than 5 words
df_real = df_real[~df_real.title.str.count('\s+').lt(4)]

# Add label column for real news
df_real['label'] = 0
df_real

Unnamed: 0,title,label
0,Teen Mom Star Jenelle Evans' Wedding Dress Is ...,0
1,Kylie Jenner refusing to discuss Tyga on Life ...,0
3,I Tried Kim Kardashian's Butt Workout & Am For...,0
4,Celine Dion donates concert proceeds to Vegas ...,0
5,"Chris Evans, Millie Bobby Brown, Snoop Dogg an...",0
...,...,...
618,"Trump asking Congress, not Mexico, to pay for ...",0
619,Flake: “Religious tests should have no place i...,0
620,Change We Can Believe In,0
621,deputy director of national health statistics ...,0


In [6]:
# Remove sentences having less than 5 words
df_fake = df_fake[~df_fake.title.str.count('\s+').lt(4)]

# Add label column for fake news
df_fake['label'] = 1
df_fake

Unnamed: 0,title,label
0,Did Miley Cyrus and Liam Hemsworth secretly ge...,1
1,Paris Jackson & Cara Delevingne Enjoy Night Ou...,1
2,Celebrities Join Tax March in Protest of Donal...,1
3,Cindy Crawford's daughter Kaia Gerber wears a ...,1
4,Full List of 2018 Oscar Nominations – Variety,1
...,...,...
426,BUSTED: Russian Mansions Obama Seized Were Mea...,1
427,Who is affected by the government shutdown?,1
428,Lindsey Graham Threatens To Convert To Democra...,1
429,ELECTORAL COLLEGE ELECTOR COMMITS SUICIDE TO A...,1


In [7]:
# Combine both the real and fake news dataset
df = pd.concat([df_real, df_fake])

# Shuffle the final dataset
df = df.sample(frac=1)
df

Unnamed: 0,title,label
4232,Jennifer Aniston Justin Theroux Explosive Show...,1
11672,Andy Samberg wants celeb fans to guest star on...,0
12191,'Young and the Restless' actor Corey Sligh con...,0
2731,Brad Pitt & Neri Oxman: Amal Clooney Reportedl...,1
2579,Kanye West Is Reportedly Heading For Another B...,1
...,...,...
4115,Kim Kardashian Says Her Waist Is 24 Inches—And...,0
7118,Serena Williams' Royal Wedding Reception Look ...,0
12388,Why Lisa Vanderpump Won’t Officiate Jax Taylor...,0
9790,Best Style Moments From the 2018 Grammys,0


In [8]:
# Count the distinct label values
df.label.value_counts()

0    16460
1     5433
Name: label, dtype: int64

In [9]:
# Download stop words
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# Download WordNet (lexical database for English language)
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# Text preprocessing
def preprocessing(tweet):
  text = BeautifulSoup(tweet).get_text() # Remove HTML tags
  text = re.sub("[^a-zA-Z#]", " ", text) # Remove special characters
  text = re.sub('((www.[^s]+)|(https?://[^s]+))',' ', text) # Remove URLs
  text = text.lower().split() # Convert to lowercase and split each word

  stop_w = set(stopwords.words("english")) # Use a set instead of list for faster searching
  text = [w for w in text if not w in stop_w] # Remove stop words
  text = [WordNetLemmatizer().lemmatize(w) for w in text] # Lemmatization

  return (" ".join(text)) # Return the words after joining each word separated by space

In [12]:
# Clean the textual data
df['title'] = df['title'].apply(lambda text: preprocessing(text))
df['title'].head()

4232     jennifer aniston justin theroux explosive show...
11672    andy samberg want celeb fan guest star brookly...
12191    young restless actor corey sligh convicted chi...
2731     brad pitt neri oxman amal clooney reportedly i...
2579     kanye west reportedly heading another breakdow...
Name: title, dtype: object

In [13]:
# Split into train and test sets
train, test = train_test_split(df, test_size = 0.2, stratify = df['label'], random_state = 42)

# Get the shape of datasets after splitting
train.shape, test.shape

((17514, 2), (4379, 2))

In [14]:
# Create a TF-IDF vectorizer object
tfidf_vec = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))

# Data fitting and transformation
train_df = tfidf_vec.fit_transform(train.title)
test_df  = tfidf_vec.transform(test.title)

In [15]:
# Before over-sampling the minority class
train['label'].value_counts()

0    13168
1     4346
Name: label, dtype: int64

In [16]:
# Use SMOTE (Synthetic Minority Oversampling Technique) for dealing with class imbalance
smt = SMOTE(random_state = 18, sampling_strategy = 1.0)
smt_xtrain_df, smt_ytrain = smt.fit_resample(train_df, train.label)

In [17]:
# After over-sampling the minority class
smt_ytrain.value_counts()

0    13168
1    13168
Name: label, dtype: int64

In [18]:
# Model 1 - Logistic Regressor
logis_reg = LogisticRegression()
logis_reg.fit(smt_xtrain_df, smt_ytrain)

y_pred = logis_reg.predict(test_df)

# Evaluation metrics
print(classification_report(test.label, y_pred, target_names=['Negative', 'Positive']))

              precision    recall  f1-score   support

    Negative       0.91      0.85      0.88      3292
    Positive       0.63      0.75      0.68      1087

    accuracy                           0.83      4379
   macro avg       0.77      0.80      0.78      4379
weighted avg       0.84      0.83      0.83      4379



In [19]:
# Model 2 - Linear Support Vector Classifier
linear_svc = LinearSVC()
linear_svc.fit(smt_xtrain_df, smt_ytrain)

y_pred = linear_svc.predict(test_df)

# Evaluation metrics
print(classification_report(test.label, y_pred, target_names=['Negative', 'Positive']))

              precision    recall  f1-score   support

    Negative       0.90      0.85      0.87      3292
    Positive       0.61      0.71      0.65      1087

    accuracy                           0.81      4379
   macro avg       0.75      0.78      0.76      4379
weighted avg       0.83      0.81      0.82      4379



In [20]:
# Define the steps of the pipeline
pipeline = imbpipeline(steps = [['tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 3))],
                                ['smote', SMOTE(random_state = 18, sampling_strategy = 1.0)],
                                ['classifier', LogisticRegression()]])

# Fit the pipeline model with the training data                            
pipeline.fit(train.title, train.label)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=10000, ngram_range=(1, 3))),
                ('smote', SMOTE(random_state=18, sampling_strategy=1.0)),
                ['classifier', LogisticRegression()]])

In [21]:
# Predict the label using the pipeline
check = pipeline.predict(['BREAKING: FIFA Declares Bankruptcy Over Lack Of Sponsorships'])

out = "Possible Fake News" if check==1 else "Possible Real News"
print(out)

Possible Fake News


In [22]:
# Dump the pipeline model
dump(pipeline, filename = "fake_news_classification.joblib")

['fake_news_classification.joblib']