## Importing Necessary Libraries

In [1]:
# ! pip install gensim
# ! pip install vaderSentiment
# ! pip install text2emotion
# ! pip install tqdm
#! pip install bs4

In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
import os
import datetime
import string
import pickle
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud, STOPWORDS

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Monster\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Data Cleaning

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Monster\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Importing stopwords and modifying as per the need
from nltk.corpus import stopwords
stop_words = stopwords.words('English')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'said', 'reuters'])
stop_words.remove("no")
stop_words.remove("not")
stop_words.remove("very")
stop_words.remove('nor')

In [5]:
# Function to handle missing values
def missing_values(df):
    return df.fillna(" ", inplace = True)

In [6]:
#Function to count number of stopwords present in the article
def no_of_stopwords(phrase_list):
    no_of_sw = 0
    for token in phrase_list:
        if token in gensim.parsing.preprocessing.STOPWORDS:
            no_of_sw += 1
    return no_of_sw

In [7]:
#Function to count number of question marks present in the article.
def no_of_quesMarks(phrase):
    i = 0
    for char in phrase:
        if char == '?':
            i += 1
    return i

In [8]:
#Function to count number of exclamation marks present in the article.
def no_of_exclamation(phrase):
    i = 0
    for char in phrase:
        if char == '!':
            i += 1
    return i

In [9]:
# Parts of speech tagging....Here I am counting number of different pos present in the article
from tqdm import tqdm
def pos_tagging(df):
    # Creating list to store count of different pos
    cc_cd_lst =[]                   
    determiner_lst = []
    foreign_w_lst = []
    conj_prep_lst = []
    adjective_lst = []
    modal_lst = []
    noun_lst = []
    adverb_lst = []
    verb_lst = []
    print("POS TAGGING")
    for article in tqdm(df):
        lst = nltk.pos_tag(article.split()) # Using NLTK pos tagger
        cc_cd = determiner = foreign_w = conj_prep = adjective = modal = noun =  adverb = verb = 0
        for tag in lst:
            if tag[1] in ['CC', 'CD']:
                cc_cd += 1
            elif tag[1] == 'DT':
                determiner += 1
            elif tag[1] == 'FW':
                foreign_w += 1
            elif tag[1] == 'IN':
                conj_prep += 1
            elif tag[1] in ['JJ', 'JJR', 'JJS']:
                adjective += 1
            elif tag[1] == 'MD':
                modal += 1
            elif tag[1] in ['NN', 'NNS', 'NP', 'NPS']:
                noun += 1
            elif tag[1] in ['RB','RBR','RBS','RP']:
                adverb += 1
            elif tag[1] in ['VB','VBD','VBG','VBN','VBP', 'VBZ']:
                verb += 1
        
        cc_cd_lst.append(cc_cd)
        determiner_lst.append(determiner)
        foreign_w_lst.append(foreign_w)
        conj_prep_lst.append(conj_prep)
        adjective_lst.append(adjective)
        modal_lst.append(modal)
        noun_lst.append(noun)
        adverb_lst.append(adverb)
        verb_lst.append(verb)
    return np.array([cc_cd_lst, determiner_lst, foreign_w_lst, conj_prep_lst, adjective_lst, modal_lst, noun_lst, adverb_lst, verb_lst]).T

In [10]:
# https://github.com/cjhutto/vaderSentiment#:~:text=Notifications-,VADER%20Sentiment%20Analysis.,on%20texts%20from%20other%20domains.

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def sentiment_scores(sentence):
 
    # Create a SentimentIntensityAnalyzer object. A polarity_scores method of SentimentIntensityAnalyzer
    # {'neg': 0.198, 'neu': 0.652, 'pos': 0.149, 'compound': -0.9981}
    sid_obj = SentimentIntensityAnalyzer()
    
    sentiment_dict = sid_obj.polarity_scores(sentence) # return a dictionary which contains pos, neg, neu, and compound scores.
    
    if sentiment_dict['compound'] >= 0.05 :
        x = "Positive"
 
    elif sentiment_dict['compound'] <= - 0.05 :
        x = "Negative"
 
    else :
        x = "Neutral"
    
    return x

In [11]:
import text2emotion as te
def text2emotion(article):
    emotion = te.get_emotion(article)
    
    return max(emotion, key = emotion.get)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Monster\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Monster\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Monster\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


1. https://pdf.sciencedirectassets.com/280203/1-s2.0-S1877050920X00093/1-s2.0-S1877050920312394/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEC8aCXVzLWVhc3QtMSJGMEQCIAlAZ1MPWoqC4rON4Tfg8MWU71TWYnjLdInBbY3cAcK4AiACnKx8%2BuDCkZejlN8YLwlk8G2xSnnKFJ%2FHh%2FtJNBD3tyr6AwgXEAQaDDA1OTAwMzU0Njg2NSIMRt4D%2BkZ2rGCUyBEgKtcDFZasgD9rvzpzZ6%2FGdTDB9YYH3pXs9mnWdxssp00b1J9sGlvISw9r7c%2Fd%2BkDDlE3StYkt2uk4ISgFjWk1PAH%2BJvggVG2ViWqEZTiylRg760s%2Fa5Um9OQaTtAZZyWZeXoKR7mbeEBUklBFuAZE%2Bcu6V1sFXxaXV1dVQSULl7%2BFGk4t8dS2gOrmsBRYNGTciJLkxTcyl3nQ7d%2FQQhOA38sQWsGbyJHlXsBhMfzCbXlxIJIMJBe5h%2FcFknp%2FFpbzLFodhtoYYGwWrsVNWFrsbIaaSdTI7VpOM8shexgTocFgD%2F88Ntzd08CN1Jnqyga8mOeK%2Fv7hieZxTVnlekyDI1ZNwHB%2B0cBC%2FV1IFQrVwdW1SZ95LsMTAvw5J3HvjcN5eJZZFGq9JxIqn1Xvp%2B7oiWeEdZu0%2B0AAgcf%2BF7y4HeS%2FETTn4%2BTV3bCslJxRR96PfS2WQACNogAU%2Be%2F0LVdQzbraKNQ6D%2BYMeZx%2BYDnx6R0vyPjIDiMiuaTa5bhgBwr29eth9YZPN3U5WFSMPv5WXWNwPlyzhuFCZcnS%2Fnj1ZyJqTZob3KaPVWvI0jHVD%2BcR4r5kUSSI2SpSLfFvulIa5yxyLnTOZD8%2FmCeijIwUJGbjliwt8ljGFR%2FLMLHm0o0GOqYBYZ4cpKQv1Si2Jo94a%2FDkLukgjRh%2BTTqVkxt2Wh9QhuZEeI%2BjM6MvmztdtBkdY4%2FcQGxldw8En2y8WGq4awbKCo3juV21cXTRZGAQXUD43lO2bGF26%2F5ajqoh88eyOICd%2BsNH4ov6jsp375HQUOR0Ba%2FQK%2BbKqIqz2f3Cn%2FVZwVgf3%2B8LY%2BsjAnP%2BRlQmJNi00hH5JRE7N%2FPq4kDZjkNJPwuju5oWyA%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20211211T144440Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTYT3WHAHFH%2F20211211%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=a29ac13a5f0a28d0462172bd3e198aa4d393427edcb15fcbbd96419cd645f82a&hash=1602c77c9d3c9c917ea7ed2f2a7125c4be8d9866b5b08f782e06db5f2a81217b&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S1877050920312394&tid=spdf-ede68e5d-fcc9-4bb5-b6eb-3ef3c523b8a5&sid=9c753a7092e7a44ca21ba6f0d3ae280a9ee2gxrqb&type=client

2. https://medium.com/clement-ong/fake-news-detection-4713eb3b1cd6

In [12]:
# Extracting few features before removing stopwords/Lemmetization/special characters

from nltk import sent_tokenize
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

def prefeaturization(df):
    df['cleaned_article'] = df['cleaned_article'].str.lower() # Converting string into lower
       
    print("Number of punctuations") # Creating new feature of punctuation using string predefined punctuation method
    df["num_punctuations"] =df['cleaned_article'].progress_apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
    
    # Calling above defined no_of_stopwards function for counting stopwords per article for feature creation
    print('No of Stopwords')
    df['no_of_stopwords'] = df['cleaned_article'].progress_apply(no_of_stopwords) 
    
    print('No of question marks')
    df['no_of_quesMarks'] = df['cleaned_article'].progress_apply(no_of_quesMarks)
    
    print("Number of Exclamation Marks")
    df['no_of_exclamation'] = df['cleaned_article'].progress_apply(no_of_exclamation)
    
    print("No of Sentences")
    df['no_of_sentence'] = df['cleaned_article'].progress_apply(sent_tokenize).apply(lambda x: len(x))
    
    
    print("Sentiment Analysis")
    df['sentiment'] = df['cleaned_article'].progress_apply(sentiment_scores)
    
    #print("Text Emotion")
    #df['emotion'] = df['cleaned_article'].progress_apply(text2emotion)
    pos = pd.DataFrame(pos_tagging(df['cleaned_article']), columns= ['cc_cd', 'determiner', 'foreign_w', 'conj_prep', 'adjective', 'modal', 'noun',  'adverb', 'verb'])
    df_new = pd.concat([df, pos], axis = 1)
    return df_new

In [13]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

# Removing stop words and all those words which has length less than 3
def preprocessing(phrase):
    final_article = ''
    for token in phrase.split():
        if len(token)>3 and token not in stop_words:
            final_article += token + ' '
    return final_article.strip()

In [14]:
# Removing links
import re
def Link_removal(data):
    return re.sub("http\S+", "", data )

In [15]:
# Removing html tags
from bs4 import BeautifulSoup

def Tag_removal(sentence):
    soup = BeautifulSoup(sentence, 'html.parser')
    text = soup.get_text()
    return text

In [16]:
# Converting contraction words like was't, don't, etc. into normal english words like was not, do not etc.
def Decontracted(sentence):
    # Specific
    sentence = re.sub("won't", "will not", sentence)
    sentence = re.sub("can't", "can not", sentence)
    
    # General
    sentence = re.sub(r"n\'t", " not", sentence)
    sentence = re.sub(r"\'re", " are", sentence)
    sentence = re.sub(r"\'s", " is", sentence)
    sentence = re.sub(r"\'d", " would", sentence)
    sentence = re.sub(r"\'ll", " will", sentence)
    sentence = re.sub(r"\'t", " not", sentence)
    sentence = re.sub(r"\'ve", " have", sentence)
    sentence = re.sub(r"\'m", " am", sentence)
    return sentence


In [17]:
#removing the commoner morphological and inflexional endings from words in English.
from nltk.stem import PorterStemmer
def stemming(sentence):
    ps = PorterStemmer()
    article = ""
    for word in sentence.split():
        word = str(word)
        article = article + ps.stem(word)+ " "
    return article.strip()

In [18]:
#Removing words with digits and special characters
def Special_char(sentence):
    article = ""
    for word in sentence.split():
        word = str(word)
        word = re.sub("\S*\d\S*", "", word)
        word = re.sub('[^A-Za-z0-9]+', " ", word)
        word = word.replace(" ", "")
        article = article + word + ' '
    return article.strip()

In [19]:
from nltk.stem import WordNetLemmatizer

def lemma(phrase):
    article = ""
    lemmatizer = WordNetLemmatizer()
    for word in phrase.split():
        article += lemmatizer.lemmatize(word) + ' '
    return article.strip()

In [20]:
# Calculating average word length of the article
def get_avg_wordlen(article):
    words = article.split()
    word_len = 0
    if len(words) > 0:  # To avoid zero divisional error
        for word in words:
            word_len += len(word)
    else: 
        return 0
    return word_len/len(words)

In [21]:
# Function to remove all those articles which does not contain any meaningfull word
def drop_noword_article(df):
    df.drop(index = list(df[df['article_len']==0].index),inplace = True, axis = 0)
    return df

In [22]:
def negation_count(article):
    neg_words = ['no', 'not', 'none', 'nothing', 'neither', 'never', 'hardly', 'scarcely', 'barely']
    counter = 0
    for token in article.split():
        if token in neg_words:
            counter += 1
    return counter        

In [23]:
def data_cleaning(df):
    df.apply(missing_values)
    print("Cleaning......")
    df['cleaned_article'] = df['cleaned_article'].str.lower()
    df['cleaned_article'] = df['cleaned_article'].progress_apply(Link_removal)
    df['cleaned_article'] = df['cleaned_article'].progress_apply(Tag_removal)
    df['cleaned_article'] = df['cleaned_article'].progress_apply(Decontracted)
    
    print("Pre-Featurization")
    df = prefeaturization(df)
    
    print("Cleaning......")
    print("Special Char")
    df['cleaned_article'] = df['cleaned_article'].progress_apply(Special_char)
    
    # Computing length on the article
    print("Article Length")
    df['article_len'] = df['cleaned_article'].progress_apply(lambda x: len(x))
    
    print("Lemmetization")
    df['cleaned_article'] = df['cleaned_article'].progress_apply(lemma)
    
    # Counting number of negative words used in the article
    print("Negative Words...")
    df['negations'] = df['cleaned_article'].progress_apply(lambda x: negation_count(x))
    
    print("Preprocessing")
    df['cleaned_article'] = df['cleaned_article'].progress_apply(preprocessing) # Stopwords removal
    
    return df

## Feature Engineering

In [24]:
from textblob import TextBlob
def feature_engineering(df):
    
    # Calculating polarity score using textblob
    print("Polarity")
    df['polarity'] = df['cleaned_article'].progress_apply(lambda x: TextBlob(x).sentiment.polarity)
    
    # Counting number of words present in cleaned articles
    print("Word Count")
    df['word_count'] = df['cleaned_article'].progress_apply(lambda x: len(x.split()))
    
    # Averge length of each word present in the cleaned article
    print("Average Word Count")
    df['avg_word_len'] = df['cleaned_article'].progress_apply(lambda x: get_avg_wordlen(x))
    
    # Counting number of unique words present in the article
    print("Number of Unique Words")
    df["num_unique_words"] = df["cleaned_article"].progress_apply(lambda x: len(set(str(x).split())))
    
    # Counting number of characters present in the article
    print("Number of Chars")
    df["num_chars"] = df["cleaned_article"].progress_apply(lambda x: len(str(x)))
    return df

## Dataset 1   (Train 1)

In [25]:
# Importing dataset
df_true = pd.read_csv("Training_data/True.csv")
df_fake = pd.read_csv("Training_data/Fake.csv")
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [26]:
df_fake.tail()

Unnamed: 0,title,text,subject,date
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016"
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016"
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016"
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016"
23480,10 U.S. Navy Sailors Held by Iranian Military ...,21st Century Wire says As 21WIRE predicted in ...,Middle-east,"January 12, 2016"


In [27]:
# Labeling real news as 1 and fake news as 0
df_true['label'] = 1
df_fake['label'] = 0

In [28]:
# Mergeing true and fake dataframe into one dataframe
df1 = pd.concat([df_true, df_fake], ignore_index= True)
df1

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1
...,...,...,...,...,...
44893,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
44896,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0


In [29]:
# Mixing/Sample the dataset so 0's and 1's are equally spread through out the dataframe and then resetting the index
df1 = df1.sample(frac = 1)
df1.reset_index(inplace = True, drop = True)
df1

Unnamed: 0,title,text,subject,date,label
0,CNN Host Tells Senate Republican To Do His Da...,Republican Senator Orrin Hatch tried to justif...,News,"February 16, 2016",0
1,At least 19 drown when boat capsizes in northe...,NEW DELHI (Reuters) - At least 19 people drown...,worldnews,"September 14, 2017",1
2,One in five Indonesian students support Islami...,JAKARTA (Reuters) - Nearly 20 percent of high ...,worldnews,"November 2, 2017",1
3,"Hillary Clinton is ‘Most Corrupt, Militaristic...",21st Century Wire says The writing is on the w...,US_News,"May 17, 2016",0
4,BRUTALLY HONEST BILLBOARD Turns Heads In State...,American citizens should be more concerned abo...,politics,"Jul 16, 2016",0
...,...,...,...,...,...
44893,ILLEGAL ALIEN COLLEGE STUDENTS Protest For A F...,Social Media went nuts when illegal alien coll...,Government News,"Sep 17, 2016",0
44894,HOW PEOPLE MAGAZINE COVER Proves Hillary Has A...,Does anyone else get the sense Hillary is not ...,left-news,"Aug 16, 2016",0
44895,GREAT! PRO-COAL OKLAHOMA AG Tapped For Head Of...,When the Environmental Protection Agency prop...,politics,"Dec 7, 2016",0
44896,U.S. congressional committee subpoenas ex-drug...,WASHINGTON/NEW YORK (Reuters) - A U.S. congres...,politicsNews,"January 20, 2016",1


In [30]:
# checking dataset has any null value
df1.isna().sum(), df1.isnull().sum()

(title      0
 text       0
 subject    0
 date       0
 label      0
 dtype: int64,
 title      0
 text       0
 subject    0
 date       0
 label      0
 dtype: int64)

In [31]:
# Replacing null values with empty string
df1.fillna(" ", inplace = True)

In [32]:
# Combining title and text into one column
df1['cleaned_article'] = df1['title'] + " " + df1['text']
df1.head(2)

Unnamed: 0,title,text,subject,date,label,cleaned_article
0,CNN Host Tells Senate Republican To Do His Da...,Republican Senator Orrin Hatch tried to justif...,News,"February 16, 2016",0,CNN Host Tells Senate Republican To Do His Da...
1,At least 19 drown when boat capsizes in northe...,NEW DELHI (Reuters) - At least 19 people drown...,worldnews,"September 14, 2017",1,At least 19 drown when boat capsizes in northe...


In [33]:
# Except cleaned_article and label dropping all other columns from data frame.
df1.drop(['title', 'text', 'subject', 'date'], axis = 1, inplace = True)
df1.head(2)

Unnamed: 0,label,cleaned_article
0,0,CNN Host Tells Senate Republican To Do His Da...
1,1,At least 19 drown when boat capsizes in northe...


In [34]:
# Performing data cleaning and pre-featurization 
df1 = data_cleaning(df1)
df1.head(2)

Cleaning......


  0%|          | 0/44898 [00:00<?, ?it/s]

  0%|          | 0/44898 [00:00<?, ?it/s]

  0%|          | 0/44898 [00:00<?, ?it/s]

Pre-Featurization
Number of punctuations


  0%|          | 0/44898 [00:00<?, ?it/s]

No of Stopwords


  0%|          | 0/44898 [00:00<?, ?it/s]

No of question marks


  0%|          | 0/44898 [00:00<?, ?it/s]

Number of Exclamation Marks


  0%|          | 0/44898 [00:00<?, ?it/s]

No of Sentences


  0%|          | 0/44898 [00:00<?, ?it/s]

Sentiment Analysis


  0%|          | 0/44898 [00:00<?, ?it/s]

POS TAGGING


100%|████████████████████████████████████████████████████████████████████████████| 44898/44898 [13:05<00:00, 57.16it/s]


Cleaning......
Special Char


  0%|          | 0/44898 [00:00<?, ?it/s]

Article Length


  0%|          | 0/44898 [00:00<?, ?it/s]

Lemmetization


  0%|          | 0/44898 [00:00<?, ?it/s]

Negative Words...


  0%|          | 0/44898 [00:00<?, ?it/s]

Preprocessing


  0%|          | 0/44898 [00:00<?, ?it/s]

Unnamed: 0,label,cleaned_article,num_punctuations,no_of_stopwords,no_of_quesMarks,no_of_exclamation,no_of_sentence,sentiment,cc_cd,determiner,foreign_w,conj_prep,adjective,modal,noun,adverb,verb,article_len,negations
0,0,host tell senate republican damn video republi...,84,630,2,0,36,Negative,27,101,0,108,83,12,247,46,196,5166,11
1,1,drown boat capsizes northern india police delh...,19,112,0,0,5,Negative,8,10,0,15,20,0,35,2,26,718,1


In [35]:
df1

Unnamed: 0,label,cleaned_article,num_punctuations,no_of_stopwords,no_of_quesMarks,no_of_exclamation,no_of_sentence,sentiment,cc_cd,determiner,foreign_w,conj_prep,adjective,modal,noun,adverb,verb,article_len,negations
0,0,host tell senate republican damn video republi...,84,630,2,0,36,Negative,27,101,0,108,83,12,247,46,196,5166,11
1,1,drown boat capsizes northern india police delh...,19,112,0,0,5,Negative,8,10,0,15,20,0,35,2,26,718,1
2,1,indonesian student support islamic caliphate s...,54,456,0,0,20,Positive,23,45,0,64,62,2,144,14,74,2971,0
3,0,hillary clinton corrupt militaristic candidate...,9,78,0,0,2,Positive,3,11,0,10,17,1,30,1,17,560,0
4,0,brutally honest billboard turn head state expl...,34,286,1,0,11,Negative,7,45,0,47,27,4,120,12,49,2049,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44893,0,illegal alien college student protest freebie ...,20,116,2,1,10,Positive,4,10,0,21,22,2,45,5,31,953,0
44894,0,people magazine cover prof hillary wildly unpo...,25,193,3,0,6,Positive,12,21,0,35,34,5,73,14,42,1517,3
44895,0,great procoal oklahoma tapped head environment...,32,190,0,1,10,Positive,8,17,0,23,26,0,80,5,32,1318,0
44896,1,congressional committee subpoena exdrug shkrel...,66,365,1,0,17,Negative,9,37,1,59,55,4,130,13,77,2648,1


In [36]:
# Creating features after data cleaning
feature_engineering(df1)

Polarity


  0%|          | 0/44898 [00:00<?, ?it/s]

Word Count


  0%|          | 0/44898 [00:00<?, ?it/s]

Average Word Count


  0%|          | 0/44898 [00:00<?, ?it/s]

Number of Unique Words


  0%|          | 0/44898 [00:00<?, ?it/s]

Number of Chars


  0%|          | 0/44898 [00:00<?, ?it/s]

Unnamed: 0,label,cleaned_article,num_punctuations,no_of_stopwords,no_of_quesMarks,no_of_exclamation,no_of_sentence,sentiment,cc_cd,determiner,...,noun,adverb,verb,article_len,negations,polarity,word_count,avg_word_len,num_unique_words,num_chars
0,0,host tell senate republican damn video republi...,84,630,2,0,36,Negative,27,101,...,247,46,196,5166,11,0.165890,371,6.636119,223,2832
1,1,drown boat capsizes northern india police delh...,19,112,0,0,5,Negative,8,10,...,35,2,26,718,1,-0.150000,62,6.532258,44,466
2,1,indonesian student support islamic caliphate s...,54,456,0,0,20,Positive,23,45,...,144,14,74,2971,0,0.038395,250,7.428000,171,2106
3,0,hillary clinton corrupt militaristic candidate...,9,78,0,0,2,Positive,3,11,...,30,1,17,560,0,0.022159,51,6.725490,37,393
4,0,brutally honest billboard turn head state expl...,34,286,1,0,11,Negative,7,45,...,120,12,49,2049,1,0.015278,174,7.155172,123,1418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44893,0,illegal alien college student protest freebie ...,20,116,2,1,10,Positive,4,10,...,45,5,31,953,0,-0.015278,73,6.726027,60,563
44894,0,people magazine cover prof hillary wildly unpo...,25,193,3,0,6,Positive,12,21,...,73,14,42,1517,3,0.099242,126,6.992063,92,1006
44895,0,great procoal oklahoma tapped head environment...,32,190,0,1,10,Positive,8,17,...,80,5,32,1318,0,0.142857,111,7.135135,84,902
44896,1,congressional committee subpoena exdrug shkrel...,66,365,1,0,17,Negative,9,37,...,130,13,77,2648,1,-0.158543,211,7.374408,151,1766


In [37]:
# There were few rows in article which has only blank space and no words. Therefore, we are dropping those rows
#drop_noword_article(df1)

In [38]:
#df1[df1['article_len']==0]

In [39]:
df1['cleaned_article'][0]

'host tell senate republican damn video republican senator orrin hatch tried justify chris cuomo itduring appearance tuesday senate judiciary committee member orrin hatch attempted defend decision senate republican refuse hold confirmation hearing president obama nominates vacant seat justice antonin scalia passed away saturdayever news broke scalia death republican clear allow president obama position bench opting instead wait year whomever elected president november office hope republicanbut host chris cuomo told hatch colleague damn constitution requires especially document conservative claim love president able pick judicial nominee election year vote judge election year final year presidency hypocrisy play cuomo bluntly like nominates process course like good little obstructionist hatch continued insist republican think real reason cuomo asked constitution hatch responded claiming president obama treated fairly senate republican know crock shit considering record obstructing presi

In [40]:
df1.to_csv("Cleaned/cleaned_data_1.csv", index = False)

## Similar steps has been performed on other 4 datasets

## Dataset 2    (Train 2)

In [41]:
df2 = pd.read_csv("Training_data/train.csv")
df2.head(2)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0


In [42]:
# As per the source in this dataset label 1 indicate news in unrelible means fake on the other hand 0 indicate relible means real new.
# 0 -> Real and 1 -> Fake
# Here I am converting label as per our problem statement.  0-> Fake and 1 -> Real

df2.label.replace({0:1, 1:0}, inplace = True)

In [43]:
df2['label'].value_counts()

0    10413
1    10387
Name: label, dtype: int64

In [44]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [45]:
df2.fillna(" ",inplace = True)
df2['cleaned_article'] = df2['title'] + " " + df2['text']

In [46]:
df2['cleaned_article'].isna().sum(), df2['cleaned_article'].isnull().sum()

(0, 0)

In [47]:
df2.dropna(inplace = True)
df2.drop(['id', 'author', 'title', 'text'], axis = 1, inplace = True)

In [48]:
df2.head()

Unnamed: 0,label,cleaned_article
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,0,Why the Truth Might Get You Fired Why the Trut...
3,0,15 Civilians Killed In Single US Airstrike Hav...
4,0,Iranian woman jailed for fictional unpublished...


In [49]:
df2 = data_cleaning(df2)
df2.head(2)

Cleaning......


  0%|          | 0/20800 [00:00<?, ?it/s]

  0%|          | 0/20800 [00:00<?, ?it/s]

  0%|          | 0/20800 [00:00<?, ?it/s]

Pre-Featurization
Number of punctuations


  0%|          | 0/20800 [00:00<?, ?it/s]

No of Stopwords


  0%|          | 0/20800 [00:00<?, ?it/s]

No of question marks


  0%|          | 0/20800 [00:00<?, ?it/s]

Number of Exclamation Marks


  0%|          | 0/20800 [00:00<?, ?it/s]

No of Sentences


  0%|          | 0/20800 [00:00<?, ?it/s]

Sentiment Analysis


  0%|          | 0/20800 [00:00<?, ?it/s]

POS TAGGING


100%|████████████████████████████████████████████████████████████████████████████| 20800/20800 [11:52<00:00, 29.19it/s]

Cleaning......
Special Char





  0%|          | 0/20800 [00:00<?, ?it/s]

Article Length


  0%|          | 0/20800 [00:00<?, ?it/s]

Lemmetization


  0%|          | 0/20800 [00:00<?, ?it/s]

Negative Words...


  0%|          | 0/20800 [00:00<?, ?it/s]

Preprocessing


  0%|          | 0/20800 [00:00<?, ?it/s]

Unnamed: 0,label,cleaned_article,num_punctuations,no_of_stopwords,no_of_quesMarks,no_of_exclamation,no_of_sentence,sentiment,cc_cd,determiner,foreign_w,conj_prep,adjective,modal,noun,adverb,verb,article_len,negations
0,0,house aide didnt comeys letter jason chaffetz ...,88,635,0,0,36,Positive,30,76,0,111,87,5,229,44,155,4848,7
1,1,flynn hillary clinton woman campus breitbart f...,77,504,4,0,29,Negative,31,73,0,107,78,5,185,39,108,4045,5


In [50]:
feature_engineering(df2)

Polarity


  0%|          | 0/20800 [00:00<?, ?it/s]

Word Count


  0%|          | 0/20800 [00:00<?, ?it/s]

Average Word Count


  0%|          | 0/20800 [00:00<?, ?it/s]

Number of Unique Words


  0%|          | 0/20800 [00:00<?, ?it/s]

Number of Chars


  0%|          | 0/20800 [00:00<?, ?it/s]

Unnamed: 0,label,cleaned_article,num_punctuations,no_of_stopwords,no_of_quesMarks,no_of_exclamation,no_of_sentence,sentiment,cc_cd,determiner,...,noun,adverb,verb,article_len,negations,polarity,word_count,avg_word_len,num_unique_words,num_chars
0,0,house aide didnt comeys letter jason chaffetz ...,88,635,0,0,36,Positive,30,76,...,229,44,155,4848,7,0.024100,381,6.874016,235,2999
1,1,flynn hillary clinton woman campus breitbart f...,77,504,4,0,29,Negative,31,73,...,185,39,108,4045,5,0.049707,314,6.777070,242,2441
2,0,truth fired truth fired october tension intell...,184,1023,4,0,50,Positive,46,134,...,347,69,218,7450,15,0.093199,576,6.925347,390,4564
3,0,civilian killed single airstrike identified vi...,51,521,0,0,26,Negative,33,67,...,138,16,119,3191,4,-0.001918,244,6.905738,146,1928
4,0,iranian woman jailed fictional unpublished sto...,16,148,0,0,5,Negative,6,19,...,53,2,36,992,2,0.039286,87,6.816092,61,679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20795,1,rapper trump poster child white supremacy rapp...,48,227,2,0,12,Positive,14,32,...,114,6,59,1780,0,0.069643,151,6.503311,120,1132
20796,1,playoff schedule matchup odds york time green ...,186,720,0,0,49,Positive,65,170,...,371,26,156,5807,19,-0.055288,452,6.103982,247,3210
20797,1,macys receive takeover approach hudson york ti...,108,636,0,0,41,Positive,46,79,...,257,32,151,4668,5,0.104762,396,6.537879,259,2984
20798,0,nato russia hold parallel exercise balkan nato...,33,268,0,0,14,Neutral,28,18,...,91,2,43,1738,1,-0.035973,155,7.083871,99,1252


In [51]:
#drop_noword_article(df2)

In [52]:
df2.to_csv("Cleaned/cleaned_data_2.csv", index = False)

## Dataset 3    (Train 3)

In [53]:
df3 = pd.read_csv("Training_data/full_dataset.csv")
df3.head(2)

Unnamed: 0,title,author,text,label
0,a news release,federation-american-immigration-reform,Unemployment has been on the rise throughout W...,1
1,"Black Turnout Soft in Early Voting, Boding Ill...",Henry Wolff,"Black Turnout Soft in Early Voting, Boding Ill...",1


In [54]:
# As per source label 1 represent fake whereas label 0 represents Real news
# Converting 0 to 1 and 1 to 0 as per our problem definition 
df3.label.replace({0:1, 1:0}, inplace = True)

In [55]:
df3['label'].value_counts()

1    16085
0    14853
Name: label, dtype: int64

In [56]:
df3.isna().sum()

title      558
author    1957
text        39
label        0
dtype: int64

In [57]:
df3.fillna(' ', inplace = True)

In [58]:
df3['cleaned_article'] = df3['title'] + " " + df3['text']

In [59]:
df3.drop(['title', 'author', 'text'], axis = 1 , inplace = True)

In [61]:
df3 = data_cleaning(df3)

Cleaning......


  0%|          | 0/30938 [00:00<?, ?it/s]

  0%|          | 0/30938 [00:00<?, ?it/s]

  0%|          | 0/30938 [00:00<?, ?it/s]

Pre-Featurization
Number of punctuations


  0%|          | 0/30938 [00:00<?, ?it/s]

No of Stopwords


  0%|          | 0/30938 [00:00<?, ?it/s]

No of question marks


  0%|          | 0/30938 [00:00<?, ?it/s]

Number of Exclamation Marks


  0%|          | 0/30938 [00:00<?, ?it/s]

No of Sentences


  0%|          | 0/30938 [00:00<?, ?it/s]

Sentiment Analysis


  0%|          | 0/30938 [00:00<?, ?it/s]

POS TAGGING


100%|████████████████████████████████████████████████████████████████████████████| 30938/30938 [10:56<00:00, 47.14it/s]

Cleaning......
Special Char





  0%|          | 0/30938 [00:00<?, ?it/s]

Article Length


  0%|          | 0/30938 [00:00<?, ?it/s]

Lemmetization


  0%|          | 0/30938 [00:00<?, ?it/s]

Negative Words...


  0%|          | 0/30938 [00:00<?, ?it/s]

Preprocessing


  0%|          | 0/30938 [00:00<?, ?it/s]

In [62]:
df3

Unnamed: 0,label,cleaned_article,num_punctuations,no_of_stopwords,no_of_quesMarks,no_of_exclamation,no_of_sentence,sentiment,cc_cd,determiner,foreign_w,conj_prep,adjective,modal,noun,adverb,verb,article_len,negations
0,0,news release unemployment rise wisconsin paul ...,1,18,0,0,1,Negative,0,2,0,4,0,0,10,1,3,131,0
1,0,black turnout soft early voting boding hillary...,47,261,0,0,11,Positive,17,25,0,41,43,2,103,15,48,1869,1
2,0,television interview milwaukee county safe year,1,14,0,0,1,Negative,2,3,0,1,1,0,6,3,2,94,1
3,0,thing learned general contractor foundation li...,258,1242,2,0,80,Positive,131,209,1,196,196,42,476,97,423,10541,19
4,1,statement responding rick scott state state sp...,4,11,0,0,3,Neutral,1,2,0,1,1,1,8,1,7,126,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30933,1,richardson passed nation giving national guard...,4,22,0,0,2,Positive,0,3,0,2,2,0,10,0,3,108,0
30934,0,khodorkovsky putin going cozy washington print...,121,553,11,0,38,Positive,19,62,0,81,69,19,184,46,161,4216,19
30935,0,coup stolen election rumor coup watching mediu...,74,378,7,6,37,Negative,18,56,0,62,69,7,164,25,130,3203,5
30936,1,doomed carrying brazilian team reportedly fuel...,87,504,0,2,32,Negative,24,83,0,91,51,5,202,24,121,3809,4


In [63]:
#df3['cleaned_article'] = df3['cleaned_article'].apply(lambda x : " ".join(x))
#df3.head(2)

In [64]:
feature_engineering(df3)

Polarity


  0%|          | 0/30938 [00:00<?, ?it/s]

Word Count


  0%|          | 0/30938 [00:00<?, ?it/s]

Average Word Count


  0%|          | 0/30938 [00:00<?, ?it/s]

Number of Unique Words


  0%|          | 0/30938 [00:00<?, ?it/s]

Number of Chars


  0%|          | 0/30938 [00:00<?, ?it/s]

Unnamed: 0,label,cleaned_article,num_punctuations,no_of_stopwords,no_of_quesMarks,no_of_exclamation,no_of_sentence,sentiment,cc_cd,determiner,...,noun,adverb,verb,article_len,negations,polarity,word_count,avg_word_len,num_unique_words,num_chars
0,0,news release unemployment rise wisconsin paul ...,1,18,0,0,1,Negative,0,2,...,10,1,3,131,0,0.000000,11,7.090909,11,88
1,0,black turnout soft early voting boding hillary...,47,261,0,0,11,Positive,17,25,...,103,15,48,1869,1,-0.023781,163,6.705521,116,1255
2,0,television interview milwaukee county safe year,1,14,0,0,1,Negative,2,3,...,6,3,2,94,1,0.500000,6,7.000000,6,47
3,0,thing learned general contractor foundation li...,258,1242,2,0,80,Positive,131,209,...,476,97,423,10541,19,0.176372,678,6.328909,391,4968
4,1,statement responding rick scott state state sp...,4,11,0,0,3,Neutral,1,2,...,8,1,7,126,0,0.000000,12,6.166667,9,85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30933,1,richardson passed nation giving national guard...,4,22,0,0,2,Positive,0,3,...,10,0,3,108,0,0.000000,9,6.777778,9,69
30934,0,khodorkovsky putin going cozy washington print...,121,553,11,0,38,Positive,19,62,...,184,46,161,4216,19,0.033517,302,6.801325,172,2355
30935,0,coup stolen election rumor coup watching mediu...,74,378,7,6,37,Negative,18,56,...,164,25,130,3203,5,-0.012531,250,6.420000,174,1854
30936,1,doomed carrying brazilian team reportedly fuel...,87,504,0,2,32,Negative,24,83,...,202,24,121,3809,4,-0.020977,300,6.740000,213,2321


In [65]:
df3.drop(df3[df3['article_len'] == 0].index, inplace = True)

In [66]:
#drop_noword_article(df3)

In [67]:
df3[df3['article_len'] == 0]

Unnamed: 0,label,cleaned_article,num_punctuations,no_of_stopwords,no_of_quesMarks,no_of_exclamation,no_of_sentence,sentiment,cc_cd,determiner,...,noun,adverb,verb,article_len,negations,polarity,word_count,avg_word_len,num_unique_words,num_chars


In [68]:
df3.to_csv("Cleaned/cleaned_data_3.csv", index = False)

In [69]:
df3.columns

Index(['label', 'cleaned_article', 'num_punctuations', 'no_of_stopwords',
       'no_of_quesMarks', 'no_of_exclamation', 'no_of_sentence', 'sentiment',
       'cc_cd', 'determiner', 'foreign_w', 'conj_prep', 'adjective', 'modal',
       'noun', 'adverb', 'verb', 'article_len', 'negations', 'polarity',
       'word_count', 'avg_word_len', 'num_unique_words', 'num_chars'],
      dtype='object')

## Dataset 4    (Test 1)

In [25]:
df4 = pd.read_csv("Test/data1.csv")
df4.head(2)

Unnamed: 0,id,title,author,text,label
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,1


In [26]:
df4.isna().sum()

id          0
title     122
author    503
text        7
label       0
dtype: int64

In [27]:
# As per source label 1 represent fake whereas label 0 represents Real news
# Converting 0 to 1 and 1 to 0 as per our problem definition 
df4.label.replace({0:1, 1:0}, inplace = True)

In [28]:
df4['label'].value_counts()

0    2861
1    2339
Name: label, dtype: int64

In [29]:
df4.fillna(' ', inplace = True)
df4['cleaned_article'] = df4['title'] + " " + df4['text']

In [30]:
df4.drop(['id', 'title', 'author', 'text'], axis = 1 , inplace = True)

In [31]:
df4.isna().sum()

label              0
cleaned_article    0
dtype: int64

In [32]:
df4 = data_cleaning(df4)
df4.head()

Cleaning......


  0%|          | 0/5200 [00:00<?, ?it/s]

  0%|          | 0/5200 [00:00<?, ?it/s]

  0%|          | 0/5200 [00:00<?, ?it/s]

Pre-Featurization
Number of punctuations


  0%|          | 0/5200 [00:00<?, ?it/s]

No of Stopwords


  0%|          | 0/5200 [00:00<?, ?it/s]

No of question marks


  0%|          | 0/5200 [00:00<?, ?it/s]

Number of Exclamation Marks


  0%|          | 0/5200 [00:00<?, ?it/s]

No of Sentences


  0%|          | 0/5200 [00:00<?, ?it/s]

Sentiment Analysis


  0%|          | 0/5200 [00:00<?, ?it/s]

POS TAGGING


100%|██████████████████████████████████████████████████████████████████████████████| 5200/5200 [02:15<00:00, 38.45it/s]

Cleaning......
Special Char





  0%|          | 0/5200 [00:00<?, ?it/s]

Article Length


  0%|          | 0/5200 [00:00<?, ?it/s]

Lemmetization


  0%|          | 0/5200 [00:00<?, ?it/s]

Negative Words...


  0%|          | 0/5200 [00:00<?, ?it/s]

Preprocessing


  0%|          | 0/5200 [00:00<?, ?it/s]

Unnamed: 0,label,cleaned_article,num_punctuations,no_of_stopwords,no_of_quesMarks,no_of_exclamation,no_of_sentence,sentiment,cc_cd,determiner,foreign_w,conj_prep,adjective,modal,noun,adverb,verb,article_len,negations
0,1,specter trump loosens tongue purse string sili...,212,999,0,0,75,Positive,56,115,2,139,146,11,396,61,265,7680,15
1,0,russian warship ready strike terrorist near al...,43,234,0,0,10,Negative,7,27,0,30,34,4,83,3,38,1564,1
2,1,nodapl native american leader stay winter file...,96,573,0,4,32,Negative,38,50,0,83,83,8,235,22,138,4390,1
3,0,tebow attempt comeback time baseball york time...,98,438,1,0,28,Positive,42,63,0,82,66,11,215,15,104,3448,4
4,0,keiser report meme view comment like time hist...,16,64,0,0,5,Positive,7,6,0,10,10,1,34,5,15,524,1


In [33]:
#df4['cleaned_article'] = df4['cleaned_article'].apply(lambda x : " ".join(x))

In [34]:
feature_engineering(df4)

Polarity


  0%|          | 0/5200 [00:00<?, ?it/s]

Word Count


  0%|          | 0/5200 [00:00<?, ?it/s]

Average Word Count


  0%|          | 0/5200 [00:00<?, ?it/s]

Number of Unique Words


  0%|          | 0/5200 [00:00<?, ?it/s]

Number of Chars


  0%|          | 0/5200 [00:00<?, ?it/s]

Unnamed: 0,label,cleaned_article,num_punctuations,no_of_stopwords,no_of_quesMarks,no_of_exclamation,no_of_sentence,sentiment,cc_cd,determiner,...,noun,adverb,verb,article_len,negations,polarity,word_count,avg_word_len,num_unique_words,num_chars
0,1,specter trump loosens tongue purse string sili...,212,999,0,0,75,Positive,56,115,...,396,61,265,7680,15,0.053472,623,6.778491,409,4845
1,0,russian warship ready strike terrorist near al...,43,234,0,0,10,Negative,7,27,...,83,3,38,1564,1,0.074048,132,7.015152,79,1057
2,1,nodapl native american leader stay winter file...,96,573,0,4,32,Negative,38,50,...,235,22,138,4390,1,-0.003599,373,6.863271,225,2932
3,0,tebow attempt comeback time baseball york time...,98,438,1,0,28,Positive,42,63,...,215,15,104,3448,4,0.168915,274,6.186131,191,1968
4,0,keiser report meme view comment like time hist...,16,64,0,0,5,Positive,7,6,...,34,5,15,524,1,0.084167,44,6.250000,39,318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5195,1,bangladeshi traffic york time dysfunction plag...,445,2440,1,4,174,Negative,159,352,...,939,139,527,17712,23,0.009160,1439,6.596247,920,10930
5196,0,john kasich sign abortion ohio veto restrictiv...,72,462,0,0,27,Negative,31,70,...,196,22,96,3353,5,0.129097,266,6.586466,174,2017
5197,1,california today exactly sushi york time good ...,159,746,4,0,65,Negative,43,89,...,323,34,170,5498,1,-0.012897,469,6.526652,376,3529
5198,0,marine deployed russian border norway previous...,31,325,2,0,18,Negative,8,56,...,105,29,97,2640,6,-0.049270,192,6.520833,131,1443


In [39]:
drop_noword_article(df4)

Unnamed: 0,label,cleaned_article,num_punctuations,no_of_stopwords,no_of_quesMarks,no_of_exclamation,no_of_sentence,sentiment,cc_cd,determiner,...,noun,adverb,verb,article_len,negations,polarity,word_count,avg_word_len,num_unique_words,num_chars
0,1,specter trump loosens tongue purse string sili...,212,999,0,0,75,Positive,56,115,...,396,61,265,7680,15,0.053472,623,6.778491,409,4845
1,0,russian warship ready strike terrorist near al...,43,234,0,0,10,Negative,7,27,...,83,3,38,1564,1,0.074048,132,7.015152,79,1057
2,1,nodapl native american leader stay winter file...,96,573,0,4,32,Negative,38,50,...,235,22,138,4390,1,-0.003599,373,6.863271,225,2932
3,0,tebow attempt comeback time baseball york time...,98,438,1,0,28,Positive,42,63,...,215,15,104,3448,4,0.168915,274,6.186131,191,1968
4,0,keiser report meme view comment like time hist...,16,64,0,0,5,Positive,7,6,...,34,5,15,524,1,0.084167,44,6.250000,39,318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5195,1,bangladeshi traffic york time dysfunction plag...,445,2440,1,4,174,Negative,159,352,...,939,139,527,17712,23,0.009160,1439,6.596247,920,10930
5196,0,john kasich sign abortion ohio veto restrictiv...,72,462,0,0,27,Negative,31,70,...,196,22,96,3353,5,0.129097,266,6.586466,174,2017
5197,1,california today exactly sushi york time good ...,159,746,4,0,65,Negative,43,89,...,323,34,170,5498,1,-0.012897,469,6.526652,376,3529
5198,0,marine deployed russian border norway previous...,31,325,2,0,18,Negative,8,56,...,105,29,97,2640,6,-0.049270,192,6.520833,131,1443


In [40]:
df4.to_csv("Test/test1.csv", index = False)

## Dataset 5     (Test 2)

In [82]:
df5 = pd.read_csv('Test/news.csv', usecols=['title', 'text', 'label'])
df5

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [83]:
df5.isna().sum()

title    0
text     0
label    0
dtype: int64

In [84]:
df5['cleaned_article'] = df5['title'] + " " + df5['text']

In [85]:
# Converting label into boolean value

df5['label'].replace(('REAL', 'FAKE'), (1, 0), inplace=True)
#or 
# df5.label.replace({"REAL":1, "FAKE":0}, inplace = True)

In [86]:
df5.head()

Unnamed: 0,title,text,label,cleaned_article
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0,You Can Smell Hillary’s Fear Daniel Greenfield...
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,Watch The Exact Moment Paul Ryan Committed Pol...
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1,Kerry to go to Paris in gesture of sympathy U....
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0,Bernie supporters on Twitter erupt in anger ag...
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1,The Battle of New York: Why This Primary Matte...


In [87]:
df5.drop(['title','text'], axis = 1 , inplace = True)

In [88]:
df5['label'].value_counts()

1    3171
0    3164
Name: label, dtype: int64

In [89]:
df5 = data_cleaning(df5)

Cleaning......


  0%|          | 0/6335 [00:00<?, ?it/s]

  0%|          | 0/6335 [00:00<?, ?it/s]

  0%|          | 0/6335 [00:00<?, ?it/s]

Pre-Featurization
Number of punctuations


  0%|          | 0/6335 [00:00<?, ?it/s]

No of Stopwords


  0%|          | 0/6335 [00:00<?, ?it/s]

No of question marks


  0%|          | 0/6335 [00:00<?, ?it/s]

Number of Exclamation Marks


  0%|          | 0/6335 [00:00<?, ?it/s]

No of Sentences


  0%|          | 0/6335 [00:00<?, ?it/s]

Sentiment Analysis


  0%|          | 0/6335 [00:00<?, ?it/s]

POS TAGGING


100%|██████████████████████████████████████████████████████████████████████████████| 6335/6335 [03:18<00:00, 31.90it/s]

Cleaning......
Special Char





  0%|          | 0/6335 [00:00<?, ?it/s]

Article Length


  0%|          | 0/6335 [00:00<?, ?it/s]

Lemmetization


  0%|          | 0/6335 [00:00<?, ?it/s]

Negative Words...


  0%|          | 0/6335 [00:00<?, ?it/s]

Preprocessing


  0%|          | 0/6335 [00:00<?, ?it/s]

In [90]:
#df5['cleaned_article'] = df5['cleaned_article'].apply(lambda x : " ".join(x))

In [91]:
feature_engineering(df5)

Polarity


  0%|          | 0/6335 [00:00<?, ?it/s]

Word Count


  0%|          | 0/6335 [00:00<?, ?it/s]

Average Word Count


  0%|          | 0/6335 [00:00<?, ?it/s]

Number of Unique Words


  0%|          | 0/6335 [00:00<?, ?it/s]

Number of Chars


  0%|          | 0/6335 [00:00<?, ?it/s]

Unnamed: 0,label,cleaned_article,num_punctuations,no_of_stopwords,no_of_quesMarks,no_of_exclamation,no_of_sentence,sentiment,cc_cd,determiner,...,noun,adverb,verb,article_len,negations,polarity,word_count,avg_word_len,num_unique_words,num_chars
0,0,smell hillary fear daniel greenfield shillman ...,107,1061,3,0,87,Negative,51,158,...,339,74,253,7364,9,0.029606,544,6.755515,373,4218
1,0,watch exact moment paul ryan committed politic...,60,331,0,0,25,Positive,20,44,...,132,23,82,2584,3,0.028472,208,6.658654,172,1592
2,1,kerry paris gesture sympathy secretary state j...,43,358,0,0,16,Positive,12,38,...,131,12,77,2520,6,0.018500,217,6.460829,154,1618
3,0,bernie supporter twitter erupt anger tried war...,76,337,0,2,17,Negative,28,39,...,122,20,65,2545,3,-0.006044,214,7.186916,161,1751
4,1,battle york primary matter primary york frontr...,53,210,0,2,21,Positive,21,34,...,94,11,67,1833,2,0.111458,144,6.284722,91,1048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6330,1,state department email clinton specialist stat...,78,569,0,1,27,Negative,23,62,...,201,24,145,4023,7,-0.027560,326,6.963190,152,2595
6331,0,stand plutocratic pentagon stand plutocratic p...,383,1929,1,0,48,Negative,127,213,...,646,80,267,13722,12,0.028696,1073,7.558248,739,9182
6332,0,antitrump protester tool oligarchy information...,214,1476,6,0,100,Negative,75,237,...,569,55,358,11769,13,0.000110,912,7.162281,531,7443
6333,1,ethiopia obama seek progress peace security ea...,157,981,0,0,40,Positive,57,109,...,323,31,188,6764,4,0.055563,562,6.925267,361,4453


In [92]:
#drop_noword_article(df5)

In [93]:
df5.cleaned_article[0]



In [94]:
df5.to_csv("Test/test2.csv", index = False)

### Combining train data into one dataset for model training

In [3]:
df123 = pd.concat([df1, df2, df3], axis = 0) # Concatinating first three datasets for training purpose
df123 = df123.sample(frac = 1) # Mixing/sampling the dataset
df123.reset_index(inplace = True, drop = True) # Resetting index number
df123.dropna(inplace = True)
df123.to_csv('training_set.csv', index = False)