In [1]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import random

In [2]:
df = pd.read_csv("data/fakenews.csv")
df

Unnamed: 0,title,Target
0,Trump Will ANNHILILATE The Working Class He’s...,Fake
1,Yemen Houthis say have shot down U.S. surveill...,True
2,BREAKING NEWS: TRUMP ADMIN ANNOUNCES “MASSIVE ...,Fake
3,U.S. Commerce Secretary wants NAFTA autos cont...,True
4,U.S. warns of repercussions for Pakistan over ...,True
...,...,...
44893,Germany’s Angela Merkel Makes Incredibly Naive...,Fake
44894,MEDIA IGNORES! HOUSE DEM IT SUSPECTS Wanted Un...,Fake
44895,Obama administration completes rule to curb me...,True
44896,Meeting between Egyptian foreign minister and ...,True


In [4]:
df['tokens'] = df['title'].apply(nltk.word_tokenize)
df

Unnamed: 0,title,Target,tokens
0,Trump Will ANNHILILATE The Working Class He’s...,Fake,"[Trump, Will, ANNHILILATE, The, Working, Class..."
1,Yemen Houthis say have shot down U.S. surveill...,True,"[Yemen, Houthis, say, have, shot, down, U.S., ..."
2,BREAKING NEWS: TRUMP ADMIN ANNOUNCES “MASSIVE ...,Fake,"[BREAKING, NEWS, :, TRUMP, ADMIN, ANNOUNCES, “..."
3,U.S. Commerce Secretary wants NAFTA autos cont...,True,"[U.S., Commerce, Secretary, wants, NAFTA, auto..."
4,U.S. warns of repercussions for Pakistan over ...,True,"[U.S., warns, of, repercussions, for, Pakistan..."
...,...,...,...
44893,Germany’s Angela Merkel Makes Incredibly Naive...,Fake,"[Germany, ’, s, Angela, Merkel, Makes, Incredi..."
44894,MEDIA IGNORES! HOUSE DEM IT SUSPECTS Wanted Un...,Fake,"[MEDIA, IGNORES, !, HOUSE, DEM, IT, SUSPECTS, ..."
44895,Obama administration completes rule to curb me...,True,"[Obama, administration, completes, rule, to, c..."
44896,Meeting between Egyptian foreign minister and ...,True,"[Meeting, between, Egyptian, foreign, minister..."


In [5]:
from datetime import date
from pprint import pprint

In [6]:
from marshmallow import Schema, fields, validate, ValidationError

## Make a validation method using marshmallow

The validation method will be based upon the known words to the data as well as length of the text and score it accordingly to validate the data

In [7]:
tokenized_words = list()

for i in df['tokens']:
    for j in i:
        tokenized_words.append(j.lower())

len(tokenized_words)
# print(tokenized_words)

657133

In [8]:
tokenized_words = set(tokenized_words)
len(tokenized_words)

27023

In [9]:
def validateWords(title: str):
    tokenized_title = nltk.word_tokenize(title)
    tokenized_title = [x.lower() for x in tokenized_title]
    unknown_words = [t for t in tokenized_title if t not in tokenized_words]
    validation_score = (len(tokenized_title) - len(unknown_words))/len(tokenized_title)
    # print(f"The unknown words in the title are {unknown_words}")

    if validation_score < 0.1 and len(unknown_words) > 0:
        # Uncomment the line below to create a validation error
        # raise ValidationError(f"Too many unknown words: {unknown_words}")
        # print(f"Too many unknown words: {unknown_words}")
        return False
    else:
        # print(f'This title is accepted on the basis of our existing data')
        return True


In [10]:
print(validateWords("mentos demn bruv"))

#* It scores according to the existing data
print(validateWords("Donald"))

False
True


In [11]:
testTitle = df['title'][65]
testTitle

'Son-in-law Kushner poised to wield clout in Trump presidency'

In [12]:
print(validateWords(testTitle))

True


In [13]:
validationList = []
declinedCount = 0

for i in df['title']:
    if validateWords(i):
        validationList.append('Accepted')
    else:
        validationList.append('Declined')
        declinedCount += 0

#! The declinedCount should be 0 because the validation function is based on the existing title data
declinedCount

0

In [14]:
df['Title Validation'] = validationList
df

Unnamed: 0,title,Target,tokens,Title Validation
0,Trump Will ANNHILILATE The Working Class He’s...,Fake,"[Trump, Will, ANNHILILATE, The, Working, Class...",Accepted
1,Yemen Houthis say have shot down U.S. surveill...,True,"[Yemen, Houthis, say, have, shot, down, U.S., ...",Accepted
2,BREAKING NEWS: TRUMP ADMIN ANNOUNCES “MASSIVE ...,Fake,"[BREAKING, NEWS, :, TRUMP, ADMIN, ANNOUNCES, “...",Accepted
3,U.S. Commerce Secretary wants NAFTA autos cont...,True,"[U.S., Commerce, Secretary, wants, NAFTA, auto...",Accepted
4,U.S. warns of repercussions for Pakistan over ...,True,"[U.S., warns, of, repercussions, for, Pakistan...",Accepted
...,...,...,...,...
44893,Germany’s Angela Merkel Makes Incredibly Naive...,Fake,"[Germany, ’, s, Angela, Merkel, Makes, Incredi...",Accepted
44894,MEDIA IGNORES! HOUSE DEM IT SUSPECTS Wanted Un...,Fake,"[MEDIA, IGNORES, !, HOUSE, DEM, IT, SUSPECTS, ...",Accepted
44895,Obama administration completes rule to curb me...,True,"[Obama, administration, completes, rule, to, c...",Accepted
44896,Meeting between Egyptian foreign minister and ...,True,"[Meeting, between, Egyptian, foreign, minister...",Accepted


In [15]:
print(validateWords("Gujarati woman leaves Islam and gifts Rs 12 lakh crown to Shivji’s statue? "))
#* Testing the function with a new title from dfrac

True


## Testing with new data by scraping from dfrac

In [16]:
from bs4 import BeautifulSoup
import requests

### Webscraping dfrac.org using WebScraping

In [17]:
text_list = [] # List to store the news titles

# Going through the page
for n in range(0, 83): # Loop is for going through the pages 159 pages of data were available
    pg = requests.get(f"https://dfrac.org/en/topic/fake/page/{n+1}/")
    cnt = pg.content

    soup = BeautifulSoup(cnt, 'html.parser')

    headline = soup.find_all('div', class_= 'read-title')
    for i in headline:
        title = i.find('h4').get_text()
        title = title.strip()
        # Cleaning the data (Removing Unnecessary Parts)
        title = title.replace("- Read Fact Check", "").replace(" Read- Fact Check", "").replace("Read, Fact-Check", "").replace("Fact Check: ", "").replace(" Read Fact Check", "").replace("Fact Check-", "").replace('FactCheck:', '').replace('Fact-Check: ', '')
        title = title.strip()
        text_list.append(title)


data = {
    'Headline' : text_list,
    'Target' : ["false"] * len(text_list),
}

# Making a dataframe of the data
df2 = pd.DataFrame(data)
df2 = df2.drop_duplicates()

#! Need to update the web scraping code

In [18]:
df2

Unnamed: 0,Headline,Target
0,Did Journalist Ajit Anjum decide to leave jour...,false
1,Is Ram Navami not a public holiday in India? k...,false
2,"Manish Kashyap be released soon,Madurai court ...",false
3,Was the first iftar party of independent India...,false
4,Did Rahul Gandhi visit Cambridge University as...,false
...,...,...
1645,What Nehru inherited from the British?,false
1646,Filmmaker Vivek Agnihotri posts fake quote by ...,false
1647,New media Twitter account with 1.5 Lakh follow...,false
1648,Consulting Editor of News Nation TV posts a do...,false


In [19]:
newTitleList = []

for i in df2['Headline']:
    if i not in df['title']:
        newTitleList.append(i)

print(f"The lengths of:\nOld Data = {len(df['title'])}\nCollected Data = {len(df2['Headline'])}\nNew Data = {len(newTitleList)}")

The lengths of:
Old Data = 44898
Collected Data = 832
New Data = 832


In [20]:
newValidationList = []

for i in newTitleList:
    if validateWords(i):
        newValidationList.append('Accepted')
    else:
        newValidationList.append('Declined')

In [21]:
if 'Declined' in newValidationList:
    print('There are some titles that have been declined')
    #TODO Need to add code here for getting declined title indexes 
else:
    print('All of the titles have been accepted')

There are some titles that have been declined


In [22]:
df

Unnamed: 0,title,Target,tokens,Title Validation
0,Trump Will ANNHILILATE The Working Class He’s...,Fake,"[Trump, Will, ANNHILILATE, The, Working, Class...",Accepted
1,Yemen Houthis say have shot down U.S. surveill...,True,"[Yemen, Houthis, say, have, shot, down, U.S., ...",Accepted
2,BREAKING NEWS: TRUMP ADMIN ANNOUNCES “MASSIVE ...,Fake,"[BREAKING, NEWS, :, TRUMP, ADMIN, ANNOUNCES, “...",Accepted
3,U.S. Commerce Secretary wants NAFTA autos cont...,True,"[U.S., Commerce, Secretary, wants, NAFTA, auto...",Accepted
4,U.S. warns of repercussions for Pakistan over ...,True,"[U.S., warns, of, repercussions, for, Pakistan...",Accepted
...,...,...,...,...
44893,Germany’s Angela Merkel Makes Incredibly Naive...,Fake,"[Germany, ’, s, Angela, Merkel, Makes, Incredi...",Accepted
44894,MEDIA IGNORES! HOUSE DEM IT SUSPECTS Wanted Un...,Fake,"[MEDIA, IGNORES, !, HOUSE, DEM, IT, SUSPECTS, ...",Accepted
44895,Obama administration completes rule to curb me...,True,"[Obama, administration, completes, rule, to, c...",Accepted
44896,Meeting between Egyptian foreign minister and ...,True,"[Meeting, between, Egyptian, foreign, minister...",Accepted


### Converting the freshly scraped data into our needed Dataframe model to concatenate to the older one

In [23]:
new_df = pd.DataFrame(columns=['title','Target','tokens','Title Validation'])
new_df

Unnamed: 0,title,Target,tokens,Title Validation


In [24]:
new_df['title'] = df2['Headline']
new_df['Target'] = 'Fake'
new_df

Unnamed: 0,title,Target,tokens,Title Validation
0,Did Journalist Ajit Anjum decide to leave jour...,Fake,,
1,Is Ram Navami not a public holiday in India? k...,Fake,,
2,"Manish Kashyap be released soon,Madurai court ...",Fake,,
3,Was the first iftar party of independent India...,Fake,,
4,Did Rahul Gandhi visit Cambridge University as...,Fake,,
...,...,...,...,...
1645,What Nehru inherited from the British?,Fake,,
1646,Filmmaker Vivek Agnihotri posts fake quote by ...,Fake,,
1647,New media Twitter account with 1.5 Lakh follow...,Fake,,
1648,Consulting Editor of News Nation TV posts a do...,Fake,,


In [25]:
new_df['tokens'] = new_df['title'].apply(nltk.word_tokenize)
new_df['Title Validation'] = newValidationList
new_df

Unnamed: 0,title,Target,tokens,Title Validation
0,Did Journalist Ajit Anjum decide to leave jour...,Fake,"[Did, Journalist, Ajit, Anjum, decide, to, lea...",Accepted
1,Is Ram Navami not a public holiday in India? k...,Fake,"[Is, Ram, Navami, not, a, public, holiday, in,...",Accepted
2,"Manish Kashyap be released soon,Madurai court ...",Fake,"[Manish, Kashyap, be, released, soon, ,, Madur...",Accepted
3,Was the first iftar party of independent India...,Fake,"[Was, the, first, iftar, party, of, independen...",Accepted
4,Did Rahul Gandhi visit Cambridge University as...,Fake,"[Did, Rahul, Gandhi, visit, Cambridge, Univers...",Accepted
...,...,...,...,...
1645,What Nehru inherited from the British?,Fake,"[What, Nehru, inherited, from, the, British, ?]",Accepted
1646,Filmmaker Vivek Agnihotri posts fake quote by ...,Fake,"[Filmmaker, Vivek, Agnihotri, posts, fake, quo...",Accepted
1647,New media Twitter account with 1.5 Lakh follow...,Fake,"[New, media, Twitter, account, with, 1.5, Lakh...",Accepted
1648,Consulting Editor of News Nation TV posts a do...,Fake,"[Consulting, Editor, of, News, Nation, TV, pos...",Accepted


In [26]:
len(new_df)

832

Combined both old and new dataframes into one

In [27]:
df_frames = [df, new_df]
df = pd.concat(df_frames)
df

Unnamed: 0,title,Target,tokens,Title Validation
0,Trump Will ANNHILILATE The Working Class He’s...,Fake,"[Trump, Will, ANNHILILATE, The, Working, Class...",Accepted
1,Yemen Houthis say have shot down U.S. surveill...,True,"[Yemen, Houthis, say, have, shot, down, U.S., ...",Accepted
2,BREAKING NEWS: TRUMP ADMIN ANNOUNCES “MASSIVE ...,Fake,"[BREAKING, NEWS, :, TRUMP, ADMIN, ANNOUNCES, “...",Accepted
3,U.S. Commerce Secretary wants NAFTA autos cont...,True,"[U.S., Commerce, Secretary, wants, NAFTA, auto...",Accepted
4,U.S. warns of repercussions for Pakistan over ...,True,"[U.S., warns, of, repercussions, for, Pakistan...",Accepted
...,...,...,...,...
1645,What Nehru inherited from the British?,Fake,"[What, Nehru, inherited, from, the, British, ?]",Accepted
1646,Filmmaker Vivek Agnihotri posts fake quote by ...,Fake,"[Filmmaker, Vivek, Agnihotri, posts, fake, quo...",Accepted
1647,New media Twitter account with 1.5 Lakh follow...,Fake,"[New, media, Twitter, account, with, 1.5, Lakh...",Accepted
1648,Consulting Editor of News Nation TV posts a do...,Fake,"[Consulting, Editor, of, News, Nation, TV, pos...",Accepted


### Making CSV file from the dataframe

In [28]:
df.to_csv("./data/updated_data.csv")