In [3]:
#Import Dependencies
import pandas as pd
import numpy as np
import nltk
import string
import regex as re
import contractions

In [4]:
#Load Resources into Pandas Dataframes
# Read the three source CSVs in a fixed order (Fake, True, news) and bind each
# resulting frame to its own notebook-level name.
fake_df, real_df, news_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Resources/Fake.csv",
        "../Resources/True.csv",
        "../Resources/news.csv",
    )
)

In [5]:
#Add label values for each dataset (1 = fake, 0 = real)
# Every row loaded from Fake.csv is fake and every row from True.csv is real.
fake_df['target'] = 1
real_df['target'] = 0

# news_df carries a string label instead. A vectorized comparison replaces the
# row-by-row Python loop: "FAKE" maps to 1 and every other value (including a
# missing label) maps to 0, exactly as the loop's else-branch did.
news_df['target'] = news_df['label'].eq("FAKE").astype('int64')

# Kept because the original cell exposed this list at notebook scope.
target_list = news_df['target'].tolist()

In [7]:
# Show the column names of news_df (after the 'target' column was added).
news_df.columns

Index(['Unnamed: 0', 'title', 'text', 'label', 'target'], dtype='object')

In [8]:
# Show the column names of fake_df (after the 'target' column was added).
fake_df.columns

Index(['title', 'text', 'subject', 'date', 'target'], dtype='object')

In [9]:
# Show the column names of real_df (after the 'target' column was added).
real_df.columns

Index(['title', 'text', 'subject', 'date', 'target'], dtype='object')

In [10]:
#Drop extra columns
# Leave each frame with only title, text and target so the three can be stacked.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError because the
# columns are already gone.
# NOTE(review): errors='ignore' also hides a genuinely missing column on the
# first run; the column listings above are the check for that.
news_df = news_df.drop(columns=['Unnamed: 0','label'], errors='ignore')
fake_df = fake_df.drop(columns=['subject','date'], errors='ignore')
real_df = real_df.drop(columns=['subject','date'], errors='ignore')

In [11]:
#Combine datasets
# Stack the three frames row-wise in the order news, fake, real and renumber the
# index from 0 so the combined frame has a single continuous RangeIndex.
fake_news_df = pd.concat(
    objs=[news_df, fake_df, real_df],
    ignore_index=True,
)

In [12]:
# Preview the first five rows of the combined frame.
fake_news_df.head()

Unnamed: 0,title,text,target
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0


In [13]:
# Preview the last five rows of the combined frame.
fake_news_df.tail()

Unnamed: 0,title,text,target
51228,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,0
51229,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",0
51230,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,0
51231,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,0
51232,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,0


In [14]:
# Count rows per class (1 = fake, 0 = real) to check the class balance.
fake_news_df['target'].value_counts()

1    26645
0    24588
Name: target, dtype: int64

In [15]:
# Print dtypes and non-null counts for each column of the combined frame.
fake_news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51233 entries, 0 to 51232
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   51233 non-null  object
 1   text    51233 non-null  object
 2   target  51233 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [16]:
##Source: https://towardsdatascience.com/primer-to-cleaning-text-data-7e856d6e5791

# The stopword corpus is not bundled with nltk. Fetch it once if it is missing so
# this cell also works on a fresh environment instead of raising LookupError.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

stopword_list = nltk.corpus.stopwords.words('english')
#Define function for removing stopwords
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer( )
def remove_stopwords(text, is_lower_case=False):
    """Return `text` with English stopwords removed.

    The text is split with the module-level Toktok `tokenizer`, surrounding
    whitespace is stripped from each token, tokens found in `stopword_list`
    are discarded, and the rest are joined with single spaces.

    Parameters
    ----------
    text : str
        The text to filter.
    is_lower_case : bool, default False
        Pass True when `text` is already lowercased; tokens are then compared
        as-is. With False, each token is lowercased for the comparison only,
        so capitalised stopwords such as "The" are removed as well while the
        surviving tokens keep their original casing.

    Returns
    -------
    str
        The remaining tokens joined by a single space.
    """
    # Built per call so later edits to stopword_list are still honoured, while
    # each token lookup is a hash probe instead of a scan over the whole list.
    stopwords = set(stopword_list)
    tokens = [token.strip( ) for token in tokenizer.tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower( ) not in stopwords]
    return ' '.join(filtered_tokens)

In [17]:
#List to store cleaned text strings
cleaned_text_list = []
# Matches every character that is not an ASCII letter, a digit or whitespace.
# The original class was written 'A-z', which also admits [ \ ] ^ _ ` ; it is
# 'A-Z' here. The raw string avoids the invalid '\s' escape warning. Compiled
# once, outside the loop.
non_alnum_pattern = re.compile(r'[^a-zA-Z0-9\s]')
#Clean texts by expanding contractions, lowercasing, removing punctuation and special characters, and removing stopwords
# NOTE(review): contractions.fix appears to rewrite "U.S." so that it ends up as
# "yous" (see the cleaned_text preview for the Kerry article) - confirm whether
# that expansion is wanted before training on this column.
for text in fake_news_df['text']:
    text = contractions.fix(text)
    text = text.lower()
    # With the corrected class this also strips every string.punctuation
    # character, so the separate str.translate pass is no longer needed.
    text = non_alnum_pattern.sub('', text)
    filtered = remove_stopwords(text)
    cleaned_text_list.append(filtered)


In [18]:
#List to store cleaned title strings
cleaned_title_list = []
# Matches every character that is not an ASCII letter, a digit or whitespace.
# The original class was written 'A-z', which also admits [ \ ] ^ _ ` ; it is
# 'A-Z' here. The raw string avoids the invalid '\s' escape warning. Compiled
# once, outside the loop.
title_non_alnum_pattern = re.compile(r'[^a-zA-Z0-9\s]')
#Clean titles by expanding contractions, lowercasing, removing punctuation and special characters, and removing stopwords
for title in fake_news_df['title']:
    title = contractions.fix(title)
    title = title.lower()
    # With the corrected class this also strips every string.punctuation
    # character, so the separate str.translate pass is no longer needed.
    title = title_non_alnum_pattern.sub('', title)
    filtered = remove_stopwords(title)
    cleaned_title_list.append(filtered)


In [19]:
#Create columns for cleaned titles and texts
# Append the two cleaned lists as new columns (title first, then text); both
# lists were built by iterating the frame's rows in order, so they align with it.
fake_news_df = fake_news_df.assign(
    cleaned_title=cleaned_title_list,
    cleaned_text=cleaned_text_list,
)
fake_news_df.head()

Unnamed: 0,title,text,target,cleaned_title,cleaned_text
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1,smell hillarys fear,daniel greenfield shillman journalism fellow f...
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1,watch exact moment paul ryan committed politic...,google pinterest digg linkedin reddit stumbleu...
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0,kerry go paris gesture sympathy,yous secretary state john f kerry said monday ...
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1,bernie supporters twitter erupt anger dnc trie...,kaydee king kaydeeking november 9 2016 lesson ...
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0,battle new york primary matters,primary day new york frontrunners hillary clin...


In [21]:
#Create a Dataframe of the cleaned columns and label values
export_columns = ['cleaned_title', 'cleaned_text', 'target']
clean_df = fake_news_df[export_columns]

#Export to CSV for use in classification models
# NOTE(review): the index is written as an unnamed first column, so reading this
# file back yields an 'Unnamed: 0' column unless index_col=0 is used - confirm
# what the downstream notebooks expect before switching to index=False.
clean_df.to_csv(path_or_buf="../Resources/expanded_clean_dataset.csv")