##### Import necessary libraries

In [1]:
from utils import pickle_to, pickle_from, ignore_warnings
import pandas as pd
import pickle
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Show full column contents when displaying DataFrames. `None` means "no
# truncation"; the old sentinel -1 was deprecated in pandas 1.0 and is
# rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)


# Scrub data
###### Encoding the `label` values as 0 or 1 in a new `numeric_label` column

Encoding | Label
-- | --
0 | FAKE
1 | REAL



In [2]:
def encode_label(label):
    """Map a news label to its numeric code.

    "REAL" becomes 1 and "FAKE" becomes 0; any other value yields None.
    """
    if label == "REAL":
        return 1
    if label == "FAKE":
        return 0
    return None

In [3]:
# Load the raw dataset from its pickle file with the project helper pickle_from
raw = pickle_from('../data/raw/raw.pkl')



Loaded file from ../data/raw/raw.pkl


In [None]:
# Encode every value of the 'label' column with encode_label and store the
# result in a new 'numeric_label' column, then preview the first rows.

raw['numeric_label'] = raw['label'].apply(encode_label)
raw.head()

In [None]:
# Inspect the rows whose text contains the substring 'turtle'
raw[raw.text.str.contains('turtle')]

In [None]:
# Removing rows whose text contains the string 'sweeping consequence'.
# These have been observed to be duplicates of a single line.
# na=False treats a missing text as "no match" so the boolean mask can be
# negated safely; regex=False makes the search a plain substring test.

raw = raw[~raw['text'].str.contains('sweeping consequence', na=False, regex=False)]




In [None]:
# Removing rows whose text contains the string 'automatically',
# as this is part of html code.
# na=False treats a missing text as "no match" so the boolean mask can be
# negated safely; regex=False makes the search a plain substring test.

raw = raw[~raw['text'].str.contains('automatically', na=False, regex=False)]

# Drop the row labelled 220 and renumber the remaining rows from 0.
# reset_index(drop=True) discards the old index directly instead of adding
# it as an 'index' column and then dropping that column again.
# NOTE(review): the reason row 220 is removed is not recorded here, and
# because the index is renumbered, re-running this cell drops a different
# row each time -- confirm, and consider selecting the row by its content.
raw = raw.drop(220).reset_index(drop=True)





In [None]:
# Keep only the 'title', 'text' and 'numeric_label' columns and drop every
# row that has a null value in any of them
data = raw[['title','text','numeric_label']].dropna()


In [None]:
# Print a summary of the columns of `data`
data.info()


##### Combining the title and text data into a single column named 'news'

In [None]:
# Join title and text, separated by '. ', into the new 'news' column
data['news'] = data['title'] + '. '  + data['text']

In [None]:
# Preview the first rows, now including the 'news' column
data.head()

## Cleaning text data

Cleaning involves the following steps: <br>

* converting everything to lowercase
* removing punctuation
* removing numbers
* removing non-English words


In [None]:
# Converting to lowercase and replacing typographic apostrophes with plain ones
data['text'] = data['text'].str.lower()
data['text'] = data['text'].str.replace("’", "'", regex=False)


# Removing all punctuation (anything that is neither a word character nor
# whitespace). regex=True is explicit because pandas 2.0 changed the default
# of Series.str.replace to literal matching, which would make this a no-op.
data['text'] = data['text'].str.replace(r'[^\w\s]', '', regex=True)

# Removing numbers
data['text'] = data['text'].str.replace(r'\d+', ' ', regex=True)

# Collapsing every run of spaces into a single space. A literal
# '  ' -> ' ' replacement only shortens each run, so three or more
# consecutive spaces would still leave a double space behind.
data['text'] = data['text'].str.replace(r' {2,}', ' ', regex=True)


In [None]:
# Spot-check five randomly sampled rows after cleaning
data.sample(5)



In [None]:
# Work on an independent copy holding only the 'numeric_label' and 'text' columns
text_data = data[['numeric_label','text']].copy()



In [None]:
# Print a summary of the columns of `text_data`
text_data.info()

In [None]:
# Show the first row of `text_data`
text_data.head(1)

### Tokenize & Lemmatize

In [None]:
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

def lemmatize_text(text):
    """Split *text* on whitespace and return the list of lemmatized tokens.

    Each token is passed to the WordNet lemmatizer without an explicit
    part-of-speech argument.
    """
    tokens = w_tokenizer.tokenize(text)
    return list(map(lemmatizer.lemmatize, tokens))



In [None]:
# Run lemmatize_text on every text and store the resulting token lists in 'tokenized'
text_data['tokenized'] = text_data.text.apply(lemmatize_text)


In [None]:
# Show the first row, now including the 'tokenized' column
text_data.head(1)


### Remove English stopwords

In [None]:
# Start from NLTK's English stopword list
stop_words = stopwords.words('english')


In [None]:
# Additional words to filter out, appended to the stopword list in one call
stop_words.extend([
    'tweet',
    'home',
    'headlines',
    'finance',
    'news',
    'was',
    'has',
    'said',
    'wa',
    'ha',
    'leave',
    'comment',
    'loading',
    'eaten',
    'matrix',
    'extraterrestrial',
    'wi',
    'ivt',
])



In [None]:
# Save the extended stopword list with the project helper pickle_to
pickle_to(stop_words,'stop_words.pkl')

In [None]:
# Drop stopwords from every token list. The membership test uses a set
# because looking a token up in the stop_words list is a linear scan.
stop_word_set = set(stop_words)
text_data['tokenized'] = text_data['tokenized'].apply(
    lambda tokens: [token for token in tokens if token not in stop_word_set]
)

# Remove words that have less than 3 characters from the 'text' column.
# regex=True is explicit because pandas 2.0 changed the default of
# Series.str.replace to literal matching, which would make this a no-op.
# NOTE: only 'text' is rewritten here; the 'tokenized' column is not
# recomputed from it, so short tokens remain in 'tokenized'.
text_data['text'] = text_data['text'].str.replace(r'\b(\w{1,2})\b', ' ', regex=True)

In [None]:
# Create a new column that has the number of tokens of each text.
# Taking len() of the 'tokenized' column directly avoids building a row
# object for every record, which DataFrame.apply(axis=1) does.

text_data['token_length'] = text_data['tokenized'].apply(len)


In [None]:
# Inspect the rows whose text contains 'phrase block'
text_data[text_data['text'].str.contains('phrase block')]


In [None]:
# Check whether any text still contains 'automatically' (first matching rows)
text_data[text_data.text.str.contains('automatically')].head()


In [None]:
# Preview the first rows of the processed `text_data`
text_data.head()


### Pickling

In [None]:
# Pickling: save both the tokenized `text_data` and the cleaned `data`
# frames with the project helper pickle_to

pickle_to(text_data,'text_data.pkl')
pickle_to(data,'data.pkl')



In [None]:
# Show the rows of `raw` labelled 'REAL'
raw[raw['label'] == 'REAL']

In [None]:
# Reload `raw` from a pickle file, replacing the filtered frame built above.
# NOTE(review): this path ('raw.pkl') differs from the '../data/raw/raw.pkl'
# used for the initial load -- confirm which location is intended.
raw = pickle_from('raw.pkl')

In [None]:
# Show the rows of `raw` labelled 'FAKE'
raw[raw['label'] == 'FAKE']