In [14]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import random

## Load data

In [2]:
TRAIN_CSV_PATH = "./data/train.csv"
data = pd.read_csv(TRAIN_CSV_PATH, index_col=False)
data.head(3)

Unnamed: 0,Statement,Label
0,Says the Annies List political group supports ...,False
1,When did the decline of coal start? It started...,True
2,"Hillary Clinton agrees with John McCain ""by vo...",True


## Clean digits, punctuation and symbols

In [3]:
def clean_text(x):
    text = ''.join([i for i in x if not i.isdigit()])
    text = re.sub("[!\?@^&.,/#$+%*:()'\"-]", ' ', text)
    return text

data['cleaned_statement'] = data.loc[:,'Statement'].apply(clean_text)
data.head(3)

Unnamed: 0,Statement,Label,cleaned_statement
0,Says the Annies List political group supports ...,False,Says the Annies List political group supports ...
1,When did the decline of coal start? It started...,True,When did the decline of coal start It started...
2,"Hillary Clinton agrees with John McCain ""by vo...",True,Hillary Clinton agrees with John McCain by vo...


## Download NLTK Stopwords model

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/louis/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Clean stopwords and lowercase

In [6]:
def clean_stopwords(x):
    text = x.split() 
    filtered_words = [word.lower() for word in text if word not in stopwords.words('english')]
    return filtered_words

data['text_array'] = data.loc[:,'cleaned_statement'].apply(clean_stopwords)
data.head(3)

Unnamed: 0,Statement,Label,cleaned_statement,text_array
0,Says the Annies List political group supports ...,False,Says the Annies List political group supports ...,"[says, annies, list, political, group, support..."
1,When did the decline of coal start? It started...,True,When did the decline of coal start It started...,"[when, decline, coal, start, it, started, natu..."
2,"Hillary Clinton agrees with John McCain ""by vo...",True,Hillary Clinton agrees with John McCain by vo...,"[hillary, clinton, agrees, john, mccain, votin..."


## Create text tokens from data

In [28]:
word_to_ix = {}
total_text_array = []
for text_array in data['text_array']:
    for word in text_array:
        total_text_array.append(word)

random.shuffle(total_text_array)    

for word in total_text_array:
    if word not in word_to_ix:
        word_to_ix[word] = len(word_to_ix)

word_to_ix

{'health': 0,
 'trump': 1,
 'best': 2,
 'one': 3,
 'despite': 4,
 'get': 5,
 'accommodate': 6,
 'thanks': 7,
 'capita': 8,
 'general': 9,
 'losses': 10,
 'religious': 11,
 'allied': 12,
 'last': 13,
 'furloughed': 14,
 'suspend': 15,
 'mistake': 16,
 'requiring': 17,
 'two': 18,
 'university': 19,
 'theu': 20,
 'driving': 21,
 'david': 22,
 'americas': 23,
 'after': 24,
 'killed': 25,
 'long': 26,
 'isnt': 27,
 'tax': 28,
 'river': 29,
 'jerseys': 30,
 'climate': 31,
 'time': 32,
 'ohios': 33,
 'increased': 34,
 'confined': 35,
 'staff': 36,
 'lower': 37,
 'grown': 38,
 'pay': 39,
 'job': 40,
 'people': 41,
 'providences': 42,
 'unemployment': 43,
 'opposed': 44,
 'there': 45,
 'taft': 46,
 'ii': 47,
 'adults': 48,
 'the': 49,
 'year': 50,
 'says': 51,
 'paid': 52,
 'infarction': 53,
 'running': 54,
 'maddox': 55,
 'gallon': 56,
 'regulations': 57,
 'candidate': 58,
 'rate': 59,
 'passing': 60,
 'changed': 61,
 'pace': 62,
 'goes': 63,
 'i': 64,
 'states': 65,
 'asthma': 66,
 'elementa

In [29]:
def text_tokenizer(x):
    tokens = []
    for text in x:
        token = word_to_ix[text]
        tokens.append(token)
    return tokens

data['text_token'] = data.loc[:,'text_array'].apply(text_tokenizer)
data.head(3)

Unnamed: 0,Statement,Label,cleaned_statement,text_array,text_token,label_one_hot
0,Says the Annies List political group supports ...,False,Says the Annies List political group supports ...,"[says, annies, list, political, group, support...","[51, 8621, 1107, 1291, 683, 1823, 1523, 5606, ...",0
1,When did the decline of coal start? It started...,True,When did the decline of coal start It started...,"[when, decline, coal, start, it, started, natu...","[1053, 2050, 1179, 619, 110, 232, 457, 114, 24...",1
2,"Hillary Clinton agrees with John McCain ""by vo...",True,Hillary Clinton agrees with John McCain by vo...,"[hillary, clinton, agrees, john, mccain, votin...","[107, 389, 3557, 546, 441, 302, 500, 881, 584,...",1


## Create One-hot Encoding

In [30]:
label_to_ix = {"True": 1, "False": 0}

def label_one_hot(x):
    return 1 if x else 0

data['label_one_hot'] = data.loc[:,'Label'].apply(label_one_hot)
data.head(3)

Unnamed: 0,Statement,Label,cleaned_statement,text_array,text_token,label_one_hot
0,Says the Annies List political group supports ...,False,Says the Annies List political group supports ...,"[says, annies, list, political, group, support...","[51, 8621, 1107, 1291, 683, 1823, 1523, 5606, ...",0
1,When did the decline of coal start? It started...,True,When did the decline of coal start It started...,"[when, decline, coal, start, it, started, natu...","[1053, 2050, 1179, 619, 110, 232, 457, 114, 24...",1
2,"Hillary Clinton agrees with John McCain ""by vo...",True,Hillary Clinton agrees with John McCain by vo...,"[hillary, clinton, agrees, john, mccain, votin...","[107, 389, 3557, 546, 441, 302, 500, 881, 584,...",1
