In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

## Load data

In [3]:
TRAIN_CSV_PATH = "./data/train.csv"
data = pd.read_csv(TRAIN_CSV_PATH, index_col=False)
data.head(3)

Unnamed: 0,Statement,Label
0,Says the Annies List political group supports ...,False
1,When did the decline of coal start? It started...,True
2,"Hillary Clinton agrees with John McCain ""by vo...",True


## Clean digits, punctuation and symbols

In [11]:
def clean_text(x):
    text = ''.join([i for i in x if not i.isdigit()])
    text = re.sub("[!\?@^&.,/#$+%*:()'\"-]", ' ', text)
    return text

data['cleaned_statement'] = data.loc[:,'Statement'].apply(clean_text)
data.head(3)

Unnamed: 0,Statement,Label,cleaned_statement,text_array,label_one_hot
0,Says the Annies List political group supports ...,False,Says the Annies List political group supports ...,"[says, annies, list, political, group, support...",0
1,When did the decline of coal start? It started...,True,When did the decline of coal start It started...,"[when, decline, coal, start?, it, started, nat...",1
2,"Hillary Clinton agrees with John McCain ""by vo...",True,Hillary Clinton agrees with John McCain by vo...,"[hillary, clinton, agrees, john, mccain, votin...",1


## Download NLTK Stopwords model

In [23]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LouisMM\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

## Clean stopwords and lowercase

In [12]:
def clean_stopwords(x):
    text = x.split() 
    filtered_words = [word.lower() for word in text if word not in stopwords.words('english')]
    return filtered_words

data['text_array'] = data.loc[:,'cleaned_statement'].apply(clean_stopwords)
data.head(3)

Unnamed: 0,Statement,Label,cleaned_statement,text_array,label_one_hot
0,Says the Annies List political group supports ...,False,Says the Annies List political group supports ...,"[says, annies, list, political, group, support...",0
1,When did the decline of coal start? It started...,True,When did the decline of coal start It started...,"[when, decline, coal, start, it, started, natu...",1
2,"Hillary Clinton agrees with John McCain ""by vo...",True,Hillary Clinton agrees with John McCain by vo...,"[hillary, clinton, agrees, john, mccain, votin...",1


## Create word embedding from data

In [13]:
word_to_ix = {}

for text_array in data['text_array']:
    for word in text_array:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

word_to_ix

{'says': 0,
 'annies': 1,
 'list': 2,
 'political': 3,
 'group': 4,
 'supports': 5,
 'third': 6,
 'trimester': 7,
 'abortions': 8,
 'demand': 9,
 'when': 10,
 'decline': 11,
 'coal': 12,
 'start': 13,
 'it': 14,
 'started': 15,
 'natural': 16,
 'gas': 17,
 'took': 18,
 'begin': 19,
 'president': 20,
 'george': 21,
 'w': 22,
 'bushs': 23,
 'administration': 24,
 'hillary': 25,
 'clinton': 26,
 'agrees': 27,
 'john': 28,
 'mccain': 29,
 'voting': 30,
 'give': 31,
 'bush': 32,
 'benefit': 33,
 'doubt': 34,
 'iran': 35,
 'health': 36,
 'care': 37,
 'reform': 38,
 'legislation': 39,
 'likely': 40,
 'mandate': 41,
 'free': 42,
 'sex': 43,
 'change': 44,
 'surgeries': 45,
 'the': 46,
 'economic': 47,
 'turnaround': 48,
 'end': 49,
 'term': 50,
 'chicago': 51,
 'bears': 52,
 'starting': 53,
 'quarterbacks': 54,
 'last': 55,
 'years': 56,
 'total': 57,
 'number': 58,
 'tenured': 59,
 'uw': 60,
 'faculty': 61,
 'fired': 62,
 'two': 63,
 'decades': 64,
 'jim': 65,
 'dunnam': 66,
 'lived': 67,
 'd

## Create One-hot Encoding

In [14]:
label_to_ix = {"True": 1, "False": 0}

def label_one_hot(x):
    return 1 if x else 0

data['label_one_hot'] = data.loc[:,'Label'].apply(label_one_hot)
data.head(3)

Unnamed: 0,Statement,Label,cleaned_statement,text_array,label_one_hot
0,Says the Annies List political group supports ...,False,Says the Annies List political group supports ...,"[says, annies, list, political, group, support...",0
1,When did the decline of coal start? It started...,True,When did the decline of coal start It started...,"[when, decline, coal, start, it, started, natu...",1
2,"Hillary Clinton agrees with John McCain ""by vo...",True,Hillary Clinton agrees with John McCain by vo...,"[hillary, clinton, agrees, john, mccain, votin...",1
