In [5]:
import pandas as pd
import numpy as np
import gc

import nltk
from nltk.corpus import stopwords
import re

In [6]:
train_path = "data/train.csv"
test_path = "data/test.csv"
submit_path = "data/submit.csv"

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_submit = pd.read_csv(submit_path)

df_train

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [7]:
df_test["label"] = df_submit["label"]

In [8]:
# Filter 
def filter(texts):
    clean_data_array = []
    i = 0
    for text in texts:
        text = str(text)
        filteredText = re.sub('[^A-Za-z]+', ' ', text)
        words = nltk.word_tokenize(filteredText)
        stopwords = nltk.corpus.stopwords.words("english")
        cleaned = [word for word in words if word not in stopwords]
        cleaned = [word.lower() for word in words if (word not in stopwords) and len(word) > 2]
        clean_data_array.append(cleaned)

    return clean_data_array

In [9]:
filtered_texts_train = filter(df_train["text"])

In [10]:
filtered_texts_test = filter(df_test["text"])

In [11]:
print(filtered_texts_train[0])

['house', 'dem', 'aide', 'didn', 'even', 'see', 'comey', 'letter', 'until', 'jason', 'chaffetz', 'tweeted', 'darrell', 'lucus', 'october', 'subscribe', 'jason', 'chaffetz', 'stump', 'american', 'fork', 'utah', 'image', 'courtesy', 'michael', 'jolley', 'available', 'creative', 'commons', 'license', 'with', 'apologies', 'keith', 'olbermann', 'doubt', 'worst', 'person', 'the', 'world', 'week', 'fbi', 'director', 'james', 'comey', 'but', 'according', 'house', 'democratic', 'aide', 'looks', 'like', 'also', 'know', 'second', 'worst', 'person', 'well', 'turns', 'comey', 'sent', 'infamous', 'letter', 'announcing', 'fbi', 'looking', 'emails', 'may', 'related', 'hillary', 'clinton', 'email', 'server', 'ranking', 'democrats', 'relevant', 'committees', 'hear', 'comey', 'they', 'found', 'via', 'tweet', 'one', 'republican', 'committee', 'chairmen', 'know', 'comey', 'notified', 'republican', 'chairmen', 'democratic', 'ranking', 'members', 'house', 'intelligence', 'judiciary', 'oversight', 'committees

In [12]:
print(len(filtered_texts_train))
print(len(df_train["text"]))

20800
20800


In [13]:
# Create dataframe from 2 vectors and filter
df_train = pd.DataFrame({'text': filtered_texts_train, 'label': df_train["label"]})
df_test = pd.DataFrame({'text': filtered_texts_test, 'label': df_test["label"]})


In [14]:
df_train_shorter = df_train[df_train['text'].str.len() > 10]
df_test_shorter = df_test[df_test['text'].str.len() > 10]

In [15]:
frames = [df_train_shorter, df_test_shorter]
df = pd.concat(frames, ignore_index=True)

In [26]:
# Create vocabulary
import json

vocabulary, index   = {}, 1
vocabulary["<pad>"] = 0 # padding token

for tokens in (df["text"]):
    for token in tokens:
        if token not in vocabulary:
            vocabulary[token] = index
            index += 1
    
vocabulary_size = len(vocabulary)

vocabulary
with open("vocabulary.json", 'w') as file:
    json.dumps(vocabulary, indent=4)

In [25]:
# Create inverse vocabulary


inverse_vocab = {index: token for token, index in vocabulary.items()}

inverse_vocab
with open("inverse_vocabulary.json", 'w') as file:
    json.dump(inverse_vocab, file, indent=4)

In [24]:
# Vectorize

vectorized_sequence = [vocabulary[word] for word in tokens]

vectorized_sequence

[431,
 3425,
 407,
 2386,
 673,
 1341,
 4814,
 7715,
 202,
 1013,
 309,
 2724,
 216,
 715,
 4796,
 38971,
 1170,
 2481,
 26147,
 58895,
 10665,
 2175,
 4122,
 11933,
 510,
 72,
 21787,
 202,
 36,
 10665,
 2724,
 4213,
 253,
 1283,
 1397,
 3275,
 2386,
 3608,
 255,
 809,
 46,
 4813,
 42140,
 10616,
 7632,
 34697,
 57,
 47,
 5945,
 2119,
 29898,
 7793,
 41926,
 514,
 332,
 7069,
 8450,
 1873,
 10514,
 5781,
 1837,
 2841,
 777,
 33013,
 267,
 23159,
 6945,
 1180,
 4473,
 6073,
 283,
 2724,
 1862,
 406,
 332,
 1102,
 1317,
 5376,
 3157,
 72,
 9616,
 1151,
 4988,
 92016,
 2724,
 25542,
 7440,
 741,
 5103,
 4522,
 532,
 1798,
 437,
 1176,
 2663,
 982,
 738,
 1000,
 4213,
 1681,
 9214,
 14672,
 9985,
 10666,
 49,
 3600,
 9616,
 927,
 2590,
 122,
 2724,
 1119,
 1130,
 4923,
 7715,
 35878,
 2154,
 180,
 437,
 7081,
 8168,
 5455,
 1815,
 4500,
 2783,
 15265,
 3045,
 8614,
 3156,
 277,
 36,
 127,
 10457,
 483,
 2161,
 861,
 3720,
 164098,
 7516,
 3158,
 9489,
 164099,
 822,
 1593,
 36,
 127,
 626

In [19]:
df_train_shorter.to_csv("train_data.csv", index=False)

In [20]:
df_test_shorter.to_csv("test_data.csv", index=False)