# Preprocessing

## Reading into DataFrame

In [160]:
import numpy as np
import pandas as pd
import os
import codecs
import re

In [151]:
def cleanText(text):
    # This function takes in a text string and cleans it 
    # by keeping only alphanumeric and common punctuations
    # Returns the cleaned string
    clean_text = text.replace('\n','').replace('\r','')
    clean_text = re.sub(r'[^a-zA-Z0-9.:!? ]','',clean_text)
    return clean_text

In [152]:
def readEmail(path):
    # This function takes a path to an email text file
    # Returns a tuple of the subject line and the body text
    with codecs.open(path, "r",encoding='utf-8', errors='ignore') as f:
        subject = cleanText(f.readline()[9:])
        body = cleanText(f.read())
        return [subject, body]

In [153]:
subjects = []
bodys = []
spam = []
# read hams
for filename in os.listdir("enron1/ham"):
    if filename.endswith(".txt"):
        subject, body = readEmail("enron1/ham/"+filename)
        subjects.append(subject)
        bodys.append(body)
        spam.append(0)
# read spams
for filename in os.listdir("enron1/spam"):
    if filename.endswith(".txt"):
        subject, body = readEmail("enron1/spam/"+filename)
        subjects.append(subject)
        bodys.append(body)
        spam.append(1)

In [154]:
data = pd.DataFrame()
data['subject'] = subjects
data['body'] = bodys
data['spam'] = spam

In [155]:
data.head()

Unnamed: 0,subject,body,spam
0,christmas tree farm pictures,,0
1,vastar resources inc .,gary production from the high island larger b...,0
2,calpine daily gas nomination,calpine daily gas nomination 1 . doc,0
3,re : issue,fyi see note below already done .stella ...,0
4,meter 7268 nov allocation,fyi . forwarded by lauri ...,0


In [156]:
# tokenize
from nltk.tokenize import WordPunctTokenizer
import string
tokenizer = WordPunctTokenizer()
new_body = []
for body in data['body']:
    wtks = tokenizer.tokenize(body)
    new_body.append([wtk for wtk in wtks if wtk not in string.punctuation])

In [176]:
data['tokens'] = new_body
data['idx'] = np.arange(data.shape[0])
data.head()

Unnamed: 0,subject,body,spam,tokens,idx
0,christmas tree farm pictures,,0,[],0
1,vastar resources inc .,gary production from the high island larger b...,0,"[gary, production, from, the, high, island, la...",1
2,calpine daily gas nomination,calpine daily gas nomination 1 . doc,0,"[calpine, daily, gas, nomination, 1, doc]",2
3,re : issue,fyi see note below already done .stella ...,0,"[fyi, see, note, below, already, done, stella,...",3
4,meter 7268 nov allocation,fyi . forwarded by lauri ...,0,"[fyi, forwarded, by, lauri, a, allen, hou, ect...",4


In [185]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from gensim import utils
import gensim.utils
from sklearn.linear_model import LogisticRegression

In [195]:
sentences = data.apply(lambda row : LabeledSentence(row['tokens'],[row['spam']]), axis = 1)
d2v_model = Doc2Vec(min_count = 1, window = 10, size=100, sample=1e-4, negative=5, workers=8)

In [197]:
d2v_model.build_vocab(sentences)

In [199]:
for epoch in range(10):
    d2v_model.train(sentences.reindex(np.random.permutation(sentences.index)))

In [200]:
d2v_model.save('test.d2v')

In [212]:
d2v_model.most_similar('hurry')

[('now', 0.4317642152309418),
 ('discount', 0.4178639054298401),
 ('store', 0.39517587423324585),
 ('biggest', 0.3790737986564636),
 ('load', 0.37169861793518066),
 ('spoil', 0.367867112159729),
 ('a', 0.36497950553894043),
 ('hotel', 0.36494749784469604),
 ('site', 0.3580681383609772),
 ('visit', 0.35701173543930054)]