# Assessment 3

Compare without reply/forward

In [1]:
import glob
import os
import numpy as np
import pandas as pd
import math

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
np.random.seed(2021)

import nltk

First we need to import the data. References 1 & 2 introduced me to glob which then allows us to import all the text documents in one smooth process. We do this below.

In [2]:
from pathlib import Path

corpus = []

for path in Path(r'C:\Users\corri\OneDrive\Documents\GitHub\DST-Assessment-3\Data\All enron unzip').rglob('*.txt'):
        with open(path, errors='ignore') as f_input:
            corpus.append(f_input.read())

In [3]:
df = pd.DataFrame(corpus)

In [4]:
df.head()

Unnamed: 0,0
0,Legitimate\n----------\n- Owner: farmer-d\n- T...
1,Subject: christmas tree farm pictures\n
2,"Subject: vastar resources , inc .\ngary , prod..."
3,Subject: calpine daily gas nomination\n- calpi...
4,Subject: re : issue\nfyi - see note below - al...


In [5]:
print(len(df))

27721


While importing the files, we've also imported the summary sheets along with them. These need to be removed as they wont contribute to our overall corpus and we do this below along with reindexing the data set (we could just remove them from the data but this is easy).

In [6]:
df = df.drop([0,5173,11031,16544,21720]) # removing unnecessary entries
nlist = np.arange(0,27717,1) # reindexing
df = df.reindex(nlist)
df = df.drop(0)
df.columns = ['E-mails']

In [7]:
df.head()

Unnamed: 0,E-mails
1,Subject: christmas tree farm pictures\n
2,"Subject: vastar resources , inc .\ngary , prod..."
3,Subject: calpine daily gas nomination\n- calpi...
4,Subject: re : issue\nfyi - see note below - al...
5,Subject: meter 7268 nov allocation\nfyi .\n- -...


We're going to assign a binary variable for spam vs normal data to our data set to allow us to perform performance analysis later on. This is done simply since we know where the splits will be for spam/normal data will be in our data frame and we do this below by first creating a list of the positions where spam emails are found and then assigning the binary values to our data frame.

In [8]:
z1 = np.arange(3673,5173)
z2 = np.arange(9534,11031)
z3 = np.arange(15043,16544)
z4 = np.arange(18043,21719)
z5 = np.arange(23220,27716)
ze = np.concatenate((z1,z2,z3,z4,z5))

In [9]:
z = np.zeros(27716)
df['spam'] = z
for i in ze:
    df.iloc[i,1] = 1

In [10]:
df.head()

Unnamed: 0,E-mails,spam
1,Subject: christmas tree farm pictures\n,0.0
2,"Subject: vastar resources , inc .\ngary , prod...",0.0
3,Subject: calpine daily gas nomination\n- calpi...,0.0
4,Subject: re : issue\nfyi - see note below - al...,0.0
5,Subject: meter 7268 nov allocation\nfyi .\n- -...,0.0


In [11]:
df_temp = df

In [12]:
for i in range(0,9):
    if i == 8:
        a = df_temp
        locals()['df_{}'.format(i)] = a
    else:
        a = df_temp.sample(n = 2771)
        locals()['df_{}'.format(i)] = a
        l = list(a.index.values)
        df_temp = df_temp.drop(l)

In [13]:
df_test = df_temp

frames=[df_0, df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8]
df_train = pd.concat(frames)

### Pre-Processing for LDA Model

In this section we perform pre-processing for our model. This involves techniques such as removing common words, symbols and punctuation. For sake of comparison, we print the first entry to determine the impact the techniques have.

In [14]:
print(df_train.iloc[[1,0]].values)

[['Subject: unfaithful bitches\nwives looking for fun\nsearch no more ! compatible sxex partners are here …\nlonely married wives looking for an adventure\nhttp : / / www . hottyplay . com / spi / date . php\n* membership is required in order to verify legal age , only $ 1\nif you have found a lady or not to be paired up then continue\nhere\n'
  1.0]
 ['Subject: january giveaway\nspecial offfer for m ! cros 0 ft wlndows xp home\nwe might have just what you need :\nwlndows x . p pro + office x * p pro for as low as 8 o $\nplease 0 . r . d . e . r here\nthe offer is valid till february 14 th\nstock is limited\nupdate your info\ncherie moseley\nmanualtherapist\nmurinus gmbh , 20251 hamburg , germany\nphone : 684 - 363 - 4883\nmobile : 117 - 448 - 6918\nemail : stneojugjcsd @ attorney . com\nthis is a confirmation message\nthis software is a 14 minute complementary freeware\nnotes :\nthe contents of this information is for attention and should not be downcast ashman\ngantlet dragnet obstac

In [15]:
en_stop = set(nltk.corpus.stopwords.words('english'))

In [16]:
import nltk
from nltk.corpus import wordnet

lmtzr = nltk.WordNetLemmatizer().lemmatize

## We lookup whether a word is and adjective, verb, noun or adverb here.
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

    
## This version uses word type. Needs the bigger nltp download ("popular")
def normalize_text(text):
    ## Runs on documents (vector of words)
    word_pos = nltk.pos_tag(nltk.word_tokenize(text))
    lemm_words = [lmtzr(sw[0], get_wordnet_pos(sw[1])) for sw in word_pos]

    return [x.lower() for x in lemm_words]

## This version doesn't require the "popular" download
def preprocess(text):
    ## Runs on documents (vector of words)
    lemmatizer = nltk.WordNetLemmatizer()
    return([lemmatizer.lemmatize(i) for i in text.split()])

################
## wordnet version
from nltk.corpus import wordnet as wn
def get_lemma(word):
    ## morphy does a lemma lookup and word standardization
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

## lemmatize
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

## This version is for comparison
def prepare_text_for_lda(text):
    ## Runs on documents (vector of words)
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [17]:
from gensim import parsing
doc_sample = df_train.iloc[1].values[0]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(normalize_text(doc_sample))
print('\n\n simpler tokenized and lemmatized document: ')
print(preprocess(doc_sample))
print('\n\n method removing stop words: ')
print(prepare_text_for_lda(doc_sample))

original document: 
['Subject:', 'unfaithful', 'bitches\nwives', 'looking', 'for', 'fun\nsearch', 'no', 'more', '!', 'compatible', 'sxex', 'partners', 'are', 'here', '…\nlonely', 'married', 'wives', 'looking', 'for', 'an', 'adventure\nhttp', ':', '/', '/', 'www', '.', 'hottyplay', '.', 'com', '/', 'spi', '/', 'date', '.', 'php\n*', 'membership', 'is', 'required', 'in', 'order', 'to', 'verify', 'legal', 'age', ',', 'only', '$', '1\nif', 'you', 'have', 'found', 'a', 'lady', 'or', 'not', 'to', 'be', 'paired', 'up', 'then', 'continue\nhere\n']


 tokenized and lemmatized document: 
['subject', ':', 'unfaithful', 'bitch', 'wife', 'look', 'for', 'fun', 'search', 'no', 'more', '!', 'compatible', 'sxex', 'partner', 'be', 'here', '…', 'lonely', 'marry', 'wife', 'look', 'for', 'an', 'adventure', 'http', ':', '/', '/', 'www', '.', 'hottyplay', '.', 'com', '/', 'spi', '/', 'date', '.', 'php', '*', 'membership', 'be', 'require', 'in', 'order', 'to', 'verify', 'legal', 'age', ',', 'only', '$', '1', 

In [18]:
#processed_text = df_train['E-mails'].map(prepare_text_for_lda)
#processed_text.head()

When running the above code, we identify entries in the code that are classified as NaN. The pre-processor cannot handle these exceptions so we remove them as they are immaterial to the final result anyway.

In [19]:
place = []

for j in range(1,len(df_train)):
    try:
        a = df_train.iloc[j,0]
        prepare_text_for_lda(a)
    except Exception:
       place.append(j) 

for j in place:
    df_train.iloc[j,0] = ""

In [20]:
processed_text = df_train['E-mails'].map(prepare_text_for_lda) 
processed_text.head()

15773    [Subject, january, giveaway, special, offfer, ...
25178    [Subject, unfaithful, bitch, wife, looking, se...
3695          [Subject, price, sildenafil, citrate, today]
4894     [Subject, guillermo, budget, today, glory, bec...
13018    [Subject, competitive, analysis, update, respo...
Name: E-mails, dtype: object

In [21]:
dictionary = gensim.corpora.Dictionary(processed_text)

count = 0
for k,v  in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 20251
1 Subject
2 ashman
3 attention
4 attorney
5 cherie
6 complementary
7 confirmation
8 contents
9 downcast
10 dragnet


In [22]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_text]

### LDA Model

In [23]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics = 20, id2word=dictionary, passes=2, workers = 2)

In [24]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [25]:
lda_display = pyLDAvis.gensim.prepare(lda_model, bow_corpus, 
                                          dictionary, mds='mmds')

  and should_run_async(code)


In [26]:
pyLDAvis.display(lda_display, template_type='notebook')

  and should_run_async(code)


### Perplexity and Coherence

In [27]:
from gensim.models.coherencemodel import CoherenceModel
def getCoherence(m,c,d):
    coherence_model_lda = CoherenceModel(model=m,corpus=c, dictionary=d, coherence='u_mass')
    coherence_lda = coherence_model_lda.get_coherence()
    return(coherence_lda)

  and should_run_async(code)


In [28]:
print("lda_model perplexity: {}".format(lda_model.log_perplexity(bow_corpus[0:1000])))
print("lda_model coherence: {}".format(getCoherence(lda_model,bow_corpus[0:1000],dictionary)))

  and should_run_async(code)


lda_model perplexity: -12.445476062593611
lda_model coherence: -2.5721437190333063


### References

1. [Loading .txt files](https://stackoverflow.com/questions/42407976/loading-multiple-text-files-from-a-folder-into-a-python-list-variable)
2. [Using Glob to retrieve the data](https://stackoverflow.com/questions/2186525/how-to-use-glob-to-find-files-recursively)