In [None]:
import re
import email
import os
from stop_words import get_stop_words
from gensim import corpora, models, utils
from nltk.stem.porter import PorterStemmer
from nltk import clean_html
from bs4 import BeautifulSoup
from collections import defaultdict

import pyLDAvis
import pyLDAvis.gensim

In [None]:
# Processing data definition
# Porter stemmer used later to reduce every token to its stem
p_stemmer = PorterStemmer()
# Copy the list before extending it: get_stop_words() may hand back a shared
# (cached) list object, in which case appending in place would add the extra
# words again every time this cell is re-run.
en_stop = list(get_stop_words('en'))
# Corpus-specific noise: URL fragments and the image placeholder.
# NOTE(review): tokens are produced later by gensim's simple_preprocess, which
# lowercases them, so the "[IMAGE]" entry probably never matches - confirm.
en_stop.extend(["com", "www", "[IMAGE]"])



In [3]:
# Load every inbox email and extract its body
root_dir = os.path.join("C:", os.sep, "dataset", "enron-dataset")
# Every sub-directory of root_dir is treated as one author's mailbox
author_name = [
    os.path.join(root_dir, entry)
    for entry in os.listdir(root_dir)
    if os.path.isdir(os.path.join(root_dir, entry))
]

# Maps email file path -> message payload as returned by get_payload()
msg = {}
for auth_path in author_name:
    inbox_path = os.path.join(auth_path, "inbox")
    # Mailboxes without an inbox folder are skipped explicitly, instead of
    # hiding every possible error behind a bare `except: pass`.
    if not os.path.isdir(inbox_path):
        continue
    email_file_name = [
        os.path.join(inbox_path, entry)
        for entry in os.listdir(inbox_path)
        if os.path.isfile(os.path.join(inbox_path, entry))
    ]
    for email_path in email_file_name:
        try:
            # errors="replace" keeps a single undecodable byte from aborting the load
            with open(email_path, errors="replace") as fp:
                message = email.message_from_file(fp)
        except OSError as err:
            # Report and skip the unreadable file; the rest of the inbox is still loaded
            print("Skipping {}: {}".format(email_path, err))
            continue
        # Public accessor instead of the private _payload attribute
        msg[email_path] = message.get_payload()

# Stable list of paths so that emails can be addressed by position
key_list = list(msg.keys())

In [7]:
# Headers of the last email read by the loading loop, as (name, value) pairs.
# items() is the public accessor; _headers is a private attribute.
message.items()

[('Message-ID', '<692316.1075842025145.JavaMail.evans@thyme>'),
 ('Date', 'Wed, 6 Feb 2002 11:57:35 -0800 (PST)'),
 ('From', 'danielle.marcinkowski@enron.com'),
 ('To', 'john.zufferli@enron.com'),
 ('Subject', 'IHS Accumap'),
 ('Cc', 'dan.dietrich@enron.com'),
 ('Mime-Version', '1.0'),
 ('Content-Type', 'text/plain; charset=us-ascii'),
 ('Content-Transfer-Encoding', '7bit'),
 ('Bcc', 'dan.dietrich@enron.com'),
 ('X-From',
  'Marcinkowski, Danielle </O=ENRON/OU=NA/CN=RECIPIENTS/CN=DMARCIN>'),
 ('X-To', 'Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Jzuffer>'),
 ('X-cc', 'Dietrich, Dan </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Ddietri>'),
 ('X-bcc', ''),
 ('X-Folder', '\\ExMerge - Zufferli, John\\Inbox'),
 ('X-Origin', 'ZUFFERLI-J'),
 ('X-FileName', 'john zufferli 6-26-02.PST')]

In [9]:
# Body of the last email read by the loading loop (public accessor instead of _payload)
print(message.get_payload())

John:

Do you need Accumap day one?  Carmen said that it had not been renewed so do you have access to it now?  
If you do need an account how many and for who?

thanks,

Danielle 


In [10]:
# Show the body of one loaded email, picked by its position in key_list
sample_path = key_list[46]
print(msg[sample_path])


Team
FYI

Sheri
The 3 SME's that have already committed to being on film need to be keep in the lope regarding the timeline. Also check with these 2 SME's for additional candidates (i.e. check with Mark Reese to see if Lamar Frazier would be interested). I would like to be conferenced in on the media calls & meetings. Also cc me on their emails or forward to me  their  emails. This will keep me in the loop. Thanks

Phillip
If you have some additional people in mind, we need them identified as soon as possible. Thanks

Cheers
Kirk
---------------------- Forwarded by Kirk McDaniel/HOU/EES on 11/27/2001 10:04 AM ---------------------------


sheri.a.righi@accenture.com on 11/27/2001 09:09:46 AM
To:	kmcdani@enron.com
cc:	donald.l.barnhart@accenture.com 
Subject:	RE: Updates to our Video Production Timeframes and Scope


Kirk -

Thank you for helping us work towards sign-off from Enron Legal. Seeing as
you haven't been closely involved, I thought you'd like an update on our
media production

In [None]:
# Message pre-processing: drop quoted replies, URLs, HTML markup and image placeholders

cleaned_msg = {}
max_count = 100  # only the first max_count messages are cleaned
count = 0
reply_marker = "-----Original Message-----"
# The loop variable is called `body` so that it does not overwrite the
# `message` object that the header/payload inspection cells rely on.
for key, body in msg.items():
    # Stop once exactly max_count messages are done (checking after the
    # increment with `>` let one extra message through).
    if count >= max_count:
        break
    # Keep only the text above the quoted original message. partition() returns
    # the whole body when the marker is absent; slicing with find() used its -1
    # "not found" result as an end index and dropped the last character.
    clean_body = body.partition(reply_marker)[0]
    # Remove all URL links
    body_linkless = re.sub(r"http\S+", "", clean_body)
    # Remove html code; the parser is named explicitly so the result does not
    # depend on which optional parsers happen to be installed
    soup = BeautifulSoup(body_linkless, "html.parser")
    body_linkless_html = soup.get_text()
    # Remove [IMAGE] placeholders
    body_linkless_html_image = body_linkless_html.replace("[IMAGE]", "")
    cleaned_msg[key] = body_linkless_html_image
    count += 1

In [None]:
# Number of messages that went through the cleaning step
len(cleaned_msg)

In [None]:
# Print every cleaned message under its file path, with a separator after each one
separator = "-" * 100
for path in cleaned_msg:
    body = cleaned_msg[path]
    print(path)
    print(body)
    print("OTHER EMAIL _______________________________")
    print(separator)

In [None]:
def word_frequency_filter(texts, count_threshold):
    """Drop rare tokens from a tokenized corpus.

    Token counts are taken over the whole corpus (all texts together), not
    per document.

    Args:
        texts: iterable of documents, each one an iterable of hashable tokens.
            One-shot iterables such as generators are accepted.
        count_threshold: a token is kept only when its corpus-wide count is
            strictly greater than this value (tokens occurring exactly
            ``count_threshold`` times are removed as well).

    Returns:
        A list with one list of surviving tokens per input document, in the
        original document and token order.
    """
    # Two passes are needed (count, then filter), so materialize the input
    # first; otherwise a generator would be exhausted by the counting pass
    # and the result would silently be empty.
    documents = [list(text) for text in texts]

    frequency = defaultdict(int)
    for document in documents:
        for token in document:
            frequency[token] += 1

    return [
        [token for token in document if frequency[token] > count_threshold]
        for document in documents
    ]

In [None]:
# Text preprocess: tokenize, drop stop words, stem, then drop rare tokens
min_token_count = 2  # tokens occurring this many times or fewer in the corpus are dropped
stop_word_set = set(en_stop)  # constant-time membership test; en_stop itself is a list

stemmed_msg = {}
raw_message = []
for key, document in cleaned_msg.items():
    # simple_preprocess already lowercases, so no explicit .lower() is needed
    tokens = utils.simple_preprocess(document)
    # remove stop words from tokens
    stopped_tokens = [token for token in tokens if token not in stop_word_set]
    # stem tokens
    stemmed_msg[key] = [p_stemmer.stem(token) for token in stopped_tokens]
    raw_message.append(document)

# Rare-token filtering needs the whole corpus at once. The filter used to be
# called inside the loop on one flat token list, so it iterated over the
# characters of each token, and its result was never stored.
filtered_tokens = word_frequency_filter(list(stemmed_msg.values()), min_token_count)
processed_msg = dict(zip(stemmed_msg.keys(), filtered_tokens))

# Show the final token list of every message
for key, stemmed_tokens in processed_msg.items():
    print(key)
    print(stemmed_tokens)
    print("-"*100)


In [None]:
# Build the id <-> term mapping from all tokenized emails
tokenized_emails = list(processed_msg.values())
dictionary = corpora.Dictionary(tokenized_emails)

In [None]:
number_topic = 7
lda_seed = 42  # fixed seed so that a re-run reproduces the same topics
# convert tokenized documents into a bag-of-words document-term matrix
corpus = [dictionary.doc2bow(text) for text in processed_msg.values()]

# generate LDA model
ldamodel = models.ldamodel.LdaModel(
    corpus,
    num_topics=number_topic,
    id2word=dictionary,
    passes=100,
    random_state=lda_seed,
)

# Top 10 words of every topic
ldamodel.print_topics(num_topics=number_topic, num_words=10)

In [None]:
# Report the most probable topic of every document.
# Each item of ldamodel[corpus] is a list of (topic_id, probability) pairs that
# is not sorted by probability, so the first pair is not necessarily the
# dominant topic; select the pair with the highest probability instead.
for count, doc_topics in enumerate(ldamodel[corpus]):
    top_topic, top_probability = max(doc_topics, key=lambda pair: pair[1])
    print("document number : {}".format(count))
    print("Topics number      : ", top_topic)
    print(ldamodel.print_topic(top_topic))
    print("similarity index : {}".format(top_probability))
    print("------------------------------------------------------\n")

In [None]:
# Print every cleaned email next to its most probable topic.
# The (topic_id, probability) pairs of a document are not sorted by
# probability, so the dominant pair is selected with max() rather than by
# taking the first element.
# NOTE(review): key_list[count] assumes corpus order matches key_list order - confirm.
for count, doc_topics in enumerate(ldamodel[corpus]):
    dominant = max(doc_topics, key=lambda pair: pair[1])
    print("document number : {}".format(count))
    print(cleaned_msg[key_list[count]])
    print("Topics number      : ", dominant)
    print(ldamodel.print_topic(dominant[0]))
    print("similarity index : {}".format(dominant[1]))
    print("------------------------------------------------------\n")

In [None]:
# Interactive pyLDAvis view of the fitted topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

In [None]:
# Topic distribution of the first document in the corpus
ldamodel.get_document_topics(corpus[0])

In [None]:
# Raw topic matrix of the model (presumably one row per topic, one column per term - confirm)
ldamodel.get_topics()

In [None]:
# Topics with their top words, using show_topics() default arguments
ldamodel.show_topics()

In [None]:
# get_topic_terms() requires the id of the topic to describe; calling it with
# no argument raises TypeError. Show the top terms of the first topic.
ldamodel.get_topic_terms(topicid=0)

In [None]:
# Print only the emails whose most probable topic is `topic`.
# The (topic_id, probability) pairs of a document are not sorted by
# probability, so the dominant pair is selected with max() rather than by
# taking the first element.
# NOTE(review): key_list[count] assumes corpus order matches key_list order - confirm.
topic = 0
for count, doc_topics in enumerate(ldamodel[corpus]):
    dominant = max(doc_topics, key=lambda pair: pair[1])
    if dominant[0] == topic:
        print("document number : {}".format(count))
        print(cleaned_msg[key_list[count]])
        print("Topics number      : ", dominant)
        print(ldamodel.print_topic(dominant[0]))
        print("similarity index : {}".format(dominant[1]))
        print("------------------------------------------------------\n")

In [12]:
# Toy mapping with a single column of mixed-case strings
aa = dict(col=["a", "b", "c", "D", "EEE"])
aa

{'col': ['a', 'b', 'c', 'D', 'EEE']}

In [13]:
import pandas as pd

In [20]:
# Wrap the toy mapping in a DataFrame (one column named "col")
bb = pd.DataFrame(data=aa)
bb

Unnamed: 0,col
0,a
1,b
2,c
3,D
4,EEE


In [21]:
def ll(s):
    """Return the result of ``s.lower()``."""
    lowered = s.lower()
    return lowered


édfjdkdjdé


In [25]:
# Vectorized lower-casing of the column; the result is displayed, not assigned back to bb
bb["col"].str.lower()

0      a
1      b
2      c
3      d
4    eee
Name: col, dtype: object

In [26]:
# Display bb again: the previous .str.lower() call was not assigned, so bb is unchanged
bb

Unnamed: 0,col
0,a
1,b
2,c
3,D
4,EEE
