In [1]:
import re
import os
from stop_words import get_stop_words
from gensim import corpora, models, utils
from nltk.stem.porter import PorterStemmer
from nltk import clean_html
from bs4 import BeautifulSoup
from collections import defaultdict

import pyLDAvis
import pyLDAvis.gensim

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from upskills.email_classification import preprocessing

In [3]:
# Read emails from csv
#
# The location defaults to the original workstation path but can be overridden
# with the EMAIL_CSV_PATH environment variable, so the notebook can also run on
# a machine that does not have this drive layout.
default_file_name = os.path.join("C:", os.sep, "dataset", "personnal_emails", "email_perso.csv")
file_name = os.environ.get("EMAIL_CSV_PATH", default_file_name)

# The export is decoded as latin1.
full_data = pd.read_csv(file_name, encoding='latin1')

# Columns used by the rest of the notebook.
body = full_data["Body"]
subjects = full_data["Subject"]
print(full_data)

                                              Subject         Date Sent  \
0                                Welcome to Upskills!  14/05/2019 09:04   
1   Reminder - Upskills Brown Bag Session - Confir...  13/05/2019 11:53   
2   Canceled: Upskills Training - Fundamental Revi...  13/05/2019 09:33   
3      You've been added to a team in Microsoft Teams  14/05/2019 11:41   
4            Asset Delivery Sign-off, action required  14/05/2019 12:14   
5                  Scb agrees to award us the project  14/05/2019 16:04   
6              RE: Scb agrees to award us the project  14/05/2019 16:19   
7                                   RE: OCBC tomorrow  15/05/2019 17:00   
8                                   Azure Credentials  17/05/2019 10:06   
9                               Re: Azure Credentials  17/05/2019 10:38   
10                                     FW: Case Study  17/05/2019 11:29   
11                              Re: Azure Credentials  17/05/2019 12:07   
12                       

In [4]:
# Clean subjects : remove RE and FWD prefixes, then de-duplicate
sub = []
# Iterate over the values themselves. Looping over index *labels* and reading
# them back with the positional .iloc only lines up when the index is the
# default 0..n-1 range.
for subject in subjects:
    print(subject)
    normalized_subject = preprocessing.remove_prefix(subject)
    print(normalized_subject)
    # Keep only subjects that are non-empty once the prefix is removed.
    if len(normalized_subject) >= 1:
        sub.append(normalized_subject)
        print("___________")

unique_subjects = list(set(sub))
print(set(sub))

Welcome to Upskills!
Welcome to Upskills!
___________
Reminder - Upskills Brown Bag Session - Confirmation Matcher
Reminder - Upskills Brown Bag Session - Confirmation Matcher
___________
Canceled: Upskills Training - Fundamental Review of Trading Book (FRTB) Part 2
Canceled: Upskills Training - Fundamental Review of Trading Book (FRTB) Part 2
___________
You've been added to a team in Microsoft Teams
You've been added to a team in Microsoft Teams
___________
Asset Delivery Sign-off, action required
Asset Delivery Sign-off, action required
___________
Scb agrees to award us the project
Scb agrees to award us the project
___________
RE: Scb agrees to award us the project
Scb agrees to award us the project
___________
RE: OCBC tomorrow
OCBC tomorrow
___________
Azure Credentials
Azure Credentials
___________
Re: Azure Credentials
Azure Credentials
___________
FW: Case Study
Case Study
___________
Re: Azure Credentials
Azure Credentials
___________
Re: Azure Credentials
Azure Credentials


In [5]:
# Number of distinct subjects left after prefix removal.
len(unique_subjects)

38

In [6]:
# Map every row label to its email body.
# Series.items() pairs each label with its own value; reading the label back
# through the positional .iloc is only correct for a default 0..n-1 index.
msg = {row_id: text for row_id, text in body.items()}

# Row labels in insertion order.
key_list = list(msg.keys())

In [7]:
# Display the raw bodies keyed by row label (full dump of the email text).
msg

{0: 'Dear Aurelien,  We warmly welcome you at Upskills and believe we will have some extraordinary journey together!  Please find below some useful information:  \x95 Confidentiality of Data: please be reminded that the information you might be given access to can be confidential and restricted.  Always be mindful of your obligations in regards of accessing, transmitting or otherwise using confidential/proprietary information, whether it be Upskills internal, or from one of the customers you are assigned to.  In particular never attempt to copy, transmit, or otherwise extract any information(data, programs, code, documents, etc..) to outside of the organization, in any form, without prior written approval from your management and/or the customer.  Do refer to the employee handbook for more details, and take note of the possible severe consequences in terms of breach.  \x95 Passwords: Please make sure to keep your passwords safe and secure and do not share with anyone (except if request

In [8]:
# Processing data definition
# Porter stemmer used to reduce tokens to their stem (lemmatization is a
# possible alternative).
p_stemmer = PorterStemmer()

# English stop words plus email/URL noise.
# A fresh list is built instead of appending to the object returned by
# get_stop_words, so re-running this cell cannot accumulate duplicates should
# the library hand back a shared list.
# "image" is what the "[IMAGE]" placeholder becomes once the text has been
# lower-cased and tokenized by simple_preprocess; the bracketed literal is
# kept for compatibility but is not expected to match a tokenized word.
en_stop = list(get_stop_words('en')) + ["com", "www", "[IMAGE]", "image"]

In [9]:
# Message pre processing
cleaned_msg = {}
minimal_character = 70  # bodies of this many characters or fewer are dropped
max_count = 70          # maximum number of cleaned messages to keep
count = 0

# Email specific expressions to remove.
# The image placeholder is escaped: written as "[IMAGE]" it would be a regex
# character class matching any single one of the letters I, M, A, G, E.
# NOTE(review): assumes remove_regex_patterns applies each entry as a regular
# expression - confirm against the helper.
patterns = ["CONFIDENTIAL NOTE.*", "_", "(Get|Sent).*<", r"\[IMAGE\]"]

for message in msg.values():
    # Empty CSV cells are read by pandas as NaN (a float); skip anything that
    # is not text instead of letting re.sub raise a TypeError.
    if not isinstance(message, str):
        continue

    # Remove all URL links
    body_linkless = re.sub(r"http\S+", "", message)
    # Remove html code
    body_linkless = preprocessing.remove_html_from_body(body_linkless)
    # Remove email redundant fields
    body_linkless = preprocessing.remove_previous_mail_adress(body_linkless)
    # Remove email subject abbreviations
    body_linkless = preprocessing.remove_prefix(body_linkless)
    # Remove other specific email parts through regex
    body_linkless = preprocessing.remove_regex_patterns(body_linkless, patterns)

    # Save only if the cleaned body is longer than the minimal character count
    if len(body_linkless) > minimal_character:
        cleaned_msg[count] = body_linkless
        count += 1
        # Stop once exactly max_count messages are stored; a strict ">" here
        # would let one extra message through.
        if count >= max_count:
            break

In [10]:
# Dump every cleaned message: its key, its text, then a separator line.
for key, mes in cleaned_msg.items():
    print(key, mes, "-"*100, sep="\n")

0
Dear Aurelien,  We warmly welcome you at Upskills and believe we will have some extraordinary journey together!  Please find below some useful information:   Confidentiality of Data: please be reminded that the information you might be given access to can be confidential and restricted.  Always be mindful of your obligations in regards of accessing, transmitting or otherwise using confidential/proprietary information, whether it be Upskills internal, or from one of the customers you are assigned to.  In particular never attempt to copy, transmit, or otherwise extract any information(data, programs, code, documents, etc..) to outside of the organization, in any form, without prior written approval from your management and/or the customer.  Do refer to the employee handbook for more details, and take note of the possible severe consequences in terms of breach.   Passwords: Please make sure to keep your passwords safe and secure and do not share with anyone (except if requested by aut

In [None]:
def word_frequency_filter(texts, count_threshold):
    """Drop rare tokens from a tokenized corpus.

    Occurrences are counted across all of ``texts``; a token is kept only when
    its total count is strictly greater than ``count_threshold``.

    Args:
        texts: collection of documents, each a collection of hashable tokens.
            It is traversed twice, so it must be re-iterable.
        count_threshold: tokens occurring this many times or fewer are removed.

    Returns:
        A list holding one list per input document, with the surviving tokens
        in their original order.
    """
    occurrences = {}
    for document in texts:
        for word in document:
            occurrences[word] = occurrences.get(word, 0) + 1

    filtered = []
    for document in texts:
        filtered.append([word for word in document if occurrences[word] > count_threshold])
    return filtered

In [None]:
# Text preprocess
processed_msg = {}
raw_message = []
# Set copy of the stop words for constant-time membership tests.
stop_set = set(en_stop)
for key, document in cleaned_msg.items():
    # simple_preprocess lower-cases and tokenizes; min_len=3 drops tokens
    # shorter than three characters.
    tokens = utils.simple_preprocess(document, min_len=3)
    # remove stop words from tokens
    stopped_tokens = [token for token in tokens if token not in stop_set]
    # stem tokens (lemmatization could be tried instead)
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
    print(key)
    print(stemmed_tokens)
    print("-"*100)

    processed_msg[key] = stemmed_tokens
    raw_message.append(document)

# Remove tokens that do not appear often enough in the corpus.
# The filter needs every document at once to count corpus-wide frequencies, so
# it runs here, after the loop. Calling it on a single document's token list
# would make it iterate over the characters of each token instead.
# With a threshold of 2, a token is kept when it occurs more than twice overall.
filtered_docs = word_frequency_filter(list(processed_msg.values()), 2)
processed_msg = dict(zip(processed_msg.keys(), filtered_docs))


In [None]:
# Build the id <-> term dictionary from the tokenized emails
dictionary = corpora.Dictionary(list(processed_msg.values()))

In [None]:
number_topic = 7
# Fixed seed: LDA training is stochastic, so without it the topics (and their
# numbering) change on every run and the cells below that refer to a topic id
# are not reproducible.
lda_random_state = 42

# convert tokenized documents into a document-term matrix (bag-of-words)
corpus = [dictionary.doc2bow(text) for text in processed_msg.values()]

# generate LDA model
ldamodel = models.ldamodel.LdaModel(
    corpus,
    num_topics=number_topic,
    id2word=dictionary,
    passes=100,
    random_state=lda_random_state,
)

# Show the 10 highest-weighted words of every topic.
ldamodel.print_topics(num_topics=number_topic, num_words=10)

In [None]:
# Print the topic distribution of each document, then its position in the corpus.
for count, c in enumerate(ldamodel[corpus]):
    print(
        c,
        "document number : {}".format(count),
        "------------------------------------------------------\n",
        sep="\n",
    )

In [None]:
# Show each cleaned document next to its inferred topic distribution.
# corpus is built from processed_msg, which has the same keys in the same order
# as cleaned_msg, so documents are paired with their result positionally.
# key_list holds the labels of the unfiltered msg dict and only matches the
# keys of cleaned_msg by coincidence.
for count, (document, c) in enumerate(zip(cleaned_msg.values(), ldamodel[corpus])):
    print("document number : {}".format(count))
    print(document)
    print("Results     : ", c)
    print("------------------------------------------------------\n")

In [None]:
# Render the interactive pyLDAvis view of the fitted model inline.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=ldamodel,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.display(vis)

In [None]:
# Topic distribution inferred for the first document of the corpus.
ldamodel.get_document_topics(corpus[0])

In [None]:
# Term-topic matrix of the fitted model.
ldamodel.get_topics()

In [None]:
# Word summary of the model's topics, using the library's default settings.
ldamodel.show_topics()

In [None]:
# List every document whose topic distribution contains the chosen topic.
topic = 4
# cleaned_msg, processed_msg and corpus share the same document order, so the
# three are walked in parallel instead of being looked up through key_list
# (the labels of the unfiltered msg dict).
aligned = zip(cleaned_msg.values(), processed_msg.values(), ldamodel[corpus])
for count, (document, doc_tokens, c) in enumerate(aligned):
    for cc in c:
        if cc[0] == topic:
            print("document number : {}".format(count))
            print(document)
            print("\n")
            # Report the matched entry cc, not c[0]: c[0] is merely the first
            # topic listed for the document and is usually a different topic.
            print("Topics number      : ", cc)
            print(ldamodel.print_topic(cc[0]))
            print("\n")
            # cc[1] is the weight of the matched topic in this document.
            print("similarity index : {}".format(cc[1]))
            print("\n")
            print(doc_tokens)
            print("------------------------------------------------------\n")

In [None]:
# Histogram (100 bins) of the cleaned message lengths, in characters.
msg_len = list(map(len, cleaned_msg.values()))
plt.hist(msg_len, bins=100)

The minimal length of a document should be 70 characters.

Stemming might not be appropriate here; lemmatization could be tested instead.