<a href="https://colab.research.google.com/github/swedaa/DS-NLP/blob/main/Copy_of_NLP_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing the required Libraries and Packages

In [None]:
#Data Preprocessing and Cleaning
import pandas as pd
import numpy as np
import email
import re

In [None]:
#texthero
!pip install texthero
import texthero as hero

In [None]:
#Model Building
import gensim

import gensim.corpora as corpora
from gensim.utils import simple_preprocess #for tokenization
from gensim.models import CoherenceModel

from gensim.models.phrases import Phrases, Phraser

In [None]:
# spacy for lemmatization
import spacy

# prep NLTK Stop words
import nltk
nltk.download('stopwords')


In [None]:
# Build the stop-word list from NLTK's English corpus (downloaded by the
# nltk.download('stopwords') call in the previous cell).
# The module is `stopwords` (plural); importing `stopword` raises ImportError.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Extra tokens to drop in addition to the standard English list.
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [None]:
from pprint import pprint

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

### Reading the data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
mail = pd.read_csv('/content/drive/MyDrive/emails.csv')

In [None]:
mail.head(10)

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...
5,allen-p/_sent_mail/1002.,Message-ID: <30965995.1075863688265.JavaMail.e...
6,allen-p/_sent_mail/1003.,Message-ID: <16254169.1075863688286.JavaMail.e...
7,allen-p/_sent_mail/1004.,Message-ID: <17189699.1075863688308.JavaMail.e...
8,allen-p/_sent_mail/101.,Message-ID: <20641191.1075855687472.JavaMail.e...
9,allen-p/_sent_mail/102.,Message-ID: <30795301.1075855687494.JavaMail.e...


### Data Preparation

### Extracting the body of the mail

In [None]:
# Extracting the body of the mail
def Body(messages):
    """Return the payload of every raw e-mail string in `messages`.

    Each item is parsed with `email.message_from_string` and the result of
    `get_payload()` is collected, in input order, into a new list.
    """
    return [email.message_from_string(raw).get_payload() for raw in messages]

In [None]:
mail['Body'] = Body(mail['message'])

### Extracting the headers

In [None]:
# Adding the fields into the dataframe
def get_field(field, messages):
    """Return the value of header `field` for every raw e-mail in `messages`.

    Each message string is parsed with `email.message_from_string`; a message
    that lacks the header contributes `None` (the result of `Message.get`).
    The output list has one entry per input message, in the same order.
    """
    parsed = (email.message_from_string(raw) for raw in messages)
    return [msg.get(field) for msg in parsed]

In [None]:
mail['Folder'] = get_field("X-Folder", mail['message'])

In [None]:
mail['Folder'].head(10)

0    \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...
1    \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...
2      \Phillip_Allen_Dec2000\Notes Folders\'sent mail
3      \Phillip_Allen_Dec2000\Notes Folders\'sent mail
4      \Phillip_Allen_Dec2000\Notes Folders\'sent mail
5      \Phillip_Allen_Dec2000\Notes Folders\'sent mail
6      \Phillip_Allen_Dec2000\Notes Folders\'sent mail
7      \Phillip_Allen_Dec2000\Notes Folders\'sent mail
8      \Phillip_Allen_Dec2000\Notes Folders\'sent mail
9      \Phillip_Allen_Dec2000\Notes Folders\'sent mail
Name: Folder, dtype: object

### Extracting the folder name

In [None]:
# Extracting the folder name
def preprocess_folder(folders):
    """Reduce each folder path to its lower-cased last component.

    Paths are split on the backslash character and only the final segment is
    kept. Entries that are `None` or the empty string become `np.nan`.
    Returns a list with one entry per input, in the same order.
    """
    def leaf(path):
        # Missing folder information is represented as NaN.
        if path is None or path == "":
            return np.nan
        return path.split("\\")[-1].lower()

    return [leaf(path) for path in folders]

In [None]:
mail['Folder'] = preprocess_folder(mail['Folder'])

### Dropping the columns - file and message

In [None]:
del_col = ['file','message']

In [None]:
# `columns=` already selects the column axis, so no separate axis argument is needed.
mail = mail.drop(columns=del_col)

In [None]:
mail.shape

(517401, 2)

In [None]:
pd.set_option('display.max_rows', 5000)
mail.Folder.value_counts()

all documents                                        128103
discussion threads                                    58609
sent                                                  58168
deleted items                                         50987
inbox                                                 41507
sent items                                            37920
notes inbox                                           36665
'sent mail                                            30237
untitled                                               8957
personal                                               2642
attachments                                            2026
meetings                                               1872
calendar                                               1732
schedule crawler                                       1398
logistics                                              1192
tw-commercial group                                    1159
california                              

### Extracting the sent folders: `sent`, `sent items` and `'sent mail`

In [None]:
sent = mail[mail['Folder']=='sent']

In [None]:
sent_items = mail[mail['Folder']=='sent items']

In [None]:
sent_mail = mail[mail['Folder']=="'sent mail"]

In [None]:
df_rough = pd.concat([sent,sent_items], ignore_index=True)

In [None]:
df_main = pd.concat([df_rough,sent_mail], ignore_index=True)

In [None]:
df_main.shape

(126325, 2)

In [None]:
email_subset = df_main.sample(frac=0.08, random_state=1)

In [None]:
email_subset.shape

(10106, 2)

In [None]:
df = email_subset.copy(deep = True)

In [None]:
df.head()

Unnamed: 0,Body,Folder
109126,"Seve,\n\nVicky worked for Risk Conferences in ...",'sent mail
62000,Who will get them? I am happy to sit down wit...,sent items
70989,"Pursuant to Frank Sayre's request, I am attach...",sent items
30536,what room?\nPL,sent
90425,"Lloyd/Rika,\n\nKevin has asked me to be the co...",sent items


In [None]:
df.shape

(10106, 2)

### Data Cleaning

In [None]:
# Removing forwarded by
def deal_forwarded(Body):
    """Strip "Forwarded by ..." header blocks from a single e-mail body.

    A header block starts at an optional run of dashes/spaces followed by
    "Forwarded by" and ends at the end of the next "Subject:" line.

    Args:
        Body: the e-mail body as a string.

    Returns:
        The body with every such block removed and surrounding whitespace
        stripped.
    """
    # Raw string: "\S" / "\s" are regex classes, not Python string escapes.
    # The middle part is non-greedy ("*?") so each forwarded header is removed
    # on its own; a greedy match would run from the first "Forwarded by" to
    # the LAST "Subject:" and delete the message text in between.
    condition = r'[- ]*Forwarded by[\S\s]*?Subject:[\S\t ]*'
    return re.sub(condition, '', Body).strip()
df['Body']= df.Body.map(deal_forwarded)

In [None]:
# Expanding the contractions
def expand_contractions(Body):
  """Expand common English contractions in every e-mail body.

  Args:
    Body: iterable of body strings (e.g. the `Body` column).

  Returns:
    A list with one expanded string per input, in the same order.
  """
  # Irregular negations must be handled before the generic "n't" rule,
  # which would otherwise produce the junk tokens "wo not" / "ca not".
  # Matching is case-insensitive; the replacement is lower-case.
  irregular = ((re.compile(r"won't", re.IGNORECASE), "will not"),
               (re.compile(r"can't", re.IGNORECASE), "can not"))
  Apos_dict={"'s":" is","n't":" not","'m":" am","'ll":" will",
             "'d":" would","'ve":" have","'re":" are"}

  column = []
  # Iterate over the argument, not the global dataframe, so the function
  # works on whatever collection of strings the caller passes in.
  for text in Body:
    for pattern, expansion in irregular:
      text = pattern.sub(expansion, text)
    for key,value in Apos_dict.items():
      text = text.replace(key,value)
    column.append(text)

  return column

# Pass the column itself; wrapping it in a list would make the function
# iterate over a single Series instead of over the individual bodies.
df['Body'] = expand_contractions(df['Body'])

In [None]:
# Removing email
def remove_email(body):
  """Replace every e-mail address in each body with a single space.

  Args:
    body: iterable of body strings (e.g. the `Body` column).

  Returns:
    A list with one cleaned string per input, in the same order.
  """
  # Compiled once, outside the loop; the pattern itself is unchanged.
  pattern = re.compile(r'[a-zA-Z0-9._+-]+@[a-zA-Z0-9]+\.[a-zA-Z.]+')
  # Iterate over the argument rather than the global dataframe.
  return [pattern.sub(' ', text) for text in body]

# Pass the column itself, not a one-element list wrapping it.
df['Body'] = remove_email(df['Body'])

In [None]:
#Removing special characters
def remove_spec_characters(body):
  """Replace every non-word character in each body with a single space.

  Word characters (letters, digits, underscore) are kept; everything else,
  including punctuation and whitespace, becomes ' '.

  Args:
    body: iterable of body strings (e.g. the `Body` column).

  Returns:
    A list with one cleaned string per input, in the same order.
  """
  # The previous pattern r'(^\w\s)|(\W)' had a mistyped character class:
  # '^\w\s' matched "one word character + whitespace at the start of the
  # text" and silently deleted a leading one-letter word. '\W' alone covers
  # every non-word character.
  pattern = re.compile(r'\W')
  # Iterate over the argument rather than the global dataframe.
  return [pattern.sub(' ', text) for text in body]

# Pass the column itself, not a one-element list wrapping it.
df['Body'] = remove_spec_characters(df['Body'])

In [None]:
# Decapitalizing the body
df['Body'] = hero.preprocessing.lowercase(df['Body'])

In [None]:
#Removing url
df['Body'] = hero.preprocessing.remove_urls(df['Body'])

In [None]:
#Removing html tags
df['Body'] = hero.preprocessing.remove_html_tags(df['Body'])

In [None]:
#Removing brackets
df['Body'] = hero.preprocessing.remove_brackets(df['Body'])

In [None]:
# Removing digits
df['Body']  = hero.remove_digits(df['Body'])

In [None]:
# Removing whitespace
df['Body'] = hero.remove_whitespace(df['Body'])

In [None]:
df.tail()

Unnamed: 0,Body,Folder
82023,gary attached is the previously discussed gisb...,sent items
41465,that is fine just let me know when you want to...,sent
46007,sugarbooger looks like continental is our best...,sent
67193,madhup here is some more specific detail as pr...,sent items
87969,jorge please read paragraph of the termination...,sent items


### Data Pre-Processing

In [None]:
# Converting email body to list
data = df.Body.values.tolist()

In [None]:
# Tokenization
def tokenization(sentences):
    """Lazily tokenize each item of `sentences`.

    Every item is converted with `str()` and passed to gensim's
    `simple_preprocess` with `deacc=True`; one token list is yielded per item.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(sentence), deacc=True) for sentence in sentences)

In [None]:
#Creating a list
data_words = list(tokenization(data))

In [None]:
# Removing stop words
def remove_stopwords(texts):
    """Drop stop words from every document in `texts`.

    Each document is converted with `str()` and re-tokenized with
    `simple_preprocess`; tokens found in the module-level `stop_words`
    collection are filtered out.

    Returns:
        A list of token lists, one per input document, in the same order.
    """
    # Snapshot the stop words as a set once: membership tests on a set are
    # constant-time, while the original list was scanned for every token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked] for doc in texts]


data_words_nostops = remove_stopwords(data_words)

In [None]:
# Making bigrams
# Fit the phrase (bigram) model on the tokenized documents in `data_words`.
bigram = Phrases(data_words, min_count=5, threshold=100)

def make_bigrams(texts):
    """Apply the fitted `bigram` phrase model to each tokenized document.

    Returns a list holding `bigram[doc]` for every `doc` in `texts`,
    in the same order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram[tokens])
    return merged


# The model above was fitted on `data_words`; it is applied here to the
# stop-word-filtered documents.
data_words_bigrams = make_bigrams(data_words_nostops)

In [None]:
# Lemmatization

# Initializing the small English spaCy pipeline without the parser and NER
# components. The explicit package name is used because the 'en' shortcut was
# removed in spaCy 3; the package name also works where the shortcut was a link to it.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    See https://spacy.io/api/annotation for the tag set.

    Args:
        texts: iterable of token lists; each list is joined with spaces and
            processed by the module-level `nlp` pipeline.
        allowed_postags: coarse POS tags (`token.pos_`) whose tokens are kept.
            The default list is only read, never modified.

    Returns:
        A list with, for every input document, the lemmas of the tokens whose
        POS tag is in `allowed_postags`.
    """
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a stream, in order, which avoids the
    # per-call overhead of invoking nlp() once per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined)
    ]

# Lemmatizing only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

### Model Building

In [None]:
# Creating dictionary and corpus
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

In [None]:
# Building Bag of words model
corpus = [id2word.doc2bow(text) for text in texts]

### Building Bag of Words LDA model

In [None]:
# Build LDA model
lda_model_1 = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
pprint(lda_model_1.print_topics())
doc_lda = lda_model_1[corpus]

[(0,
  '0.029*"transaction" + 0.027*"option" + 0.025*"termination" + '
  '0.023*"product" + 0.020*"credit" + 0.017*"financial" + 0.013*"stop" + '
  '0.013*"approve" + 0.013*"trading" + 0.012*"average"'),
 (1,
  '0.043*"go" + 0.022*"get" + 0.018*"say" + 0.016*"come" + 0.016*"live" + '
  '0.015*"take" + 0.014*"good" + 0.013*"home" + 0.013*"really" + 0.010*"make"'),
 (2,
  '0.177*"send" + 0.159*"message" + 0.122*"original" + 0.037*"pm" + '
  '0.025*"mail" + 0.015*"may" + 0.013*"weekend" + 0.010*"offer" + '
  '0.009*"receive" + 0.008*"bill"'),
 (3,
  '0.042*"deal" + 0.032*"position" + 0.024*"gas" + 0.022*"change" + '
  '0.021*"confirmation" + 0.020*"report" + 0.016*"month" + 0.016*"new" + '
  '0.015*"day" + 0.013*"adjust"'),
 (4,
  '0.021*"agreement" + 0.021*"attach" + 0.019*"file" + 0.018*"information" + '
  '0.013*"receive" + 0.012*"regard" + 0.012*"copy" + 0.012*"document" + '
  '0.011*"contract" + 0.011*"mail"'),
 (5,
  '0.017*"plant" + 0.015*"employee" + 0.015*"issue" + 0.013*"custome

In [None]:
# Perplexity
# NOTE(review): per the gensim documentation, LdaModel.log_perplexity returns the
# per-word likelihood bound, not the perplexity itself (hence the negative
# value printed below); the perplexity would be 2 ** (-bound). Confirm which
# quantity is meant before comparing models on this number.
print('Perplexity: ', lda_model_1.log_perplexity(corpus))

Perplexity:  -7.787832104704934


In [None]:
# Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model_1, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.4607617170825874


### Building Tf-idf LDA model

In [None]:
# Building Tf-idf LDA model

from gensim import corpora, models

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [None]:
lda_model_tfidf = gensim.models.ldamodel.LdaModel(corpus = corpus_tfidf,
                                                  id2word = id2word,
                                                  num_topics = 10,
                                                  random_state = 100,
                                                  chunksize = 100,
                                                  passes = 10,
                                                  per_word_topics = True)

In [None]:
# Show the topics learned by the tf-idf LDA model.
pprint(lda_model_tfidf.print_topics())
# This model was trained on `corpus_tfidf`, so documents are transformed using
# the same tf-idf representation (the raw bag-of-words `corpus` would feed it
# count weights it was not trained on).
doc_lda = lda_model_tfidf[corpus_tfidf]

[(0,
  '0.017*"confirmation" + 0.016*"position" + 0.016*"ticket" + 0.010*"hold" + '
  '0.010*"right" + 0.009*"attend" + 0.008*"season" + 0.008*"sell" + '
  '0.008*"can" + 0.008*"move"'),
 (1,
  '0.013*"late" + 0.011*"easy" + 0.010*"stay" + 0.009*"page" + 0.008*"wo" + '
  '0.008*"stuff" + 0.008*"evening" + 0.007*"pretty" + 0.007*"catch" + '
  '0.007*"phone_fax"'),
 (2,
  '0.010*"go" + 0.009*"know" + 0.009*"send" + 0.009*"thank" + 0.009*"want" + '
  '0.009*"message" + 0.008*"let" + 0.008*"original" + 0.008*"take" + '
  '0.007*"would"'),
 (3,
  '0.013*"response" + 0.012*"never" + 0.011*"pdx" + 0.011*"chance" + '
  '0.009*"advise" + 0.008*"forget" + 0.007*"course" + 0.007*"different" + '
  '0.007*"broker" + 0.007*"handle"'),
 (4,
  '0.053*"minute" + 0.013*"volume" + 0.011*"show" + 0.011*"cell" + '
  '0.010*"manage" + 0.009*"individual" + 0.009*"memo" + 0.009*"speak" + '
  '0.008*"station" + 0.008*"handheld_www"'),
 (5,
  '0.016*"update" + 0.013*"approve" + 0.011*"plant" + 0.010*"average" +

In [None]:
# Perplexity
# NOTE(review): per the gensim documentation, LdaModel.log_perplexity returns the
# per-word likelihood bound, not the perplexity itself; the perplexity would
# be 2 ** (-bound). The bound here is computed on tf-idf weights, while the
# bag-of-words model's bound is computed on raw counts, so the two printed
# values are presumably not directly comparable. Confirm before drawing conclusions.
print('\nPerplexity: ', lda_model_tfidf.log_perplexity(corpus_tfidf))


Perplexity:  -10.735637358827105


In [None]:
# Coherence Score
coherence_model_tfidf = CoherenceModel(model=lda_model_tfidf, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda_tfidf = coherence_model_tfidf.get_coherence()
print('\nCoherence Score: ', coherence_lda_tfidf)


Coherence Score:  0.3422078142666097
