In [1]:
import re
import nltk
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
import warnings
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
warnings.filterwarnings("ignore",category=DeprecationWarning)
%matplotlib inline

#https://na80.lightning.force.com/lightning/r/Report/00O1P000003mYjbUAE/view



In [2]:
df = pd.read_csv('august_data.csv')
df.Description.head()

0    Chat transcript\n\n\t\t\tE-mail: bistro@tohu.c...
1    Chat transcript\n\n\t\t\tName: Rebekkah Damian...
2    Chat transcript\n\n\t\t\tE-mail: jaycen@me.com...
3    Chat transcript\n\n\t\t\tE-mail: mario@elevati...
4    Chat transcript\n\n\t\t\tE-mail: opnaples@came...
Name: Description, dtype: object

In [3]:
data = df.Description.values.tolist()
data = [re.sub('\S*@\S*\s?', "", sent) for sent in data]
data = [re.sub('\s+', " ", sent) for sent in data]
data = [re.sub("\'", "", sent) for sent in data]
data = [re.sub('https:\S+',"", sent) for sent in data]

pprint(data[:1])

['Chat transcript E-mail: Victoria Fri, 08/27/21 12:54:54 pm America/Toronto '
 'BevSpot Support here! How can we help? Bistro Tohu 12:55:16 pm hi i would '
 'like to import invoices can you activate the addon? Victoria 12:56:14 pm '
 'Hello - Thank you for reaching out! Are you referring to invoice processing? '
 'Bistro Tohu 12:56:25 pm i guess i want to import csv and pdf for creating '
 'items Victoria 01:02:13 pm Ok... Just to clarify, these are going to be '
 'invoiced from a vendor, and thats what youre looking to enter into the '
 'account? Or do you have a list of items on a file that we can upload through '
 'the items section? Just double-checking because currently, invoice '
 'processing is an add-on feature that is not included within the freemium '
 'program. Bistro Tohu 01:02:55 pm from invoice i m ok to pay Victoria '
 '01:04:12 pm Ok, thank you for this information. Hang tight for a quick '
 'moment while I reach out to your account manager. Bistro Tohu 01:04:52 pm '
 

In [4]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
data_words = list(sent_to_words(data))

In [5]:
bigram = gensim.models.Phrases(data, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data], threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [6]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [7]:
data_words_nostops = remove_stopwords(data)
data_words_bigrams = make_bigrams(data_words_nostops)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [8]:
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

In [9]:
#no touch
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [10]:
#score
print('\nPerplexity: ', lda_model.log_perplexity(corpus))
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.176994217240875

Coherence Score:  0.4154599827415786


In [11]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds='mmds')
vis