In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.probability import FreqDist
import glob
import os
import time
import warnings

In [None]:
# Wall-clock timestamp taken before any work starts; the last cell subtracts it
# from a second timestamp to print the elapsed time in minutes.
code_start = time.time()

In [None]:
# Point NLTK at the current user's nltk_data folder instead of a hard-coded
# absolute path that only exists on one machine. The membership guard keeps a
# re-run of this cell from appending the same entry again.
# NOTE(review): assumes the corpora live in <home>/nltk_data - confirm.
NLTK_DATA_DIR = os.path.join(os.path.expanduser("~"), "nltk_data")
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Regex fragments read_data uses to locate the comment and score columns.
text_field = 'QQ_NPS_COMMENT'
score_field = 'QQ_KEYMETRIC_LTR'

In [None]:
def read_data(path, score_pattern=None, text_pattern=None):
    """Collect English low-score comments from every CSV file in a folder.

    Parameters
    ----------
    path : str
        Folder relative to the current working directory. Components may be
        separated by backslashes or forward slashes; a leading separator is
        optional.
    score_pattern, text_pattern : str, optional
        Regexes selecting the score and comment columns. When omitted they
        default to the module-level ``score_field`` and ``text_field``.

    Returns
    -------
    pandas.DataFrame
        A single ``comment`` column with a fresh 0..n-1 index. Rows are kept
        when ``LanguageID`` equals ``'EN'``, the score is 6 or lower and the
        comment is not missing. Empty (still with the ``comment`` column) when
        the folder holds no CSV files.
    """
    if score_pattern is None:
        score_pattern = score_field
    if text_pattern is None:
        text_pattern = text_field

    # Split on either separator so the same call works on any OS.
    parts = [part for part in re.split(r"[\\/]+", path) if part]
    folder = os.path.join(os.getcwd(), *parts)
    # Sorted so the row order does not depend on directory listing order.
    csv_files = sorted(glob.glob(os.path.join(folder, "*.csv")))

    frames = []
    for f in csv_files:
        df = pd.read_csv(f)

        # First column whose name matches the score pattern.
        raw_scores = df.filter(regex=score_pattern).iloc[:, 0]
        if not pd.api.types.is_numeric_dtype(raw_scores):
            # Labels may carry text around the number; keep the digits only.
            # regex=True is required: pandas >= 2.0 treats the pattern as a
            # literal string by default.
            raw_scores = raw_scores.str.replace(r"[^0-9]", "", regex=True)
        # Blank or unparseable scores become NaN and fail the <= 6 test below
        # instead of aborting the whole load.
        df['npsscore'] = pd.to_numeric(raw_scores, errors="coerce")

        keep = (df['LanguageID'] == 'EN') & (df['npsscore'] <= 6)
        # First column whose name matches the comment pattern.
        comments = df.loc[keep].filter(regex=text_pattern).iloc[:, [0]].dropna()
        comments.columns = ['comment']
        frames.append(comments)

    if not frames:
        return pd.DataFrame(columns=['comment'])
    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Read the training and validation comment sets; both folders are resolved
# by read_data relative to the current working directory.
df_train_raw, df_validation_raw = (
    read_data(folder) for folder in ("\\verbatim\\train", '\\verbatim\\test')
)

In [None]:
# Work on deep copies so the text-cleaning cells never touch the raw frames.
df_train, df_validation = (
    raw.copy(deep=True) for raw in (df_train_raw, df_validation_raw)
)

In [None]:
# Reduce the training comments to lowercase letters separated by single spaces.
# regex=True is passed explicitly because pandas >= 2.0 treats str.replace
# patterns as literal text by default, which silently disabled these rules.
# The earlier punctuation pattern is dropped: its unescaped "[\\]" closed the
# character class early, so it did not match individual punctuation marks, and
# the non-letter rule below already turns punctuation into spaces.
df_train["comment"] = (
    df_train["comment"]
    .str.replace(r"[^A-Za-z]", " ", regex=True)
    .str.lower()
    .str.replace(r" +", " ", regex=True)
)

In [None]:
# Reduce the validation comments to lowercase letters separated by single spaces.
# regex=True is passed explicitly because pandas >= 2.0 treats str.replace
# patterns as literal text by default, which silently disabled these rules.
# The earlier punctuation pattern is dropped: its unescaped "[\\]" closed the
# character class early, so it did not match individual punctuation marks, and
# the non-letter rule below already turns punctuation into spaces.
df_validation["comment"] = (
    df_validation["comment"]
    .str.replace(r"[^A-Za-z]", " ", regex=True)
    .str.lower()
    .str.replace(r" +", " ", regex=True)
)

In [None]:
# Expand the abbreviation "dept" only where it stands as a whole word. The
# unanchored pattern also rewrote words that merely contain it ("adept").
df_train["comment"] = df_train["comment"].str.replace(r"\bdept\b", "department", regex=True)
df_validation["comment"] = df_validation["comment"].str.replace(r"\bdept\b", "department", regex=True)

### Lemmatization

In [None]:
# Single shared lemmatizer instance; lemmatize_sentence reads it as a global.
lemmatizer = nltk.WordNetLemmatizer()

In [None]:
def lemmatize_sentence(sentence):
    """Return `sentence` with each space-separated token lemmatized as a verb.

    Tokens are produced by splitting on a single space and re-joined with a
    single space, so the spacing of the input is carried over unchanged.
    """
    lemmas = []
    for token in sentence.split(' '):
        lemmas.append(lemmatizer.lemmatize(token, wordnet.VERB))
    return ' '.join(lemmas)

In [None]:
# Lemmatize the comment column of both frames.
for frame in (df_train, df_validation):
    frame["comment"] = frame["comment"].apply(lemmatize_sentence)

### Stopwords Removal

In [None]:
# Fetch the corpus through the nltk package rather than through the imported
# `stopwords` name: this assignment rebinds that name, so the previous
# `stopwords.words(...)` call failed as soon as the cell was run a second time.
# The name is kept because the filtering cells test membership against it; a
# set makes those tests constant-time.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [None]:
# df_train2 was indexed before it existed (NameError on a fresh kernel).
# assign() creates it as a new frame and leaves df_train unchanged.
df_train2 = df_train.assign(
    comment=df_train["comment"].apply(
        lambda x: ' '.join([word for word in x.split() if word not in stopwords])
    )
)

In [None]:
# df_validation2 was indexed before it existed (NameError on a fresh kernel).
# assign() creates it as a new frame and leaves df_validation unchanged.
df_validation2 = df_validation.assign(
    comment=df_validation["comment"].apply(
        lambda x: ' '.join([word for word in x.split() if word not in stopwords])
    )
)

### N-grams
#### Bi-grams

In [None]:
# The collocation classes live in nltk.collocations; `nltk.collections` does
# not provide them.
bigram_measures = nltk.collocations.BigramAssocMeasures()
# Each comment is tokenised on whitespace and treated as one document.
bi_finder = nltk.collocations.BigramCollocationFinder.from_documents(
    [comment.split() for comment in df_train2.comment]
)

In [None]:
# apply_freq_filter is a method of the finder and filters it in place, so it
# is called on bi_finder and its result is not assigned back. Bigrams seen
# fewer than 50 times are discarded.
bi_finder.apply_freq_filter(50)

In [None]:
# Score the finder's bigrams with the `pmi` association measure; the next cell
# reads each result as an (ngram, score) pair.
bigram_scores = bi_finder.score_ngrams(bigram_measures.pmi)

In [None]:
# Keep bigrams scoring at least 2, stored as space-joined phrases.
bigrams = [' '.join(words) for words, score in bigram_scores if score >= 2]

#### Tri-grams

In [None]:
# The collocation classes live in nltk.collocations; `nltk.collections` does
# not provide them.
trigram_measures = nltk.collocations.TrigramAssocMeasures()
# Each comment is tokenised on whitespace and treated as one document.
tri_finder = nltk.collocations.TrigramCollocationFinder.from_documents(
    [comment.split() for comment in df_train2.comment]
)

In [None]:
# apply_freq_filter is a method of the finder and filters it in place, so it
# is called on tri_finder and its result is not assigned back. Trigrams seen
# fewer than 50 times are discarded.
tri_finder.apply_freq_filter(50)

In [None]:
# Score the finder's trigrams with the `pmi` association measure; the next cell
# reads each result as an (ngram, score) pair.
trigram_scores = tri_finder.score_ngrams(trigram_measures.pmi)

In [None]:
# Keep trigrams scoring at least 5, stored as space-joined phrases.
trigrams = [' '.join(words) for words, score in trigram_scores if score >= 5]

#### Replace N-grams in data

In [None]:
def replace_ngrams(x, bigram_phrases=None, trigram_phrases=None):
    """Join every known n-gram occurring in `x` into one underscore token.

    Parameters
    ----------
    x : str
        Text whose words are separated by single spaces.
    bigram_phrases, trigram_phrases : iterable of str, optional
        Space-separated phrases to join. When omitted they default to the
        module-level ``bigrams`` and ``trigrams`` lists.

    Returns
    -------
    str
        `x` with each matched phrase rewritten as ``word_word[_word]``.
    """
    if bigram_phrases is None:
        bigram_phrases = bigrams
    if trigram_phrases is None:
        trigram_phrases = trigrams

    # Trigrams go first. Substituting bigrams first rewrites "a b c" to
    # "a_b c", after which the trigram "a b c" can never match.
    for gram in list(trigram_phrases) + list(bigram_phrases):
        if gram not in x:
            # Cheap substring test; most phrases are absent from a given text.
            continue
        joined = '_'.join(gram.split())
        # Word boundaries stop a phrase from matching inside longer words
        # (e.g. "art work" within "smart work").
        x = re.sub(r"\b" + re.escape(gram) + r"\b", lambda _match: joined, x)
    return x

In [None]:
# Rewrite the training comments so detected n-grams become single tokens.
df_train2["comment"] = df_train2["comment"].map(replace_ngrams)

In [None]:
# Rewrite the validation comments so detected n-grams become single tokens.
df_validation2["comment"] = df_validation2["comment"].map(replace_ngrams)

### Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# max_df / min_df bound how many documents a term may appear in to be kept.
cv = CountVectorizer(max_df = 0.95, min_df = 100)

# The vocabulary is learned from the training comments only.
cv_train = cv.fit_transform(df_train2["comment"].tolist())
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement when
# available and fall back on older installs; tolist() keeps the plain list of
# str that the original call returned.
if hasattr(cv, "get_feature_names_out"):
    cv_train_feature_names = cv.get_feature_names_out().tolist()
else:
    cv_train_feature_names = cv.get_feature_names()

# Validation comments are encoded with the training vocabulary (no refit).
cv_validation = cv.transform(df_validation2["comment"].tolist())

### LDA Model

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Model settings, kept as names because later cells reuse no_topics.
no_topics = 15
hyper_alpha = 0.8          # passed as doc_topic_prior
hyper_beta = 0.01          # passed as topic_word_prior
this_random_state = 53

start = time.time()

# Build the estimator first, then fit it on the training count matrix.
lda_detractors = LatentDirichletAllocation(
    n_components = no_topics,
    doc_topic_prior = hyper_alpha,
    topic_word_prior = hyper_beta,
    max_iter = 300,
    learning_method = 'batch',
    random_state = this_random_state,
)
lda_detractors.fit(cv_train)

In [None]:
# Perplexity, then score, of the fitted model on the training matrix.
print(lda_detractors.perplexity(cv_train), lda_detractors.score(cv_train), sep='\n')

In [None]:
# Perplexity, then score, of the fitted model on the validation matrix.
print(lda_detractors.perplexity(cv_validation), lda_detractors.score(cv_validation), sep='\n')

#### Get topic correlations

In [None]:
# Transform the training matrix with the fitted model; the next cell reads
# column `num` of the result as the score for topic `num`.
predictions = lda_detractors.transform(X = cv_train)

In [None]:
# One "Topic_<n>_score" column per topic, taken from the matching column of
# `predictions`, built in a single constructor call.
corr_data = pd.DataFrame(
    predictions[:, :no_topics],
    columns=["Topic_" + str(num) + "_score" for num in range(no_topics)],
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
corr = corr_data.corr()

sns.set(style = "white")
# np.bool was removed from NumPy (1.24); the builtin bool is the supported dtype.
mask = np.zeros_like(corr, dtype = bool)
# Hide the upper triangle, diagonal included, so each pair is drawn once.
mask[np.triu_indices_from(mask)] = True
# plt.subplots (plural) returns the (figure, axes) pair and accepts figsize.
f, ax = plt.subplots(figsize = (11,9))
sns.heatmap(corr, mask = mask, vmax = 0.3, center = 0,
           square = True, linewidths = 0.5, cbar_kws = {"shrink": 0.5}, ax = ax)
ax.set_title("Correlation between topic scores (training set)")

plt.show()

#### Display topics

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print one line per topic listing its highest-weighted words.

    For each row of ``model.components_`` the `no_top_words` largest weights
    are visited from largest to smallest. A word is printed only when its
    weight, truncated to an integer, is greater than 1. Every printed word is
    followed by ", ".
    """
    for topic_idx, topic in enumerate(model.components_):
        strongest_first = topic.argsort()[::-1][:no_top_words]
        shown = [
            "{}, ".format(feature_names[i])
            for i in strongest_first
            if int(topic[i]) > 1
        ]
        print("Topic % d: " % (topic_idx) + "".join(shown))
        
# Upper bound on the number of words display_topics considers per topic.
no_top_words = 25

display_topics(lda_detractors, cv_train_feature_names, no_top_words)

### Important words for Topics

In [None]:
# Empty table with 'Topic' and 'Words' columns; the following cells fill it.
df_words = pd.DataFrame(columns = ['Topic', 'Words'])

In [None]:
# One row per topic id; 'Words' is cast to object dtype before the next cell
# stores a list in each of its cells.
df_words['Topic'] = list(range(no_topics))
df_words['Words'] = df_words['Words'].astype(object)

In [None]:
# Number of words reported per topic; the heading is derived from it so the
# printed text can no longer disagree with the slice (it said 15, took 20).
top_n_words = 20

# Fetch the vocabulary once instead of twice per topic. get_feature_names()
# was removed in scikit-learn 1.2, so prefer its replacement when available.
if hasattr(cv, "get_feature_names_out"):
    topic_feature_names = cv.get_feature_names_out().tolist()
else:
    topic_feature_names = cv.get_feature_names()

for index, topic in enumerate(lda_detractors.components_):
    # argsort is ascending, so the heaviest word comes last in the list.
    # (The original second lookup called an undefined `topic_argsort`.)
    top_words = [topic_feature_names[i] for i in topic.argsort()[-top_n_words:]]
    print(f"The top {top_n_words} words for Topic {index}")
    print(top_words)
    df_words.at[index, 'Words'] = top_words
    print('\n')

In [None]:
# Display the per-topic word table.
df_words

### Tagging Train & Validation topics

In [None]:
# Tag each training row with the index of its highest-scoring topic.
# NOTE(review): relies on df_train_raw rows lining up one-to-one with the rows
# of cv_train - confirm no rows were dropped in between.
topic_results_train = lda_detractors.transform(cv_train)
print(topic_results_train.shape)
df_train_raw['Topic'] = np.argmax(topic_results_train, axis = 1)

In [None]:
# Tag each validation row with the index of its highest-scoring topic.
# NOTE(review): relies on df_validation_raw rows lining up one-to-one with the
# rows of cv_validation - confirm no rows were dropped in between.
topic_results_validation = lda_detractors.transform(cv_validation)
print(topic_results_validation.shape)
df_validation_raw['Topic'] = np.argmax(topic_results_validation, axis = 1)

### Saving the labeled data, Model and Vectorizer

In [None]:
# The 'Words' cells hold Python lists, which xlsxwriter's write() does not
# accept, so they are flattened to comma-separated text for the export only;
# df_words itself is left unchanged.
df_words_export = df_words.assign(
    Words = df_words['Words'].map(
        lambda words: ', '.join(words) if isinstance(words, (list, tuple)) else words
    )
)

# The with-block saves and closes the workbook on exit, even on error;
# ExcelWriter.save() was removed in pandas 2.0. sheet_name is passed by keyword
# because newer pandas no longer accepts it positionally.
with pd.ExcelWriter('LDA_detractors.xlsx', engine = 'xlsxwriter') as writer:
    df_train_raw.to_excel(writer, sheet_name = 'Train')
    df_validation_raw.to_excel(writer, sheet_name = 'Validation')
    # Was `df_words.to_excel.to_excel(...)`, which raises AttributeError.
    df_words_export.to_excel(writer, sheet_name = 'Words')

In [None]:
import pickle

# Build the paths with os.path.join so the separator matches the host OS; the
# concatenated "\\verbatim\\..." form only yields a real sub-folder on Windows.
vectorizer_name = os.path.join(os.getcwd(), 'verbatim', 'detractor_vectorizer.pkl')
with open(vectorizer_name, 'wb') as file:
    pickle.dump(cv, file)

pkl_filenam = os.path.join(os.getcwd(), 'verbatim', 'lda_detractor_model.pkl')
with open(pkl_filenam, 'wb') as file:
    pickle.dump(lda_detractors, file)

In [None]:
# Total notebook run time in minutes, measured from code_start.
code_end = time.time()
elapsed_minutes = (code_end - code_start) / 60
print(elapsed_minutes)