In [1]:
%pylab inline

import pandas as pd
import numpy as np
import google_trans_new
import nltk
import re

#nltk.download('stopwords')
#nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer
stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")
nltk.download('wordnet')

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

from gensim.models import Phrases
from gensim.corpora import Dictionary

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 


from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

from google_trans_new import google_translator
translator = google_translator()

from yellowbrick.cluster.elbow import kelbow_visualizer

from scipy import sparse as sp

Populating the interactive namespace from numpy and matplotlib


[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_excel('/home/jovyan/work/data/b2b_csat/monthly_csat.xlsx')

for i,text_lines in enumerate(df['NPS Reason']):
    df.loc[i,'nps_reason_eng'] = translator.translate(text_lines, lang_src='ar', lang_tgt='en')
   # print(df.loc[i,'nps_reason_eng'])

### Basic EDA


In [3]:
df.head(3)

Unnamed: 0,Company/Institution/Government Name,Name,Mobile Number,Month,Segmnet,NPS,NPS Category,NPS Reason,Overall CSAT,Product creation CSAT,...,Fixed,Mobility,Data,Managed Services,Computing,Cybersecurity,Big Data,IoT,PTT,nps_reason_eng
0,Royal Protocol,ياسر العتيبي,500154342,Jan,GOV,10,Promotor,شركة وطنيه في المقام الأول ، وخدمت مميزة على م...,1.0,1.0,...,1,1,1,1,0,0,0,0,0,A national company in the first place and serv...
1,Saudi Credit and Savings Bank,فهد القرني,500525086,Jan,GOV,9,Promotor,تعامل راقي ، ودعم فني مميز ، وجودة خدمات مميزة,0.953333,,...,1,0,1,1,0,0,0,0,0,"Ultrasonic treatment, special technical suppor..."
2,Technical and Vocational Training Corporation,Mohamed Almana,504101753,Jan,GOV,8,Passive,جودة خدمات لاتجدها في الشركات المنافسة ، ولكن ...,0.95,1.0,...,1,0,1,1,0,1,0,0,0,Quality of services do not find in competitive...


In [4]:
df.rename({'NPS Category':'nps_cat'},axis=1,inplace=True)
df_dist = df.groupby(['Month','nps_cat']).agg({'nps_cat':'count'})
df_dist.rename({'nps_cat':'nps_count'},axis=1,inplace=True)
df_dist.reset_index(inplace=True)
df_dist

Unnamed: 0,Month,nps_cat,nps_count
0,Apr,Detractor,5
1,Apr,Passive,13
2,Apr,Promotor,42
3,Feb,Detractor,7
4,Feb,Passive,17
5,Feb,Promotor,38
6,Jan,Detractor,4
7,Jan,Passive,8
8,Jan,Promotor,48
9,Mar,Detractor,11


### Text Analysis 

In [5]:
p_df = df['nps_reason_eng']
p_df

0      A national company in the first place and serv...
1      Ultrasonic treatment, special technical suppor...
2      Quality of services do not find in competitive...
3      Mimiz Communications Company that all services...
4      Wonderful services, speed in the Internet desp...
                             ...                        
237                There are no comments from customers 
238                There are no comments from customers 
239    Please modify the mandate We are currently ren...
240    The application of portal has two explanations...
241                There are no comments from customers 
Name: nps_reason_eng, Length: 242, dtype: object

In [6]:
docs = array(p_df)
docs

array(['A national company in the first place and served a distinctive safety and quality ',
       'Ultrasonic treatment, special technical support, and quality of special services ',
       'Quality of services do not find in competitive companies, but prices are slightly high in electronic circuit packages ',
       'Mimiz Communications Company that all services and products are permanently available ',
       'Wonderful services, speed in the Internet despite having some slow peak times, but the company is generally special in its services, products and deals ',
       "Coverage of the network is bad in some border places for Saudi Arabia's outlets with other countries, but as great and distinctive products. ",
       'Excellent sales team, quality and excellent services ',
       'Excellent dealing from everyone ',
       'Quality of service, attention and excellent dealing from everyone ',
       'Excellent services ', 'There are no comments from customers ',
       'Service of 

### Pre-process and vectorize the documents

In [7]:
def docs_preprocessor(docs):
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isdigit()] for doc in docs]
    
    # Remove words that are only one character.
    docs = [[token for token in doc if len(token) > 3] for doc in docs]
    
    # Lemmatize all words in documents.
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
  
    return docs

In [8]:
docs = docs_preprocessor(docs)

### Compute bigrams/trigrams: 

In [9]:
# Add bigrams and trigrams to docs (only ones that appear 10 times or more).
bigram = Phrases(docs, min_count=10)
trigram = Phrases(bigram[docs])

for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)
    for token in trigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

###  Remove rare and common tokens

In [10]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)
print('Number of unique words in initital documents:', len(dictionary))

# Filter out words that occur less than 10 documents, or more than 20% of the documents.
dictionary.filter_extremes(no_below=10, no_above=0.2)
print('Number of unique words after removing rare and common words:', len(dictionary))

Number of unique words in initital documents: 571
Number of unique words after removing rare and common words: 28


### Vectorize data 

In [11]:
corpus = [dictionary.doc2bow(doc) for doc in docs]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 28
Number of documents: 242


### Train LDA model 

In [34]:
from gensim.models import LdaModel

# Set training parameters.
num_topics = 4
chunksize = 500 # size of the doc looked at every pass
passes = 20 # number of passes through documents
iterations = 400
eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)


CPU times: user 1.97 s, sys: 6.31 ms, total: 1.98 s
Wall time: 1.97 s


### Number of Topics 

In [35]:
pyLDAvis.gensim_models.prepare(model, corpus, dictionary)

In [14]:
from pprint import pprint# Print the Keyword in the 10 topics
pprint(model.print_topics())
doc_lda = model[corpus]

[(0,
  '0.201*"account_manager" + 0.141*"with" + 0.092*"account" + 0.076*"manager" '
  '+ 0.074*"that" + 0.069*"been" + 0.067*"reported" + 0.061*"client" + '
  '0.050*"reported_that" + 0.041*"request"'),
 (1,
  '0.393*"been" + 0.271*"problem" + 0.175*"client" + 0.047*"time" + '
  '0.037*"email" + 0.004*"number" + 0.003*"will" + 0.003*"sent" + '
  '0.003*"account" + 0.003*"circuit"'),
 (2,
  '0.184*"product" + 0.168*"service_product" + 0.094*"that" + 0.090*"company" '
  '+ 0.086*"reported" + 0.086*"reported_that" + 0.081*"client" + '
  '0.071*"quality" + 0.066*"circuit" + 0.043*"with"'),
 (3,
  '0.242*"distinctive" + 0.235*"company" + 0.106*"with" + 0.099*"very" + '
  '0.075*"that" + 0.060*"client" + 0.058*"manager" + 0.029*"network" + '
  '0.026*"will" + 0.026*"been"'),
 (4,
  '0.269*"excellent" + 0.207*"coverage" + 0.190*"network" + 0.160*"business" + '
  '0.137*"sector" + 0.002*"very" + 0.002*"with" + 0.002*"reported" + '
  '0.002*"reported_that" + 0.002*"that"'),
 (5,
  '0.198*"sent

In [15]:
p_df = pd.DataFrame(p_df)
p_df

Unnamed: 0,nps_reason_eng
0,A national company in the first place and serv...
1,"Ultrasonic treatment, special technical suppor..."
2,Quality of services do not find in competitive...
3,Mimiz Communications Company that all services...
4,"Wonderful services, speed in the Internet desp..."
...,...
237,There are no comments from customers
238,There are no comments from customers
239,Please modify the mandate We are currently ren...
240,The application of portal has two explanations...


In [16]:
from sklearn.metrics.pairwise import cosine_similarity

p_df['tokenz'] = docs

docs1 = p_df['tokenz'].apply(lambda l: l[:int0(len(l)/2)])
docs2 = p_df['tokenz'].apply(lambda l: l[int0(len(l)/2):])



In [17]:
p_df

Unnamed: 0,nps_reason_eng,tokenz
0,A national company in the first place and serv...,"[national, company, first, place, served, dist..."
1,"Ultrasonic treatment, special technical suppor...","[ultrasonic, treatment, special, technical, su..."
2,Quality of services do not find in competitive...,"[quality, service, find, competitive, company,..."
3,Mimiz Communications Company that all services...,"[mimiz, communication, company, that, service,..."
4,"Wonderful services, speed in the Internet desp...","[wonderful, service, speed, internet, despite,..."
...,...,...
237,There are no comments from customers,"[there, comment, from, customer, there_comment..."
238,There are no comments from customers,"[there, comment, from, customer, there_comment..."
239,Please modify the mandate We are currently ren...,"[please, modify, mandate, currently, renewed, ..."
240,The application of portal has two explanations...,"[application, portal, explanation, every, mont..."


In [18]:
corpus1 = [dictionary.doc2bow(doc) for doc in docs1]
corpus2 = [dictionary.doc2bow(doc) for doc in docs2]

# Using the corpus LDA model tranformation
lda_corpus1 = model[corpus1]
lda_corpus2 = model[corpus2]

In [19]:
from collections import OrderedDict
def get_doc_topic_dist(model, corpus, kwords=False):
    
    '''
    LDA transformation, for each doc only returns topics with non-zero weight
    This function makes a matrix transformation of docs in the topic space.
    '''
    top_dist =[]
    keys = []

    for d in corpus:
        tmp = {i:0 for i in range(num_topics)}
        tmp.update(dict(model[d]))
        vals = list(OrderedDict(tmp).values())
        top_dist += [array(vals)]
        if kwords:
            keys += [array(vals).argmax()]

    return array(top_dist), keys



In [20]:
top_dist1, _ = get_doc_topic_dist(model, lda_corpus1)
top_dist2, _ = get_doc_topic_dist(model, lda_corpus2)

print("Intra similarity: cosine similarity for corresponding parts of a doc(higher is better):")
print(mean([cosine_similarity(c1.reshape(1, -1), c2.reshape(1, -1))[0][0] for c1,c2 in zip(top_dist1, top_dist2)]))

random_pairs = np.random.randint(0, len(p_df['nps_reason_eng']), size=(400, 2))

print("Inter similarity: cosine similarity between random parts (lower is better):")
print(np.mean([cosine_similarity(top_dist1[i[0]].reshape(1, -1), top_dist2[i[1]].reshape(1, -1)) for i in random_pairs]))

Intra similarity: cosine similarity for corresponding parts of a doc(higher is better):
0.8892632
Inter similarity: cosine similarity between random parts (lower is better):
0.788369


In [21]:
def explore_topic(lda_model, topic_number, topn, output=True):
    """
    accept a ldamodel, atopic number and topn vocabs of interest
    prints a formatted list of the topn terms
    """
    terms = []
    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        terms += [term]
        if output:
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))
    
    return terms

In [22]:
topic_summaries = []
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
for i in range(num_topics):
    print('Topic '+str(i)+' |---------------------\n')
    tmp = explore_topic(model,topic_number=i, topn=10, output=True )
#     print tmp[:5]
    topic_summaries += [tmp[:5]]
    print

term                 frequency

Topic 0 |---------------------

account_manager      0.201
with                 0.141
account              0.092
manager              0.076
that                 0.074
been                 0.069
reported             0.067
client               0.061
reported_that        0.050
request              0.041
Topic 1 |---------------------

been                 0.393
problem              0.271
client               0.175
time                 0.047
email                0.037
number               0.004
will                 0.003
sent                 0.003
account              0.003
circuit              0.003
Topic 2 |---------------------

product              0.184
service_product      0.168
that                 0.094
company              0.090
reported             0.086
reported_that        0.086
client               0.081
quality              0.071
circuit              0.066
with                 0.043
Topic 3 |---------------------

distinctive          0.242
com

In [23]:
top_labels = {0: 'one', 1:'two', 2:'three', 3:'four'}

In [24]:
import re
import nltk

from nltk.corpus import stopwords

stops = set(stopwords.words('english'))

def paper_to_wordlist( paper, remove_stopwords=True ):
    '''
        Function converts text to a sequence of words,
        Returns a list of words.
    '''
    lemmatizer = WordNetLemmatizer()
    # 1. Remove non-letters
    paper_text = re.sub("[^a-zA-Z]"," ", paper)
    # 2. Convert words to lower case and split them
    words = paper_text.lower().split()
    # 3. Remove stop words
    words = [w for w in words if not w in stops]
    # 4. Remove short words
    words = [t for t in words if len(t) > 2]
    # 5. lemmatizing
    words = [nltk.stem.WordNetLemmatizer().lemmatize(t) for t in words]

    return(words)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

tvectorizer = TfidfVectorizer(input='content', analyzer = 'word', lowercase=True, stop_words='english',\
                                  tokenizer=paper_to_wordlist, ngram_range=(1, 3), min_df=40, max_df=0.20,\
                                  norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True)

dtm = tvectorizer.fit_transform(p_df['nps_reason_eng']).toarray()



In [26]:
top_dist =[]
for d in corpus:
    tmp = {i:0 for i in range(num_topics)}
    tmp.update(dict(model[d]))
    vals = list(OrderedDict(tmp).values())
    top_dist += [array(vals)]

In [27]:
top_dist, lda_keys= get_doc_topic_dist(model, corpus, True)
features = tvectorizer.get_feature_names()

In [28]:
top_ws = []
for n in range(len(dtm)):
    inds = int0(argsort(dtm[n])[::-1][:4])
    tmp = [features[i] for i in inds]
    
    top_ws += [' '.join(tmp)]
    
p_df['Text_Rep'] = pd.DataFrame(top_ws)
p_df['clusters'] = pd.DataFrame(lda_keys)
p_df['clusters'].fillna(10, inplace=True)

#cluster_colors = {'Promotor': 'blue', 'Detractor': 'green', 2: 'yellow', 3: 'red', 4: 'skyblue', 5:'salmon', 6:'orange', 7:'maroon', 8:'crimson', 9:'black', 10:'gray'}

cluster_colors = {'Promotor': 'green', 'Detractor': 'red', 'Passive': 'yellow'}

p_df['colors'] = df['nps_cat'].apply(lambda l: cluster_colors[l])

In [29]:
p_df.groupby(['clusters','colors']).agg({'colors':'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,colors
clusters,colors,Unnamed: 2_level_1
0,green,3
0,red,5
1,green,1
1,red,4
1,yellow,2
2,green,8
2,yellow,2
3,green,5
3,red,1
3,yellow,3


In [30]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(top_dist)

In [31]:
p_df['X_tsne'] =X_tsne[:, 0]
p_df['Y_tsne'] =X_tsne[:, 1]

In [32]:
from bokeh.plotting import figure, show, output_notebook, save#, output_file
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
output_notebook()

In [37]:
source = ColumnDataSource(dict(
    x=p_df['X_tsne'],
    y=p_df['Y_tsne'],
    color=p_df['colors'],
    label=p_df['clusters'].apply(lambda l: top_labels[9]),
#     msize= p_df['marker_size'],
    topic_key= p_df['clusters'],
 #   title= p_df[u'Title'],
    content = p_df['Text_Rep']
))

KeyError: 9

In [None]:
title = 'T-SNE visualization of topics'

plot_lda = figure(plot_width=1000, plot_height=600
                   #  ,title=title
                  , tools="pan,wheel_zoom,box_zoom,reset,hover,save",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y', legend='label', source=source,
                 color='color', alpha=0.8, size=10)#'msize', )

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "Title: @title, KeyWords: @content - Topic: @topic_key "}
plot_lda.legend.location = "top_left"

show(plot_lda)

#save the plot
# save(plot_lda, '{}.html'.format(title))