In [1]:
from utils import Dataset, Preprocessing
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import gensim
from gensim import corpora
from wordcloud import WordCloud
import matplotlib.colors as mcolors
from nltk.corpus import stopwords

dutch_stop_words = stopwords.words('dutch')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zaheerbabar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data_obj = Dataset() 

# get labelled data
train_df = data_obj.get_labelled_data()
process_obj = Preprocessing()

# preprocessing includes, text cleaning and label encoding
cleaned_train_df = process_obj.get_preprocessd_data(train_df, text_column="phrase")

# we need tokenized docs for GSDMM
tokenized_train_docs = process_obj.get_tokenized(cleaned_train_df)

train_vocab = set(x for doc in tokenized_train_docs for x in doc)

print(tokenized_train_docs)
# get labelled data
test_df = data_obj.get_unlabelled_data()
process_obj = Preprocessing()

cleaned_test_df = process_obj.get_preprocessd_data(test_df, text_column="user_msg", has_labels = False)

# we need tokenized docs for GSDMM
tokenized_test_docs = process_obj.get_tokenized(cleaned_test_df)

test_vocab = set(x for doc in tokenized_test_docs for x in doc)

[['iphone'], ['iphone', 'beschikbaar'], ['komen', 'iphone'], ['samsung', 'verkopen'], ['google', 'pixel'], ['iphone', 'xs', 'beschikbaar'], ['toestel', 'assortiment'], ['iphone', 'beschikbaar'], ['iphone', 'beschikbaar'], ['telefoon', 'aanbieden'], ['iphone', 'bestellen'], ['verkoop', 'samsung', 'note'], ['iphone', 'plus', 'beschikbaar'], ['verkopen', 'samsung', 'galaxy', 's20'], ['iphone', 'bestellen'], ['telefoon', 'bieden'], ['telefoon', 'beschikbaar'], ['verkopen', 'iphone'], ['iphone', 'beschikbaarheid'], ['verkoop', 'nieuw', 'toestel'], ['samsung', 'galaxy', 'amsung', 's20', 'assortiment'], ['toestel', 'beschikbaar'], ['samsung', 's20', 'beschikbaar'], ['iphone', 'beschikbaar'], ['toestel'], ['nieuw', 'telefoon', 'beschikbaar'], ['iphone', 'mini', 'beschikbaar'], ['iphone', 'verkopen'], ['beschikbaarheid', 'toestel'], ['beschikbaarheid', 'iphone', 'se'], ['nieuw', 'pixel', 'koop'], ['iphone', 'se', 'beschikbaar'], ['iphone', 'xs', 'beschikbaar'], ['iphonexs', 'beschikbaar'], ['ve

#### GSDMM

In [3]:
from gsdmm import MovieGroupProcess

In [4]:
# Initialize and fit GSDMM model on labeled docs to check how it works as compared to the previous annotations. Let's start a lower number let's say 8 or 15 clusters

mgp = MovieGroupProcess(K=14, alpha=0.1, beta=0.1, n_iters=30)
y = mgp.fit(tokenized_train_docs, vocab_size=len(train_vocab))

In stage 0: transferred 1319 clusters with 14 clusters populated
In stage 1: transferred 882 clusters with 14 clusters populated
In stage 2: transferred 602 clusters with 14 clusters populated
In stage 3: transferred 438 clusters with 14 clusters populated
In stage 4: transferred 405 clusters with 14 clusters populated
In stage 5: transferred 377 clusters with 14 clusters populated
In stage 6: transferred 355 clusters with 14 clusters populated
In stage 7: transferred 314 clusters with 14 clusters populated
In stage 8: transferred 301 clusters with 14 clusters populated
In stage 9: transferred 312 clusters with 14 clusters populated
In stage 10: transferred 292 clusters with 14 clusters populated
In stage 11: transferred 275 clusters with 14 clusters populated
In stage 12: transferred 273 clusters with 14 clusters populated
In stage 13: transferred 276 clusters with 14 clusters populated
In stage 14: transferred 290 clusters with 14 clusters populated
In stage 15: transferred 262 clust

In [5]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic: {}'.format(doc_count))


print('Number of topics: {}'.format(len(np.unique(y))))

Number of documents per topic: [134 226  36  92  40 206  80 132 100 130 171  83  95   0]
Number of topics: 13


In [6]:
top_index = doc_count.argsort()[-10:][::-1]
for i, topic_num in enumerate(top_index):
    print('\nTopic {}:'.format(i))
    for word in mgp.cluster_word_distribution[topic_num]:
        print(word)


Topic 0:
telefoon
abonnemenen
g
Mobiel
abonnement
maandelijks
opzeggen
opzegbaar
verlagen
veranderen
contract
aanpassen
verlengen
vernieuw
huidig
ready
abbo
abbonoment
upgraden
datum
kunnen
abbonement
wijzig
nieuw
staan
unlimited
zitten
opzegg
sim
only
verlenging
tussentijds
verlopen
maand
dochter
kinder
databundel
straks
lopen
abo
toestelbetaling
data
willen
aanpass
downgrad
samsung
afbetalen
aankoopsbewijs
toevoeg
nodig
toegang
mn
krijgen
service
simonly
thuis
contractduur
zeggen
abonement
zoeken
opzegbaa
doorlopen
abonememt
eenmalig
mobiel
zakelijk
tv
abonnoment
internet
tijdelijk
besteling
zojuist
bestlen
Go
voortijdig
hoog
verhogen
ongedaan
maken
wijzigen
internetsnelheid
zaterdag
laten
upgrade
factuurmailing
aflopen
renew
optie
vernieuwen
hulp

Topic 1:
heel
bellen
Mobiel
internet
langzaam
g
traag
verbinding
thuis
werken
echt
goed
telefoon
netwerk
gaan
snelheid
tmobile
informatie
internetverbinding
moballen
last
onbeperken
Location
snel
info
weten
blijven
mobiel
factuur
abonneme

In [7]:
mgp = MovieGroupProcess(K=30, alpha=0.1, beta=0.1, n_iters=30)
y = mgp.fit(tokenized_test_docs, vocab_size=len(test_vocab))

doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic: {}'.format(doc_count))

top_index = doc_count.argsort()[-10:][::-1]


In stage 0: transferred 3527 clusters with 30 clusters populated
In stage 1: transferred 2661 clusters with 30 clusters populated
In stage 2: transferred 1865 clusters with 30 clusters populated
In stage 3: transferred 1383 clusters with 30 clusters populated
In stage 4: transferred 1156 clusters with 28 clusters populated
In stage 5: transferred 997 clusters with 26 clusters populated
In stage 6: transferred 941 clusters with 25 clusters populated
In stage 7: transferred 892 clusters with 24 clusters populated
In stage 8: transferred 876 clusters with 23 clusters populated
In stage 9: transferred 866 clusters with 23 clusters populated
In stage 10: transferred 817 clusters with 22 clusters populated
In stage 11: transferred 831 clusters with 21 clusters populated
In stage 12: transferred 816 clusters with 22 clusters populated
In stage 13: transferred 798 clusters with 22 clusters populated
In stage 14: transferred 780 clusters with 21 clusters populated
In stage 15: transferred 747 c

In [8]:
for i, topic_num in enumerate(top_index):
    print('\nTopic {}:'.format(i))
    for word in mgp.cluster_word_distribution[topic_num]:
        print(word)


Topic 0:
g
thuis
locatie
internet
krijgen
extra
graag
abonnement
veranderen
abonnemenen
wijzig
abbonement
aanpassen
Hallo
activeren
werken
wifi
buitenrouter
toestel
beschikbaar
nieuw
hoi
weten
verleng
zien
zetten
bestellen
vraag
snel
k
router


klacht
installatie
buitenunit
goedemorgen
zitten
adres
abo
klantvoordeel
tv
vast
zakelijk
gebruiken
instelling
  
kosten
Mobiel
aansluiten
mijn
informatie
telkens
monteur
unlimited
kapot
meenemen
verhuizen
gaan
staan
gmail
afsluiten
goedendag
t
mobile
inzien
bundel
zet
abbonnement
abonement
goed
tussentijds
nergens
pakket
gb
overstappen
televisie
qr
code
contract
geldig
langzaam
abonament
maandelijks
prijs
lopen
vragen
onbeperken
wijzegen
toevoeg
internetsnelheid
goedemiddag
omzetten
kost
wijziging
data
opbellen
I
doen
helpen
telefoon
vallen
betalen
account
laten
abbo
verbinden
plus
buitengebied


 
mbs
leeg
morgen
naam
simkaart
gran
buitwn
gaat
binnenkomen
maal
wachtwoord
verkeerd
invoeren
bedoelen
Mijm
terug
berijk
zenderpakket
batterij
vul
v

#### LDA
Now lets try LDA itself

In [9]:
Lda = gensim.models.ldamodel.LdaModel

def extract_topic(corpus, num_topics=10, n_words=10):
    
    clean_corpus=corpus.str.split(' ').tolist()
    
    dict_ = corpora.Dictionary(clean_corpus)
    
    doc_term_matrix = [dict_.doc2bow(i) for i in clean_corpus]

    ldamodel = Lda(doc_term_matrix, num_topics=num_topics, id2word = dict_, passes=1, random_state=0, eval_every=None,iterations=100)

    return ldamodel,ldamodel.print_topics(num_topics=num_topics, num_words=n_words),ldamodel.show_topics(formatted=False),doc_term_matrix

def cloud_generate(topics,size=5):
    
    cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

    cloud = WordCloud(stopwords=dutch_stop_words,
                      background_color='white',
                      width=2500,
                      height=1800,
                      max_words=10,
                      colormap='tab10',
                      color_func=lambda *args, **kwargs: cols[i],
                      prefer_horizontal=1.0)

    fig, axes = plt.subplots(2,size, figsize=(10,10), sharex=True, sharey=True)

    for i, ax in enumerate(axes.flatten()):
        fig.add_subplot(ax)
        topic_words = dict(topics[i][1])
        cloud.generate_from_frequencies(topic_words, max_font_size=300)
        plt.gca().imshow(cloud)
        plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
        plt.gca().axis('off')


    plt.subplots_adjust(wspace=0, hspace=0)
    plt.axis('off')
    plt.margins(x=0, y=0)
    plt.tight_layout()
    plt.show()


In [10]:
ldamodel,show_topics,topics, corpus=extract_topic(cleaned_test_df['cleaned_text'],25,10)

In [11]:
print(show_topics)

[(0, '0.137*"abonnemenen" + 0.120*"wijzig" + 0.041*"aanpassen" + 0.032*"staan" + 0.026*"factuur" + 0.025*"internet" + 0.022*"laat" + 0.021*"activeren" + 0.019*"abonnement" + 0.018*"app"'), (1, '0.047*"graag" + 0.043*"betalen" + 0.042*"internet" + 0.041*"thuis" + 0.037*"abonnemenen" + 0.034*"wifi" + 0.032*"ideal" + 0.028*"g" + 0.024*"factuur" + 0.024*"link"'), (2, '0.254*"g" + 0.167*"thuis" + 0.025*"nummer" + 0.024*"abonnemenen" + 0.017*"internet" + 0.015*"locatie" + 0.015*"vraag" + 0.011*"krijgen" + 0.011*"t" + 0.011*"extra"'), (3, '0.082*"activeren" + 0.037*"abonnement" + 0.034*"krijgen" + 0.031*"gb" + 0.022*"sms" + 0.020*"geven" + 0.018*"netwerk" + 0.017*"per" + 0.016*"abonnemenen" + 0.016*"beschikbaar"'), (4, '0.075*"factuur" + 0.069*"ideal" + 0.043*"link" + 0.043*"betalen" + 0.042*"abonnemenen" + 0.035*"verbinding" + 0.035*"sturen" + 0.030*"wijzig" + 0.023*"via" + 0.023*"nieuw"'), (5, '0.048*"nummer" + 0.041*"laat" + 0.036*"ideal" + 0.035*"account" + 0.033*"bellen" + 0.032*"rekenin

In [12]:
topics

[(13,
  [('factuur', 0.06294772),
   ('afbetalen', 0.04414381),
   ('laat', 0.039929796),
   ('activeren', 0.038840424),
   ('deezer', 0.029068412),
   ('\n', 0.028835662),
   ('toestelkrediet', 0.028588157),
   ('graag', 0.01982737),
   ('extra', 0.017631622),
   ('keer', 0.017594466)]),
 (11,
  [('internet', 0.09530639),
   ('locatie', 0.08968945),
   ('bundel', 0.050451055),
   ('toestel', 0.040639922),
   ('mb', 0.022601888),
   ('verhogen', 0.019520653),
   ('iphone', 0.019510698),
   ('afbetelen', 0.019173242),
   ('aflossen', 0.017753042),
   ('gebruiken', 0.016827421)]),
 (4,
  [('factuur', 0.07514506),
   ('ideal', 0.06940462),
   ('link', 0.042815235),
   ('betalen', 0.04280702),
   ('abonnemenen', 0.04245677),
   ('verbinding', 0.03547632),
   ('sturen', 0.035296768),
   ('wijzig', 0.029542074),
   ('via', 0.023364438),
   ('nieuw', 0.022786384)]),
 (12,
  [('factuur', 0.0717208),
   ('netwerk', 0.04756687),
   ('abonnement', 0.02862928),
   ('via', 0.028044993),
   ('krijge