In [13]:
import numpy as np 
import pandas as pd

import sys

from gsdmm import MovieGroupProcess

from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora, models
from gensim.utils import simple_preprocess
import gensim, spacy
from gensim.models.ldamulticore import LdaMulticore
import re


from gensim.models import Phrases
from gensim.models.phrases import Phraser

In [16]:
df = pd.read_csv('preprocessed_data.csv')

In [17]:
df['text'] = df['content'].apply(lambda x:eval(x))

In [19]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
def make_n_grams(texts):
    bigram = gensim.models.Phrases(texts, min_count=5, threshold=100)  # higher threshold fewer phrases.
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram = gensim.models.Phrases(bigram[texts], threshold=100)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    bigrams_text = [bigram_mod[doc] for doc in texts]
    trigrams_text =  [trigram_mod[bigram_mod[doc]] for doc in bigrams_text]
    return trigrams_text

In [24]:
tokens = df.text.tolist()

In [25]:
tokens = make_n_grams(tokens)

In [26]:
tokens

[['คลั่งรัก', 'แฟน', 'ตัว', 'ร๊าก', 'นะ', 'เมีย', 'ข่อย'],
 ['แม่', 'สุขภาพ', 'แข็ง', 'รํ่า', 'รวย', 'เงินทอง'],
 ['คะแนน', 'ดี', 'ดี', 'ชีวิต', 'ออฟ', 'แฟน'],
 ['ง่วงนอน'],
 ['สนุกสนาน_ลอยกระทงเอย'],
 ['ติดตาม', 'สอบ', 'ติด', 'ประจักษ์'],
 ['ลุงตู่', 'พ่อแม่', 'รัก', 'สุข', 'รว', 'ชาติ'],
 ['หนาว', 'ขี้เกียจ', 'อาบ', 'น้ำ', 'อากาศ_ร้อน', 'ดี', 'คน', 'เพี้ยน'],
 ['ร้อน', 'ตอน', 'อากาศ_ร้อน'],
 ['ผู้', 'คลิป', 'ฟีด', 'สาธุ', '10', '0K'],
 ['คุ้ย', 'เสื้อผ้า', 'งด', 'คุ้ย', 'ตู้', 'เย็น', 'เอิ้ก_ๆๆ'],
 ['ทำ', 'สะอาด', 'เสื้อผ้า', 'ระเบียบ', 'อาทิตย์', 'ก้ลอย', 'กระทง'],
 ['สิง', 'อนาก'],
 ['เเฟน', 'ดาราเกาหลี'],
 ['แฟน', 'ยย', 'สาธุ'],
 ['ลุงลา', 'ตาย'],
 ['ศิลปิน',
  'ระดับ',
  'โลก',
  'ภาษา_อังกฤษ',
  'เกาหลีไทย',
  'จีน',
  'ญี่ปุ่น',
  'โรค',
  'ภัย',
  'คน',
  'ครอบ',
  'คนัว',
  'สุข'],
 ['โอ๊ย', 'กิน', 'ข้าว', 'เฟส', 'กิน', 'ข้าวเนตล่ม', 'เอิ้ก_ๆๆ'],
 ['คน', 'สุข', 'แข็งแรง'],
 ['ร่ำรวย', 'เงิน', 'ถุง', 'เงิน', 'ถัง', 'ภาษา_อังกฤษ'],
 ['ร่ำรวย', 'ติดตาม'],
 ['ไหล',
  'เหมือน',
  

In [27]:
np.random.seed(0)

In [30]:
mgp = MovieGroupProcess(K=6, alpha=0.01, beta=0.01, n_iters=30)

vocab = set(x for review in tokens for x in review)
n_terms = len(vocab)
model = mgp.fit(tokens, n_terms)

In stage 0: transferred 354600 clusters with 6 clusters populated
In stage 1: transferred 295413 clusters with 6 clusters populated
In stage 2: transferred 181775 clusters with 6 clusters populated
In stage 3: transferred 124162 clusters with 6 clusters populated
In stage 4: transferred 102752 clusters with 6 clusters populated
In stage 5: transferred 94124 clusters with 6 clusters populated
In stage 6: transferred 90515 clusters with 6 clusters populated
In stage 7: transferred 87512 clusters with 6 clusters populated
In stage 8: transferred 86592 clusters with 6 clusters populated
In stage 9: transferred 85553 clusters with 6 clusters populated
In stage 10: transferred 84774 clusters with 6 clusters populated
In stage 11: transferred 84533 clusters with 6 clusters populated
In stage 12: transferred 84216 clusters with 6 clusters populated
In stage 13: transferred 83544 clusters with 6 clusters populated
In stage 14: transferred 82780 clusters with 6 clusters populated
In stage 15: tr

In [31]:
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts =sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster,sort_dicts))

In [32]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-10:][::-1]
print('\nMost important clusters (by number of docs inside):', top_index)
# show the top 5 words in term frequency for each cluster 
top_words(mgp.cluster_word_distribution, top_index, 10)

Number of documents per topic : [ 56230  49832  51773 131626 109186  45242]

Most important clusters (by number of docs inside): [3 4 0 2 1 5]

Cluster 3 : [('ดี', 92194), ('สุข', 66692), ('คน', 56847), ('รัก', 44261), ('เจอ', 38397), ('สุขภาพ', 35247), ('แข็งแรง', 30905), ('ครอบครัว', 26548), ('ชีวิต', 26520), ('สาธุ', 16861)]

Cluster 4 : [('รัก', 25978), ('คน', 24107), ('รวย', 20222), ('ดี', 19718), ('สุข', 18357), ('แฟน', 11623), ('สาธุ', 11327), ('ปี', 8274), ('เงิน', 8101), ('ชอบ', 7242)]

Cluster 0 : [('แข็งแรง', 28142), ('สุขภาพ', 28108), ('ภัย', 24359), ('ดี', 22931), ('โรค', 22832), ('สุข', 20584), ('กาย', 19271), ('ใจ', 17541), ('ไร้', 16641), ('โศก', 14529)]

Cluster 2 : [('\u200b', 22299), ('ดี', 20202), ('สอบ', 18409), ('รัก', 15589), ('โชค', 14000), ('เรียน', 11821), ('เรื่อง', 11482), ('สุข', 11289), ('ทำ', 11121), ('การงาน', 10788)]

Cluster 1 : [('เงิน', 83793), ('ร่ำรวย', 43408), ('ถัง', 38169), ('ถุง', 37680), ('ล่ำซำ', 33970), ('ขัดสน', 27060), ('ดี', 15259), ('สุข

In [33]:
# I don`t rename the clusters

topic_dict = {}
topic_names = ['type 1',
               'type 2',
               'type 3',
               'type 4',
               'type 5',
               'type 6',
              ]
for i, topic_num in enumerate(top_index):
    topic_dict[topic_num]=topic_names[i]