In [31]:
import os
import re
import nltk
import jieba
import joblib
import string
import codecs
import unicodedata

import numpy as np
import pandas as pd
import plotly.express as px
#import matplotlib as mpl
#import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from itertools import chain
from functools import partial

from nltk.tag import pos_tag
from nltk.stem.snowball import SnowballStemmer

from sklearn.manifold import MDS
from sklearn.cluster import KMeans
from sklearn import feature_extraction
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.cluster.hierarchy import ward
from scipy.cluster.hierarchy import dendrogram

from gensim import corpora
from gensim import models
from gensim import similarities 

In [32]:
pd.set_option('display.max_rows', 10**4)

In [33]:
%%html
<style>
table {float:left}
</style>

In [34]:
random_state = 42

# Data

## load

In [35]:
path = './data'
# filename = 'comment.taptap.sgz2017-20210127-1-labeled.csv'
filename = 'comment.taptap-20210127-1.csv'
df = pd.read_csv(os.path.join(path, filename))

In [36]:
df = df[df.game=='率土之滨']

In [37]:
df.shape

(1816, 2)

In [38]:
df.head(1)

Unnamed: 0,game,text
9562,率土之滨,"1.s赛季节奏越来越快，氪金体验太好了，可惜氪不动了，慢慢跟不上现在高战氪金要求了,2.出卡..."


In [39]:
# df = df.head(1000)

In [40]:
texts = df.text

# Preprocess

In [41]:
def normalize(s):
    s = s.lower()
    s = unicodedata.normalize('NFKC', s)
    return s

In [42]:
texts = [normalize(text) for text in texts]

## single seq

In [43]:
def to_seq(s):
    return re.split(r'[,，.。?!]+', s)

In [44]:
texts = [to_seq(text) for text in texts]

In [45]:
texts = list(chain.from_iterable(texts))

## tokenize

In [46]:
%time texts = [jieba.lcut(text) for text in texts]

CPU times: user 1.12 s, sys: 28 ms, total: 1.14 s
Wall time: 1.14 s


## filter

## vocab

In [47]:
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=1, no_above=0.5)

In [48]:
corpus = [dictionary.doc2bow(text) for text in texts]
len(corpus)

19689

# Embedding

## n-dim

In [49]:
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.125, 
    max_features=200000,
    min_df=0.01, 
    # stop_words='english',
    use_idf=True, 
    # tokenizer=jieba.lcut, 
    # ngram_range=(1,3)
)

In [50]:
%time tfidf_matrix = tfidf_vectorizer.fit_transform(map(lambda t: ' '.join(t), texts))

CPU times: user 136 ms, sys: 0 ns, total: 136 ms
Wall time: 135 ms


In [51]:
print(tfidf_matrix.shape)

(19689, 26)


## n-dim distance

## 2-dim

# Clustering

In [52]:
num_clusters = 40

## k-means

In [53]:
# TODO: may be need normalize/scaling before clustering
km = KMeans(n_clusters=num_clusters, random_state=random_state)

In [54]:
%time km.fit(tfidf_matrix)

CPU times: user 10.1 s, sys: 408 ms, total: 10.5 s
Wall time: 1.4 s


KMeans(n_clusters=40, random_state=42)

In [55]:
len(texts)

19689

In [56]:
len(km.labels_)

19689

## latent dirichlet allocation

In [57]:
%time lda = models.LdaModel(corpus, \
                            num_topics=num_clusters, \
                            id2word=dictionary, \
                            update_every=5, \
                            chunksize=10000, \
                            passes=100, \
                            random_state=random_state)

print(lda[corpus[0]])

CPU times: user 10min 16s, sys: 584 ms, total: 10min 16s
Wall time: 10min 15s
[(0, 0.012500032), (1, 0.012500032), (2, 0.012500032), (3, 0.012500032), (4, 0.012500032), (5, 0.5124988), (6, 0.012500032), (7, 0.012500032), (8, 0.012500032), (9, 0.012500032), (10, 0.012500032), (11, 0.012500032), (12, 0.012500032), (13, 0.012500032), (14, 0.012500032), (15, 0.012500032), (16, 0.012500032), (17, 0.012500032), (18, 0.012500032), (19, 0.012500032), (20, 0.012500032), (21, 0.012500032), (22, 0.012500032), (23, 0.012500032), (24, 0.012500032), (25, 0.012500032), (26, 0.012500032), (27, 0.012500032), (28, 0.012500032), (29, 0.012500032), (30, 0.012500032), (31, 0.012500032), (32, 0.012500032), (33, 0.012500032), (34, 0.012500032), (35, 0.012500032), (36, 0.012500032), (37, 0.012500032), (38, 0.012500032), (39, 0.012500032)]


In [58]:
num_clusters

40

In [59]:
topics_matrix = lda.show_topics(formatted=False, num_words=20)
topics_matrix[:1]

[(28,
  [('没有', 0.15500683),
   ('什么', 0.100864895),
   ('都', 0.04893336),
   ('了', 0.043656353),
   ('一次', 0.024116391),
   ('一定', 0.0207657),
   ('刘备', 0.019426854),
   ('武将', 0.019101677),
   ('一个', 0.019079909),
   ('就', 0.018692683),
   ('连', 0.014489682),
   ('可是', 0.0144046275),
   ('送', 0.014333076),
   ('打开', 0.009671928),
   ('弄', 0.008652052),
   ('五', 0.008449307),
   ('吕蒙', 0.007013277),
   ('我', 0.0065737986),
   ('张', 0.00639404),
   ('弃游', 0.0060341824)])]

# Report

In [60]:
def cluster_topics(i, k, cluster_centers_, terms):
    order_centroids = cluster_centers_.argsort()[:, ::-1]
    return [terms[ind].encode().decode('utf-8', 'ignore') for ind in order_centroids[i, :k]]

In [61]:
def get_cluster_names(k, cluster_centers_, terms):
    return [','.join(cluster_topics(i, k, cluster_centers_, terms)) for i in range(len(cluster_centers_))]

## k-means

In [62]:
k = 10

In [63]:
clusters = km.labels_.tolist()
cluster_centers_ = km.cluster_centers_
terms = tfidf_vectorizer.get_feature_names()

cluster_names = get_cluster_names(k, cluster_centers_, terms)
cluster_names

['这个,还是,不是,五星,什么,但是,可以,因为,大佬,宝物',
 '游戏,就是,网易,真的,宝物,什么,系统,赛季,因为,率土',
 '玩家,宝物,就是,赛季,还是,战法,武将,游戏,不是,五星',
 '现在,赛季,游戏,宝物,就是,玩家,一个,这个,系统,真的',
 '没有,宝物,玩家,这个,赛季,还是,战法,就是,不是,五星',
 '赛季,游戏,玩家,宝物,没有,因为,五星,武将,什么,可以',
 '策划,玩家,就是,游戏,这个,系统,赛季,宝物,一个,真的',
 '就是,游戏,一个,玩家,赛季,宝物,这个,没有,不是,真的',
 '系统,宝物,这个,现在,游戏,真的,策划,一个,什么,率土',
 '率土,游戏,玩家,现在,还是,就是,一个,这个,宝物,可以',
 '但是,还是,游戏,现在,真的,宝物,这个,可以,玩家,赛季',
 '一个,游戏,策划,率土,真的,就是,可以,宝物,网易,玩家',
 '网易,游戏,策划,玩家,现在,一个,可以,这个,没有,宝物',
 '还是,策划,现在,玩家,网易,这个,可以,赛季,宝物,一个',
 '真的,游戏,策划,网易,这个,宝物,现在,玩家,一个,系统',
 '自己,游戏,玩家,就是,可以,还是,什么,现在,没有,赛季',
 '平民,玩家,大佬,一个,可以,宝物,游戏,真的,系统,赛季',
 '武将,宝物,可以,一个,没有,策划,就是,赛季,游戏,这个',
 '这个,游戏,真的,就是,玩家,可以,策划,率土,赛季,一个',
 '可以,游戏,玩家,赛季,一个,宝物,没有,现在,策划,这个',
 '宝物,游戏,可以,平民,大佬,没有,这个,什么,武将,玩家',
 '因为,游戏,这个,就是,宝物,率土,什么,大佬,玩家,没有',
 '战法,武将,五星,赛季,可以,就是,但是,什么,一个,游戏',
 '大佬,就是,赛季,一个,可以,没有,现在,但是,宝物,游戏',
 '什么,游戏,策划,玩家,网易,宝物,武将,可以,但是,现在',
 '一个,宝物,不是,大佬,就是,赛季,平民,但是,可以,武将',
 '不是,游戏,一个,玩家,率土,这个,宝物,大佬,但是,就是',
 '五星,武将,游戏,就是,真的,现在,一个,赛季,没有,玩家',
 '系统,一个,赛季,玩家,游戏,平民,不是,五星,什么,但是',
 '现在,没有,策划,玩家,

## latent dirichlet allocation

In [64]:
clusters = list(map(lambda x: sorted(x, key=lambda e: e[1], reverse=True)[0][0], lda[corpus]))

In [65]:
terms = list(set([e[0] for c in topics_matrix for e in c[1]]))
values = [{e[0]:e[1] for e in c[1]} for c in topics_matrix]
cluster_centers_ = np.array([[values[i].get(k,0.) for k in terms] for i in range(len(values))])

In [66]:
cluster_names = get_cluster_names(k, cluster_centers_, terms)
cluster_names

['没有,什么,都,了,一次,一定,刘备,武将,一个,就',
 '啊,了,太,核心,平衡,逼,点,影响,操作,失望',
 '开始,的,从,了,玩法,韭菜,以后,东西,评分,一',
 ' ,了,的,我,都,就,然后,还,网易,是',
 '要,肝,又,时间,抽卡,在,恶心,花,但是,你',
 '赛季,征服,到,了,大,的,用,一个,也,越来越',
 '—,�,把玩,家当,开服,:,哪个,分割线,奥,计算',
 '队伍,一点,这次,的,时,了,劝退,长,不好,有',
 '不,知道,了,也,充钱,可能,都,不想,对,我',
 '我,让,的,希望,是,了,你,别,才,下来']