In [1]:
import os
import re
import nltk
import jieba
import joblib
import string
import codecs
import numpy as np
import pandas as pd
import plotly.express as px
#import matplotlib as mpl
#import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from itertools import chain
from functools import partial

from nltk.tag import pos_tag
from nltk.stem.snowball import SnowballStemmer

from sklearn.manifold import MDS
from sklearn.cluster import KMeans
from sklearn import feature_extraction
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.cluster.hierarchy import ward
from scipy.cluster.hierarchy import dendrogram

from gensim import corpora
from gensim import models
from gensim import similarities 

In [43]:
# Raise the pandas row-display cap to 10,000 so frames up to that length are
# shown without truncation.
pd.set_option('display.max_rows', 10**4)

In [3]:
%%html
<!-- Float rendered tables to the left of the output area. -->
<style>
table {float:left}
</style>

In [4]:
# Seed for the stochastic steps below; it is passed to KMeans as random_state.
random_state = 42

# Data

## load

In [5]:
# Directory holding the CSV exports and the single file analysed in this run.
path = '~/data/yk-sgz2017-chat/data-2-20201223'
# Alternative input files, kept for quick switching:
# filename = '2020_09_25.csv'
# filename = '2020_09_24.csv'
filename = '2020_08_20.csv'
# The first CSV column is used as the DataFrame index.
df = pd.read_csv(os.path.join(path, filename), index_col=0)

In [6]:
# Sanity check: number of rows and columns that were loaded.
df.shape

(111636, 11)

In [7]:
# Peek at a single row to see the available columns; `content` is the text used below.
df.head(1)

Unnamed: 0,gameid,userid,msec,serverid,roleid,pid,rolename,channel,content,timestamp,server
0,45,Tencent_54ACA9B9A0E3B75115F82191032136D0,1597852282,1,282303914329157,19,红軍丶柒柒,private_282303914329157_282303908677278,{localization:589-168},2020-08-20 00:00:02,sgz-mix-youxi-0033


In [8]:
# df = df.head(1000)

## tokenize

In [9]:
%time texts = [jieba.lcut(text) for text in df.content]

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.681 seconds.
Prefix dict has been built successfully.


CPU times: user 8.66 s, sys: 184 ms, total: 8.84 s
Wall time: 8.84 s


## filter

In [10]:
def get_stopwords(path):
    """Collect stopwords from every regular file directly inside ``path``.

    Each file is read as UTF-8 text (a leading BOM is tolerated) with one
    stopword per line; surrounding whitespace is stripped from every line.

    Parameters
    ----------
    path : str
        Directory containing the stopword list files.

    Returns
    -------
    list of str
        All stripped lines, file by file in sorted file-name order.
        Duplicates and blank lines are kept as they appear in the files.
    """
    stopwords = []
    # Sorted so the result does not depend on the OS directory order.
    for filename in sorted(os.listdir(path)):
        filepath = os.path.join(path, filename)
        # Sub-directories and other non-file entries cannot be opened as text.
        if not os.path.isfile(filepath):
            continue
        # Explicit encoding: the lists contain non-ASCII words, so relying on
        # the locale default would make the result machine-dependent.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            stopwords.extend(line.strip() for line in f)
    return stopwords

In [11]:
# Load all stopword lists, then show how many entries were read and the first ten.
# NOTE(review): absolute, user-specific path — this cell only runs on a machine
# that has this directory layout.
stopwords = get_stopwords('/home/wangyh/project/document_cluster/dicts/')
print(len(stopwords))
stopwords[:10]

3885


['--', '?', '“', '”', '》', '－－', 'able', 'about', 'above', 'according']

In [12]:
%time texts = [[word for word in text if word not in stopwords] for text in texts]
print(len(texts[0]))

CPU times: user 39.4 s, sys: 84 ms, total: 39.5 s
Wall time: 39.5 s
4


In [13]:
%time texts = [[word for word in text if 1 < len(word)] for text in texts]
print(len(texts[0]))

CPU times: user 212 ms, sys: 8 ms, total: 220 ms
Wall time: 217 ms
3


## vocab

In [14]:
# Build the gensim vocabulary from the filtered token lists.
dictionary = corpora.Dictionary(texts)
# Drop tokens that occur in more than 80% of the documents; no_below=1 keeps
# every rare token.
# NOTE(review): filter_extremes also applies its default keep_n cap on the
# vocabulary size — confirm that is intended.
dictionary.filter_extremes(no_below=1, no_above=0.8)

In [15]:
# Convert each token list into gensim's bag-of-words representation.
# NOTE(review): neither `dictionary` nor `corpus` is used by the later cells
# visible in this notebook.
corpus = [dictionary.doc2bow(text) for text in texts]
len(corpus)

111636

# Preprocess

# Embedding

## n-dim

In [16]:
# TF-IDF vectorizer applied to the space-joined token lists.
# The tokenizer is left at scikit-learn's default (the jieba option below is
# commented out), so the joined text is split again by the default token pattern.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.8,   # ignore terms present in more than 80% of the documents
    max_features=200000,  # upper bound on the vocabulary size
    min_df=0.01,   # ignore terms present in fewer than 1% of the documents
    # stop_words='english',
    use_idf=True, 
    # tokenizer=jieba.lcut, 
    ngram_range=(1,3)  # unigrams, bigrams and trigrams
)

In [17]:
%time tfidf_matrix = tfidf_vectorizer.fit_transform(map(lambda t: ' '.join(t), texts))

CPU times: user 1.38 s, sys: 32 ms, total: 1.41 s
Wall time: 1.41 s


In [18]:
# (number of documents, number of TF-IDF features kept by the vectorizer).
print(tfidf_matrix.shape)

(111636, 130)


## n-dim distance

## 2-dim

# Clustering

In [19]:
# Number of clusters requested from k-means and iterated over by the report.
num_clusters = 10

## k-means

In [20]:
# TODO: check whether extra normalization/scaling is needed before clustering.
# (TfidfVectorizer's default norm='l2' already scales every non-empty row to
# unit length.)
# n_init is pinned explicitly: its default is no longer 10 in newer
# scikit-learn releases, which would silently change the clustering result.
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_state)

In [21]:
# Fit k-means on the sparse TF-IDF matrix and report how long it takes.
%time km.fit(tfidf_matrix)

CPU times: user 16.3 s, sys: 444 ms, total: 16.7 s
Wall time: 2.24 s


KMeans(n_clusters=10, random_state=42)

## latent dirichlet allocation

# Report

In [22]:
def cluster_topics(i, k, cluster_centers_, terms):
    """Return the ``k`` highest-weighted terms of cluster ``i``.

    Parameters
    ----------
    i : int
        Row of ``cluster_centers_`` (cluster id) to describe.
    k : int
        Number of terms to return.
    cluster_centers_ : array-like, shape (n_clusters, n_terms)
        Centroid weights, one row per cluster and one column per term.
    terms : sequence of str
        Term for each column of ``cluster_centers_``.

    Returns
    -------
    list of str
        Terms of cluster ``i`` ordered by decreasing centroid weight.
    """
    # Only row i is needed, so rank that row alone instead of argsorting the
    # whole centroid matrix on every call.
    top_indices = np.asarray(cluster_centers_)[i].argsort()[::-1][:k]
    # Terms are already text; an encode/decode round-trip would add nothing
    # and could raise on unencodable characters.
    return [str(terms[ind]) for ind in top_indices]

In [23]:
def get_cluster_names(k, cluster_centers_, terms):
    """Build one label per cluster from its ``k`` top terms.

    Returns a list with one string per row of ``cluster_centers_``; each string
    is that cluster's top terms joined by commas.
    """
    names = []
    for cluster_id in range(len(cluster_centers_)):
        top_terms = cluster_topics(cluster_id, k, cluster_centers_, terms)
        names.append(','.join(top_terms))
    return names

In [24]:
def cluster_info(i, df, info):
    """Return the values of column ``info`` for the rows assigned to cluster ``i``.

    ``df`` must contain a ``cluster`` column; the result is a plain Python list
    in row order.
    """
    in_cluster = df['cluster'] == i
    selected = df.loc[in_cluster, info]
    return selected.values.tolist()

In [25]:
def pprint(num_clusters, cluster_centers_, terms, df, k):
    """Print the top terms and the first messages of every cluster.

    For each cluster id in ``range(num_clusters)`` this prints the ``k``
    highest-weighted terms on one line, followed by the first ``k`` values of
    the ``content`` column for the rows of ``df`` assigned to that cluster.
    ``df`` must already contain a ``cluster`` column.
    """
    print("Top terms per cluster:")
    print()

    for cluster_id in range(num_clusters):
        # Top terms on a single line, each one followed by a comma.
        top_terms = cluster_topics(cluster_id, k, cluster_centers_, terms)
        print("Cluster %d words:" % cluster_id, end='')
        print(''.join('%s,' % token for token in top_terms))

        # First k messages of the cluster, numbered from 0.
        print("cluster %d content:" % cluster_id)
        samples = cluster_info(cluster_id, df, 'content')[:k]
        for position, title in enumerate(samples):
            print('[%s] %s' % (position, title))

        print()
        print()

In [26]:
def report_pprint(df, clusters, terms, cluster_centers_, num_clusters, k):
    """Print a text report for one clustering result.

    Parameters
    ----------
    df : pd.DataFrame
        One row per clustered document; must have a ``content`` column.
        A ``cluster`` column is written into it (the frame is modified).
    clusters : list
        Cluster id of every row of ``df``, in row order.
    terms : sequence of str
        Feature name for each column of ``cluster_centers_``.
    cluster_centers_ : np.array, shape=(n_clusters, n_terms)
        Centroid weights.
    num_clusters : int
        Number of clusters to report on.
    k : int
        Number of top terms and of sample messages printed per cluster.
    """
    # Attach the labels, then show how many documents each cluster received.
    df['cluster'] = clusters
    cluster_sizes = df['cluster'].value_counts()
    print(cluster_sizes)

    # Vocabulary size and centroid matrix shape, as a consistency check.
    n_terms = len(terms)
    print(n_terms)
    print(cluster_centers_.shape)

    pprint(
        num_clusters=num_clusters,
        cluster_centers_=cluster_centers_,
        terms=terms,
        df=df,
        k=k,
    )

In [27]:
def visualizing(df_cluster):
    """Show a scatter plot of the clustered documents.

    ``df_cluster`` needs the columns ``x`` and ``y`` (coordinates), ``label``
    (used for the point colour) and ``content`` (shown on hover).
    """
    # Possible tweak, currently disabled: cast the labels to str first, e.g.
    # df_cluster.astype({'label': 'int'}).astype({'label': 'str'})
    scatter = px.scatter(
        data_frame=df_cluster,
        x='x',
        y='y',
        color='label',
        hover_data=['content'],
    )
    scatter.show()

In [28]:
def report_visualizing(xs, ys, clusters, cluster_names, df):
    """Plot the documents in 2-D, coloured by cluster name.

    Parameters
    ----------
    xs, ys : sequence of float
        2-D coordinates of every document, in the row order of ``df``.
    clusters : sequence of int
        Cluster id of every document, in the row order of ``df``.
    cluster_names : sequence of str
        Display name for each cluster id.
    df : pd.DataFrame
        Source frame with a ``content`` column; the first five characters of
        each message are used as hover text.
    """
    df_cluster = pd.DataFrame(dict(x=xs, y=ys, label=[cluster_names[c] for c in clusters]))
    # Assign by position, not by index label: df_cluster is built from plain
    # sequences and gets a fresh 0..n-1 index, while df keeps whatever index it
    # was loaded or filtered with. Assigning the Series itself would align on
    # labels and silently misplace or blank out the hover text.
    df_cluster['content'] = df['content'].apply(lambda x: x[:5]).values

    visualizing(df_cluster)

In [29]:
def report(df, clusters, terms, cluster_centers_, num_clusters, xs, ys, cluster_names, k):
    """Produce the full report for one clustering result: text first, then the plot."""
    report_pprint(
        df=df,
        clusters=clusters,
        terms=terms,
        cluster_centers_=cluster_centers_,
        num_clusters=num_clusters,
        k=k,
    )
    report_visualizing(
        xs=xs,
        ys=ys,
        clusters=clusters,
        cluster_names=cluster_names,
        df=df,
    )

In [30]:
# Number of top terms used per cluster name, and of sample messages printed
# per cluster in the report.
k = 5

## k-means

In [31]:
# Collect the fitted k-means results in the form the reporting helpers expect.
clusters = km.labels_.tolist()
cluster_centers_ = km.cluster_centers_
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and convert its array to a
# plain list of str; fall back to the old method on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()
cluster_names = get_cluster_names(k, cluster_centers_, terms)
cluster_names

['兄弟,battle,蜀国,游戏,有意',
 '2222,魏国,内测 老人 带队,利用 作战 兄弟,利用 作战',
 'localization,兄弟,功勋,繁荣,蜀国',
 '轮子,轮子 女马,女马,轮子 女马 全家,女马 全家',
 '1111111,作战 兄弟 加微信,兄弟 加微信,兄弟 加微信 svip12126,全家',
 '轮子 女马 一条,女马 一条,一条,女马 一条 渣种,渣种',
 '更好,发展 资源 更好,攻略 内测 老人,群里 攻略,更好 利用',
 '2222222,魏国,内测 老人 带队,利用 作战 兄弟,利用 作战',
 '资源,加微信,蜀国,功勋,魏国',
 '高迁,功勋,私聊,吴国 高迁,吴国']

In [32]:
# Print cluster sizes, top terms and sample messages for the k-means result.
# This call also writes the `cluster` column into df.
report_pprint(df, clusters, terms, cluster_centers_, num_clusters, k)

0    76802
2     7689
3     4656
8     4619
7     4454
9     4201
4     3505
5     2557
1     1633
6     1520
Name: cluster, dtype: int64
130
(10, 130)
Top terms per cluster:

Cluster 0 words:兄弟,battle,蜀国,游戏,有意,
cluster 0 content:
[0] 哈哈哈，看你这气急败坏的样子我就很舒坦
[1] 真tm舔够 jian 人都不配做
[2] 嘻嘻，你可保持好心态，别气的死太早呀
[3] 还有24小时 各位施主
[4] 御驾，早死早超生，放心的去吧


Cluster 1 words:2222,魏国,内测 老人 带队,利用 作战 兄弟,利用 作战,
cluster 1 content:
[0] 2222
[1] 2222
[2] 2222
[3] 2222
[4] 2222


Cluster 2 words:localization,兄弟,功勋,繁荣,蜀国,
cluster 2 content:
[0] {localization:589-168}
[1] 施主 再偷一次？苍穹不出手那种 {localization:926-516}
[2] 施主 单挑烽火啊 拿建业 {localization:926-516}
[3] 亲征的那位 等你来征啊 {localization:926-516}
[4] {localization:336-820}


Cluster 3 words:轮子,轮子 女马,女马,轮子 女马 全家,女马 全家,
cluster 3 content:
[0] 二轮子死嗲又死女马，全家都是短命狗！
[1] 二轮子死嗲又死女马，全家都是短命狗！
[2] 二轮子死嗲又死女马，全家都是短命狗！
[3] 二轮子死嗲又死女马，全家都是短命狗！
[4] 二轮子死嗲又死女马，全家都是短命狗！


Cluster 4 words:1111111,作战 兄弟 加微信,兄弟 加微信,兄弟 加微信 svip12126,全家,
cluster 4 content:
[0] 1111111
[1] 1111111
[2] 1111111
[3] 1111111
[

## latent dirichlet allocation

# Data for classify

In [33]:
# Attach the k-means label of every message to df. report_pprint performs the
# same assignment; repeating it here lets this section run without the report cell.
df['cluster'] = clusters

In [38]:
# df[df['cluster'] == 8]['content']

In [53]:
# Count how often each exact message text occurs and turn the result into a
# two-column frame (message text, number of occurrences).
d = df['content'].value_counts().reset_index()

In [54]:
# Name the two columns by position: message text first, occurrence count second.
# reset_index() on a value_counts() result labels them ('index', 'content') on
# older pandas and ('content', 'count') on pandas >= 2.0, so renaming by label
# works on only one of them and also breaks when this cell is run twice.
d.columns = ['content', 'cnt']

In [56]:
# Share of all messages whose exact text occurs more than once.
d[d['cnt'] > 1]['cnt'].sum() / d['cnt'].sum()

0.5068347128166542

In [49]:
# Show every message text that occurs more than once, with its count.
# NOTE(review): with display.max_rows raised to 10**4 this can render a very
# long table; consider .head(n) when sharing the notebook.
d[d['cnt'] > 1]

Unnamed: 0,content,cnt
0,2222222,4454
1,二轮子死嗲又死女马，全家都是短命狗！,4029
2,1111111,3505
3,二轮子没嗲没女马，就是一条渣种狗！,2117
4,2222,1632
5,高价收吴国高迁号，有意的私聊，晚上九点统一回复！,1621
6,为了军团更好的发展，资源更好的利用，大家一起作战，兄弟加微信：svip12126，领礼包，群...,1516
7,上官海白 表字养的,1316
8,上官海白 彪子 养的,1210
9,出号，交易猫交易,855
