In [26]:
import pandas as pd
import numpy as np

import jieba
import jieba.posseg as pseg
import jieba.analyse

import sys
from optparse import OptionParser
import pprint
from prettytable import PrettyTable

In [27]:
# Load the conversation log. The path is handed to pandas directly so that pandas
# opens and closes the file itself (no dangling file handle), and the encoding is
# stated explicitly instead of depending on the machine's locale.
df = pd.read_csv('../data/df_small.csv', encoding='utf-8')
# Keep only messages that contain neither "https" nor "Britain". na=True treats
# missing text as a match, so rows without usable text are dropped as well.
df = df[~df['text'].str.contains("https|Britain", na=True)]
print('有{}筆對話'.format(len(df)))

有91340筆對話


## Group messages by user (groupby)

In [28]:
# Collect every message sent by the same user ('from') into one list per user.
messages_by_user = df.groupby('from')['text']
grouped_data = messages_by_user.apply(list).reset_index()

# Number of distinct users; later cells use `l` to size the per-user arrays.
l = grouped_data.shape[0]
print('有{}個使用者'.format(l))

有1019個使用者


## 先用前1000筆資料做測試

In [4]:
# Work on the first 1000 users only while experimenting.
test = grouped_data.iloc[:1000]
test_len = test.shape[0]
test.head()

Unnamed: 0,from,text
0,1000970273334199,找不到阿/安安/你有打LOL嗎/好吧，可以帶我嗎/安安/凸/Sor/嗯/約哪/有啊/你先約的...
1,1001070263323689,!?/唐寶寶/你那麼愛慕他ㄇ/嗨/先問問您期望本人的性別是?/了解/我是男的/抱歉沒有/請問...
2,1001967239901904,早/在床上發懶/爆/斷/魂/你在暗諷綠藻嘛/壞/話說我好餓ㄛ/窩不要這個qq/ㄋ早餐吃什麼/...
3,1003172849781848,甲/你要做我的0？/人生勝利組是不會來玩這個的/你一定不是c銘/也不是業王/葉你妹/被盜/其...
4,1003929993039131,吃噓/肥宅早/我們不是場外人/場外ㄈㄓ/我沒用了/安安/是喔/場外ㄈㄓ嗎/豪/斷/欸/在打手...


In [29]:
# Show the first user's id and message list. Columns are addressed by name:
# integer keys on a label-indexed row rely on a positional fallback that pandas
# has deprecated.
first_user = grouped_data.iloc[0]
print('id為 {} 的使用者傳送的訊息{}'.format(first_user['from'], first_user['text']))

id為 1000970273334199 的使用者傳送的訊息['找不到阿', '安安', '你有打LOL嗎', '好吧，可以帶我嗎', '安安', '凸', 'Sor', '嗯', '約哪', '有啊', '你先約的啊', '喔', '輸入0088', '我懶的打了', '皮米屌', '這你嗎？', '魏癢癢', '哪裡人?', '住哪裡啦？', '超癢', '哪裡人?', '好遠', '其實還好', '新竹', '龍潭?', 'ㄏㄅㄨㄐㄅㄌㄗㄋ', '三小', '幹又ㄋ', '掰掰', '凹?', '凸sor']


In [30]:
# Build the stop-word set: one entry per line of the file, with newline
# characters stripped from both ends of each line.
with open('jieba_txt/stop_words.txt', 'r', encoding='utf-8') as sw:
    stopwordset = {line.strip('\n') for line in sw}
# stopwordset

In [31]:
# Also treat these single characters (lowercase letters, digits, punctuation,
# whitespace) as stop words.
chars = set(' abcdefghijklmnopqrstuvwxyz0123456789()?!$,:.。，-_&=/􀆿\r\n')
stopwordset.update(chars)

In [9]:
# stopwordset

## gensim model

In [32]:
from gensim import models
import gensim

In [33]:
# Train a word2vec model on the corpus file. size=1 gives every word a
# one-component vector, i.e. a single number per word.
sentences = models.word2vec.LineSentence("corpusSegDone.txt")
model = models.word2vec.Word2Vec(
    sentences,
    min_count=5,
    window=5,
    size=1,
    workers=1,
)
# model.wv.vocab



## Pick each user's top 10 words (frequency-based)

In [34]:
from collections import defaultdict

# One row per user, ten columns (one per top-ranked word); starts zero-filled.
top10_vector = np.zeros((l, 10))

# top10_vector

In [35]:
import operator
from collections import Counter

# Configure jieba's main dictionary and the user dictionary before segmenting.
jieba.set_dictionary('jieba_txt/dict.txt.big.txt')
jieba.load_userdict("jieba_txt/userdict.txt")


def _most_frequent_words(messages, stopwords, k):
    """Return up to ``k`` ``(word, count)`` pairs for one user's messages.

    Every message is segmented with jieba, tokens found in ``stopwords`` are
    discarded, and the remaining tokens are counted. Pairs come back ordered by
    descending count; words with equal counts keep first-seen order.
    """
    counts = Counter()
    for message in messages:
        counts.update(word for word in jieba.cut(message) if word not in stopwords)
    return counts.most_common(k)


# Number of words kept per user, taken from the width of the preallocated matrix.
top_k = top10_vector.shape[1]

# Iterate the 'text' column directly instead of rebuilding a row object for
# every single message.
for i, messages in enumerate(grouped_data['text']):
    row = np.zeros(top_k)
    for rank, (word, _count) in enumerate(_most_frequent_words(messages, stopwordset, top_k)):
        # Words missing from the word2vec vocabulary keep the default 0.
        if word in model.wv:
            # The model is one-dimensional, so the vector's only component is
            # the word's value; index it rather than calling float() on an array.
            row[rank] = model.wv[word][0]
    # Users with fewer than top_k distinct words keep trailing zeros.
    top10_vector[i] = row

Building prefix dict from /home/tp6han/gp_new/jieba_txt/dict.txt.big.txt ...
DEBUG:jieba:Building prefix dict from /home/tp6han/gp_new/jieba_txt/dict.txt.big.txt ...
Loading model from cache /tmp/jieba.u0ddb98113e9461fac3a7658d9e24e45e.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.u0ddb98113e9461fac3a7658d9e24e45e.cache
Loading model cost 1.488 seconds.
DEBUG:jieba:Loading model cost 1.488 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


In [36]:
# Sanity check: expect (number of users, 10).
top10_vector.shape

(1019, 10)

In [37]:
# Inspect the per-user feature matrix; a 0 marks a slot that was never filled.
top10_vector

array([[ 2.72570157,  2.68838716, -0.30927148, ...,  2.57240701,
         2.12417483,  0.84510767],
       [ 2.64373946,  2.41933203,  0.        , ...,  2.69715691,
         2.68919754,  2.6456933 ],
       [ 2.77911639,  2.63833618,  2.66791582, ...,  2.56085587,
         1.50895774,  2.64373946],
       ..., 
       [ 2.36991978,  0.        ,  0.        , ...,  2.43657541,
         0.        ,  0.        ],
       [ 3.4267695 ,  1.92322135,  1.2367388 , ...,  3.02721882,
         2.02429104,  2.54376841],
       [ 1.14984071,  2.46848321,  2.66791582, ...,  0.        ,
         2.38839889,  1.64640784]])

## 分群

In [42]:
from sklearn import cluster, datasets

# K-means clustering of the per-user vectors into 3 groups. n_init is pinned to
# 10 so the result does not depend on the scikit-learn version's default, and
# random_state fixes the centroid initialisation.
kmeans_fit = cluster.KMeans(n_clusters=3, n_init=10, random_state=222).fit(top10_vector)

# Print the cluster label assigned to each of the first 10 users.
cluster_labels = kmeans_fit.labels_
print("分群結果 前10：")
print(cluster_labels[:10])
print("---")




分群結果 前10：
[1 1 1 1 1 1 1 2 1 2]
---


## 分群結果數量

In [44]:
# Count how many users were assigned to each cluster label.
unique, counts = np.unique(cluster_labels, return_counts=True)
{label: size for label, size in zip(unique, counts)}


{0: 112, 1: 741, 2: 166}

In [45]:
# Attach each user's cluster label as a new 'category' column (modifies
# grouped_data in place; labels are matched to rows by position).
grouped_data['category'] = cluster_labels

In [46]:
# Preview the first 20 users together with their assigned cluster.
grouped_data.head(20)

Unnamed: 0,from,text,category
0,1000970273334199,"[找不到阿, 安安, 你有打LOL嗎, 好吧，可以帶我嗎, 安安, 凸, Sor, 嗯, 約...",1
1,1001070263323689,"[!?, 唐寶寶, 你那麼愛慕他ㄇ, 嗨, 先問問您期望本人的性別是?, 了解, 我是男的,...",1
2,1001967239901904,"[早, 在床上發懶, 爆, 斷, 魂, 你在暗諷綠藻嘛, 壞, 話說我好餓ㄛ, 窩不要這個q...",1
3,1003172849781848,"[甲, 你要做我的0？, 人生勝利組是不會來玩這個的, 你一定不是c銘, 也不是業王, 葉你...",1
4,1003929993039131,"[吃噓, 肥宅早, 我們不是場外人, 場外ㄈㄓ, 我沒用了, 安安, 是喔, 場外ㄈㄓ嗎, ...",1
5,1006942526092443,"[安, 是, 靠北哈哈, 是要說啥, 呃, 對呀, 好, 那我們來玩個遊戲, 1188-11...",1
6,1007352782715651,"[嗨, ......., 看來是沒有, 我是有差點想打, 哈哈, 真的離開喔, 想說多打一句...",1
7,1018989321532115,"[Hi, O.O, Ooo, 黃色那隻, 已知啊, Hi, 還沒睡啊, 不睡啊]",2
8,1020089304775462,"[嗨, 嗨, 斷, 嗨, 哪裡人, 挖, 國外耶, 天龍國人啊, 感覺不同世界, 嗨, 安安...",1
9,1020229148084642,[安安],2


In [20]:
def _join_texts(user_texts):
    """Concatenate the 'text' entries of one cluster into a single string.

    Each entry may be a plain string or a list of messages. A list is first
    joined with '/' (str.join cannot join lists directly and would raise
    TypeError); the per-user strings are then separated by a blank line.
    """
    return '\n\n'.join(
        entry if isinstance(entry, str) else '/'.join(map(str, entry))
        for entry in user_texts
    )


# One row per cluster holding all text written by that cluster's users.
aaa = grouped_data.groupby('category')['text'].apply(_join_texts).reset_index()