# Lab3: Clustering & Classification

## Data
`Twitter_data`: This file contains 29846 data,and each of them has 8 items
- "userName":用户名
- "clusterNo":类别
- "text":Twitter内容
- "timeStr":时间戳
- "tweeId":用户Id
- "errorCode":状态码
- "textCleaned":去除链接等特殊符号只保留文本的处理
- "relevance":

In [1]:
import json
# 将数据读取成dict格式便于后续的操作
Twitter_data=[]
with open("Twitter_data")as f:
    for line in f:
        # print(line)
        Twitter_data.append(json.loads(line))

##  Tokenize
直接按空格分词

In [6]:
token_textCleaned = [] 
words = set([]) 
for item in Twitter_data: 
    tokens = item["textCleaned"].split(" ")
    token_textCleaned.append(tokens) 
    for token in tokens:
        words.add(token) 

num_words_max = len(words) 

In [9]:
print(num_words_max)

12227


## Vectoring
Bow(Bag of Words) 词袋模型。

In [11]:
import numpy as np 

bow_dict = dict()
for i, word in enumerate(words):
    bow_dict[word] = i

vec_textCleaned = np.zeros((len(token_textCleaned), num_words_max)) 
for i, sentence in enumerate(token_textCleaned):
    for word in sentence:
        j = bow_dict[word] 
        vec_textCleaned[i][j] = 1

## K-Means
聚类文本时因为数据太大，会比较慢。

In [130]:
class KMeans():
    def __init__(self, data, num_classes, max_iter=200):
        self.num_classes = num_classes 
        self.src_data = data
        self.max_iter = max_iter
        self.m_examples, self.n_features = data.shape
        
        self.label = np.zeros(self.m_examples)
        self.clusters = [[] for i in range(num_classes)]  ## idx_list of each class in src_data 
        ## center vectors
        init_cen_idx = np.random.choice(self.m_examples, num_classes, replace=False)  ## init randomly first
        self.centroid = self.src_data[init_cen_idx]
    
    def run(self, threshold=1e-2):
        for _ in range(self.max_iter):
            print("cluster")
            self.clusters = [[] for i in range(self.num_classes)]
            self._cluster(self.centroid) 
            print("centroid")
            newCentroid = self._genCentroid(self.clusters)
            if self._edis(self.centroid, newCentroid) < threshold: 
                print("bbbbbreak")
                break 
            self.centroid = newCentroid 
        return self.label 
        
    def _cluster(self, centroid):
        for idx, sample in enumerate(self.src_data):
            lbl, dis = -1, float("inf")
            for cls in range(self.num_classes):
                tmp = np.sum((sample - centroid[cls])**2)
                if tmp < dis:
                    lbl = cls
                    dis = tmp
            self.label[idx] = lbl  # record the class for this sample
            self.clusters[lbl].append(idx)  # add this sample to the class 
            
    def _genCentroid(self, clusters):
        newCentroid = np.zeros((self.num_classes, self.n_features)) 
        for i, cluster in enumerate(clusters):
            cluster_mean = np.mean(self.src_data[cluster], axis=0)
            newCentroid[i] = cluster_mean
        return newCentroid 
    
    def _edis(self, cen1, cen2):
        return np.sum(np.sqrt(np.sum((cen1-cen2)**2, axis=1)))

In [132]:
# km = KMeans(np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0], [1, 0, 1], [0, 0, 1], [1, 1, 1]]), 3, 10) 
# lbl = km.run() 
# print(lbl)
km = KMeans(vec_textCleaned, 200, 3) 
lbl = km.run()

cluster
centroid


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


cluster
centroid
cluster
centroid


In [134]:
print(km.centroid)
print(lbl)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.01801802 0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
[ 20.  20.  20. ...  20. 113. 166.]


## Test
思想：用kmeans聚类后的结果，与原本的分类情况比较。统计聚类之后，每一个类别中，原分类的情况。
1. 统计原分类，最多出现的分类的个数，作为这个聚类的类别
2. 计算比例：聚类类别中，属于这个类别的项 / 整个聚类类别的所有元素个数。将此作为正确比例
3. 求每一组聚类类别正确比例的平均值

In [140]:
print(Twitter_data[23522]["clusterNo"])
print(Twitter_data[23509]["clusterNo"])

344
344


In [148]:
def evaluation(data_dict, clusters):
    evl = np.zeros(len(clusters)) 
    for idx, clstr in enumerate(clusters):
        clsOrg = [Twitter_data[i]["clusterNo"] for i in clstr] 
        if len(clsOrg) > 0: 
            mainCls = max(clsOrg, key=clsOrg.count)  # 统计出现最多次的元素
            evl[idx] = clsOrg.count(mainCls) / len(clsOrg)
        else:
            evl[idx] = 0
    return evl 

In [150]:
evl = evaluation(Twitter_data, km.clusters)
print(np.mean(evl))

0.9471817805273113
