In [34]:
import numpy as np
import pandas as pd
from tqdm import trange
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score, silhouette_samples
import jieba.analyse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
# import matplotlib.cm as cm
# from sklearn.decomposition import PCA

In [35]:
# Data cleaning: load the raw questions, tokenize them with jieba, and drop stopwords.
# Both files are read through context managers so the handles are closed deterministically.
with open('stopwords/hit_stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    cn_stopwords = [line.strip('\n') for line in stopword_file]
# A set gives O(1) membership tests for the per-token filter below;
# `cn_stopwords` itself stays a list in case other cells rely on it.
stopword_set = set(cn_stopwords)

with open('data/muchong_questions.txt', 'r', encoding='utf-8') as question_file:
    # Each document becomes one space-separated string of the tokens that survive filtering.
    data = [
        ' '.join(word for word in jieba.lcut(line.strip('\n')) if word not in stopword_set)
        for line in question_file
    ]
# TODO:user_dict
# data[0:5]  # peek at the cleaned data

In [36]:
'''TF-IDF based vector representation of the texts'''
# `data` holds str documents, so the vectorizer's `encoding` option (only used to decode
# bytes input) is left at its default instead of the misleading 'latin-1'.
# NOTE(review): the default token_pattern keeps only tokens of 2+ characters, so
# single-character Chinese words are dropped here — confirm this is intended.
vectorizer = TfidfVectorizer(min_df=3, max_features=20000)
X_tfidf = vectorizer.fit_transform(data)
# Dimensionality reduction (LSA): truncated SVD followed by L2 normalization.
# The SVD solver is randomized, so it is seeded to make the embedding (and every
# downstream clustering result) reproducible across kernel restarts.
svd = TruncatedSVD(2, random_state=10)  # target dimensionality after reduction
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X_tfidf)
# Inspect the shape of the reduced data
X.shape

(12861, 2)

In [37]:
# Sweep candidate cluster counts and record the mean silhouette coefficient for each
# one; the larger the coefficient, the better the clustering.
class_silscore = []
for n_clusters in trange(2, 10):  # acceptable range of cluster counts: 2 through 9
    # Seeded so the scores are reproducible and the k selected from this table
    # corresponds to a clustering that can be re-created exactly.
    clusters = KMeans(n_clusters=n_clusters, init='k-means++', random_state=10).fit(X)
    cluster_labels = clusters.labels_
    silhouette_avg = silhouette_score(X, cluster_labels)
    class_silscore.append([n_clusters, silhouette_avg])
df_class_silscore = pd.DataFrame(class_silscore, columns=['num_clusters', 'avg_silhouette_score'])
df_class_silscore

100%|██████████| 8/8 [00:18<00:00,  2.34s/it]


Unnamed: 0,num_clusters,avg_silhouette_score
0,2,0.676559
1,3,0.645244
2,4,0.675485
3,5,0.639797
4,6,0.609622
5,7,0.582649
6,8,0.554621
7,9,0.569523


In [38]:
'''Cluster with the best number of clusters'''
# idxmax() returns an index *label*, so the row is looked up with .loc; positional
# .iloc would only agree with it while the frame keeps its default RangeIndex.
max_silscore_idx = df_class_silscore['avg_silhouette_score'].idxmax()
# Cast to a plain int so downstream range()/KMeans calls receive a builtin integer.
best_k = int(df_class_silscore.loc[max_silscore_idx, 'num_clusters'])
num_cluster = best_k  # cluster count with the highest mean silhouette score from the previous step
best_clusters = KMeans(n_clusters=num_cluster, random_state=10).fit(X)  # final clustering
labels = best_clusters.labels_.tolist()  # cluster label assigned to each original text
# One row per document: its cluster id next to its cleaned text.
df_clusters = pd.DataFrame({'cluster_id': labels, 'text': data})
df_clusters.to_excel('TF-IDF-clustered_data.xlsx')  # export every text with its assigned cluster

In [39]:
'''Collect the original texts belonging to each cluster'''
cluster_text = {}
for i in range(num_cluster):
    member_indices = np.where(best_clusters.labels_ == i)[0].tolist()
    # Each data[j] is a single string. Join whole documents with a space so the last
    # token of one document is not fused with the first token of the next (the
    # previous list `+=` split every string into characters and re-joined them with
    # no separator at all).
    # TODO: refine the keyword-extraction pipeline, e.g. stopword removal and synonym normalization.
    cluster_text[i] = ' '.join(data[j] for j in member_indices)

In [40]:
'''Write the TF-IDF based keywords of each cluster to a text file'''
# Disabled alternative, kept for reference:
# vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
# transformer = TfidfTransformer() # norm=None, smooth_idf=False
# x = transformer.fit_transform(vectorizer.fit_transform(cluster_text[i] for i in range(num_cluster)))
# print(x)
for cluster_id in range(num_cluster):
    # Top 100 terms for this cluster's concatenated text, with no part-of-speech filter.
    keywords = jieba.analyse.extract_tags(cluster_text[cluster_id], topK=100, allowPOS=())
    out_path = 'keywords_cluster_{}.txt'.format(cluster_id)
    with open(out_path, 'w', encoding='utf-8') as out_file:
        # One keyword per line, each terminated by a newline.
        out_file.writelines(term + '\n' for term in keywords)