In [None]:
from sklearn.datasets import fetch_20newsgroups

import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the data: restrict the 20-newsgroups corpus to three newsgroups.
categories = ['talk.religion.misc', 'comp.graphics', 'sci.space']

# `subset='all'` requests both the train and test splits; `shuffle=False`
# leaves the documents in the order the loader returns them; `remove`
# strips headers, footers and quoted replies from every post.
# NOTE(review): `df` is whatever fetch_20newsgroups returns (accessed below
# via .data / .target / .target_names), not a pandas DataFrame.
df = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=False,
    remove=('headers', 'footers', 'quotes'),
)

In [None]:
# Ground-truth newsgroup id of every document.
labels = df.target

# Number of distinct ground-truth classes (3 for the categories chosen above).
true_k = np.unique(labels).size
print(true_k)

In [None]:
# Show the raw text of the second document as a sanity check on the load.
second_document = df.data[1]
print(second_document)

In [None]:
# Feature extraction, step 1: bag-of-words term counts.
# `count_vect` learns the vocabulary from the corpus and is kept so that new
# documents can later be mapped onto the same columns with `.transform`.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df.data)

# Feature extraction, step 2: re-weight the raw counts into TF-IDF features.
# `tfidf_transformer` is kept for the same reason: new documents must be
# weighted with the IDF values fitted here.
from sklearn.feature_extraction.text import TfidfTransformer # type: ignore
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [None]:
#print(X_vec)

In [None]:
from sklearn.linear_model import LogisticRegression  # logistic regression

# Supervised baseline: fit a classifier on the TF-IDF features with the
# ground-truth newsgroup ids as targets.
clf = LogisticRegression()
clf.fit(X_train_tfidf, df.target)

# Push two unseen sentences through the already-fitted count -> TF-IDF chain
# (transform only, no refit) so they share the training vocabulary.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# The classifier predicts class ids, so they index target_names directly.
for position, doc in enumerate(docs_new):
    category = predicted[position]
    print('%r => %s' % (doc, df.target_names[category]))

In [None]:
# Cluster the TF-IDF vectors with k-means.
from time import time
from sklearn.cluster import KMeans

# k-means starts from randomly seeded centroids, so without a fixed
# `random_state` every run yields a different partition (and different
# scores downstream). `n_init` is set explicitly because its default
# differs between scikit-learn releases.
km = KMeans(
    n_clusters=true_k,
    init='k-means++',
    max_iter=100,
    n_init=10,
    random_state=42,
)
t0 = time()
km.fit(X_train_tfidf)
print("done in %0.3fs" % (time() - t0))


In [None]:
# Score the k-means partition. The first four metrics compare the cluster
# assignment with the ground-truth labels; the silhouette coefficient is
# computed from the feature matrix and the cluster assignment only.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
# `sample_size` makes silhouette_score draw a random subset of the samples;
# pin `random_state` so the reported value is the same on every run.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X_train_tfidf, km.labels_,
                                 sample_size=1000, random_state=42))

Homogeneity (同质性): 表示每个簇中的样本尽可能只来自同一个 ground truth 类别。分数值越高，说明簇内样本的同质性越好。
Completeness (完备性): 表示每个 ground truth 标签的样本尽可能都分配到同一个簇。分数值越高，说明 ground truth 标签的样本没有被划分到多个簇中。
V-measure (V 度量): 是 Homogeneity 和 Completeness 的调和平均值，综合衡量了这两个指标。
Adjusted Rand-Index (调整兰德指数): 对随机分簇时的期望一致性做了校正，用来评估聚类结果和 ground truth 标签之间的相似性。随机划分的得分接近 0，分数值越高（最大为 1），说明聚类结果越接近 ground truth。
Silhouette Coefficient (轮廓系数): 不依赖 ground truth 标签，衡量样本与所属簇的相似度相对于与最近的其他簇的相似度。取值范围为 [-1, 1]，分数值越高，说明簇内越紧凑、簇与簇之间分离得越好。


In [None]:
# Assign two unseen sentences to k-means clusters, reusing the fitted
# count -> TF-IDF chain so they share the training vocabulary.
docs_new = ['i love moon', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = km.predict(X_new_tfidf)

# k-means numbers its clusters arbitrarily, so a cluster id is NOT an index
# into df.target_names. Name each cluster after the ground-truth category
# that is most frequent among the training documents assigned to it.
n_categories = len(df.target_names)
cluster_names = {}
for cluster in range(km.n_clusters):
    member_labels = df.target[km.labels_ == cluster]
    if member_labels.size == 0:
        # No training document landed in this cluster; nothing to vote on.
        cluster_names[cluster] = 'no training documents'
        continue
    majority = np.bincount(member_labels, minlength=n_categories).argmax()
    cluster_names[cluster] = df.target_names[majority]

for doc, cluster in zip(docs_new, predicted):
    cluster = int(cluster)
    print('%r => cluster %d (mostly %s)' % (doc, cluster, cluster_names[cluster]))

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [49]:
# Inspect the most important terms of each cluster.
# Row i of `centroids` holds the feature (column) indices of cluster i's
# centre, ordered from the largest centroid weight to the smallest.
centroids = km.cluster_centers_.argsort()[:, ::-1]

# The centroid columns are the columns of X_train_tfidf, whose vocabulary was
# learned by `count_vect`. Read the terms from that fitted vectorizer instead
# of fitting a second vectorizer on the whole corpus: it avoids a redundant
# tokenization pass and guarantees the indices and the terms always match.
terms = count_vect.get_feature_names_out()

# NOTE(review): the vectorizer does no stop-word filtering, so very common
# words can dominate these lists.
for i in range(true_k):
    top_terms = ''.join(' %s' % terms[ind] for ind in centroids[i, :10])
    print("Cluster %d:%s" % (i, top_terms))

Cluster 0: the to of and is that in you it for
Cluster 1: the to for thanks and any of in me or
Cluster 2: the to of it and is you that in for
