In [1]:
# -*- coding: utf-8 -*-

import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
import os
# Capture the process's working directory and chdir into it. Because the target
# is the directory the process is already in, this does not move anywhere; it
# only leaves `current_dir` bound for later use.
# NOTE(review): presumably meant to anchor the relative paths used by the cells
# below — confirm whether a different directory was intended.
current_dir = os.getcwd()
os.chdir(current_dir)

In [2]:
import pandas as pd 
# Load the query results. The path uses a forward slash so it resolves on
# Windows and POSIX alike (a backslash is a literal filename character on POSIX),
# matching the './' style of the other paths in this notebook.
df = pd.read_excel(r"./df_query_2_512B.xlsx")

# Keep only rows whose similarity is at most 0.75 and renumber them 0..n-1.
# The later cells address rows by position, so the fresh index matters.
df_0 = df[df['similarity'] <= 0.75].reset_index(drop=True)

# Export the feature text with one record per row, no index column and no
# header line; this file is the corpus read by the clustering step.
df_ff = df_0['food_feature']
df_ff.to_csv(r'./模糊特征515.txt', index=False, header=False)

In [3]:

class KmeansClustering():
    """Cluster texts using jieba segmentation, TF-IDF weighting and KMeans.

    Attributes:
        stopwords: list of stop words removed after segmentation (empty when
            no stop-word file is supplied).
        vectorizer: ``CountVectorizer`` that builds the term-count matrix.
        transformer: ``TfidfTransformer`` that re-weights counts to TF-IDF.
    """

    def __init__(self, stopwords_path=None):
        """
        Set up the stop-word list and the vectorizing pipeline.
        :param stopwords_path: optional path to a UTF-8 stop-word file, one word per line
        """
        self.stopwords = self.load_stopwords(stopwords_path)
        # NOTE(review): CountVectorizer's default token_pattern keeps only
        # tokens of two or more word characters, so single-character words
        # produced by jieba are dropped — confirm this is intended.
        self.vectorizer = CountVectorizer()
        self.transformer = TfidfTransformer()

    def load_stopwords(self, stopwords=None):
        """
        Load stop words from a text file.
        :param stopwords: path to the stop-word file (one word per line); falsy means "no stop words"
        :return: list of stripped lines, or an empty list when no path is given
        """
        if not stopwords:
            return []
        # 'utf-8-sig' reads plain UTF-8 and also drops a leading BOM, which
        # would otherwise stay glued to the first stop word and stop it from
        # ever matching a token.
        with open(stopwords, 'r', encoding='utf-8-sig') as f:
            return [line.strip() for line in f]

    def preprocess_data(self, corpus_path):
        """
        Tokenize a corpus file in which every line is one text.
        :param corpus_path: path to the UTF-8 corpus file
        :return: list with one string per line: the jieba tokens that are not
                 stop words, joined by single spaces
        """
        # Build the set once so each membership test is O(1) instead of a
        # linear scan of the stop-word list per token.
        stopword_set = set(self.stopwords)
        corpus = []
        # 'utf-8-sig' keeps a BOM, if present, out of the first token.
        with open(corpus_path, 'r', encoding='utf-8-sig') as f:
            for line in f:
                tokens = jieba.lcut(line.strip())
                corpus.append(' '.join(tok for tok in tokens if tok not in stopword_set))
        return corpus

    def get_text_tfidf_matrix(self, corpus):
        """
        Compute the TF-IDF matrix of a tokenized corpus.
        :param corpus: list of space-separated token strings, one per text
        :return: dense array of TF-IDF weights, one row per text
        """
        counts = self.vectorizer.fit_transform(corpus)
        tfidf = self.transformer.fit_transform(counts)
        # Densify the sparse TF-IDF matrix into a plain array of weights.
        return tfidf.toarray()

    def kmeans(self, corpus_path, n_clusters, random_state=None):
        """
        Cluster the texts of a corpus file with KMeans.
        :param corpus_path: corpus path (one text per line); text ids start at 0
        :param n_clusters: number of clusters to form
        :param random_state: optional seed forwarded to KMeans so that runs are
                             repeatable; None keeps the unseeded behaviour
        :return: {cluster_id1: [text_id1, text_id2, ...]} keyed by plain ints
        """
        corpus = self.preprocess_data(corpus_path)
        weights = self.get_text_tfidf_matrix(corpus)

        clf = KMeans(n_clusters=n_clusters, random_state=random_state)
        # One cluster label per text, in corpus order. After fitting,
        # clf.cluster_centers_ holds the centroids and clf.inertia_ can be
        # compared across values of n_clusters when choosing the cluster count.
        labels = clf.fit_predict(weights)
        return self._group_by_label(labels)

    @staticmethod
    def _group_by_label(labels):
        """Group positions of ``labels`` by label value: {label: [positions]}."""
        result = {}
        for text_idx, label in enumerate(labels):
            # int() turns numpy integer scalars into plain ints so the keys
            # print and serialize cleanly.
            result.setdefault(int(label), []).append(text_idx)
        return result


In [4]:
# Cluster the exported feature lines into 15 groups and print the raw
# {cluster label: [line numbers]} mapping returned by the clusterer.
Kmeans = KmeansClustering()
result = Kmeans.kmeans(corpus_path=r'./模糊特征515.txt', n_clusters=15)
print(result)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\GUOFEN~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.441 seconds.
Prefix dict has been built successfully.


{3: [0, 1, 2, 4, 5, 6, 9, 13, 14, 15, 16, 18, 20, 22, 24, 26, 28, 32, 36, 37, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58, 59, 61, 62, 63, 64, 67, 68, 69, 72, 74, 75, 76, 77, 78, 79, 82, 83, 85, 86, 87, 88, 91, 92, 93, 94, 95, 96, 97, 98, 101, 102, 103, 104, 105, 107, 108, 110, 111, 114, 116, 117, 118, 120, 121, 122, 123], 4: [3, 8, 25, 31, 112], 7: [7, 21, 35, 42, 54, 60], 6: [10, 17, 34], 8: [11, 12], 1: [19, 29, 30, 43, 65, 66], 13: [23], 14: [27, 45], 0: [33, 38, 46], 2: [39, 40, 41, 44, 84, 90, 99, 109, 113, 115, 119], 5: [49, 71, 100, 124], 11: [70, 106], 9: [73], 10: [80, 89], 12: [81]}




In [5]:
# Reload the clustered feature text as a one-column frame and start every row
# at the sentinel cluster id -1.
df_ff = pd.read_csv(r'./模糊特征515.txt', names=['food_feature']).assign(cluster_id=-1)

# Walk the {label: [row numbers]} mapping and stamp each member row with its label.
for label, member_rows in result.items():
    df_ff.loc[member_rows, 'cluster_id'] = label

In [6]:
# Put the labelled feature text side by side with the query/recommendation
# columns (rows are matched on index), fix the column order, then sort the
# rows by cluster id in ascending order.
df_0 = (
    pd.concat([df_ff, df_0[['Query_faiss', 'recommend']]], axis=1)
    [['food_feature', 'Query_faiss', 'recommend', 'cluster_id']]
    .sort_values(by='cluster_id')
)

In [7]:
# Write the clustered table to an Excel workbook in the working directory.
df_0.to_excel(excel_writer=r'./聚类515.xlsx')