### 融合TF-IDF特征与Word2Vec特征进行文本表征


#### 读取文本数据

In [45]:
# Switch the working directory, presumably so the relative data path below resolves.
# NOTE(review): sys.path[1] depends on how the interpreter/kernel was launched;
# presumably it is the project root in the author's setup — confirm, or replace
# it with an explicit path.
import os
import sys
os.chdir(sys.path[1])

import pandas as pd
# CSV file to load; only its "text" column is used in this cell.
data_file = "data/train_data.csv"
data = pd.read_csv(data_file)
# Convert the "text" column to a plain Python list.
texts = data["text"].tolist()
# Print the number of texts and the first three entries.
print(f"texts len: {len(texts)}")
print(f"texts samples: {texts[:3]}")

texts len: 8718
texts samples: ['就是我干撒', '我想了一下可以的', '为什么要我还不还']


#### 构建词表，并使用TF-IDF进行文本表征

In [46]:
# Build a bag-of-words vocabulary with jieba word segmentation, then re-weight
# the raw counts with TF-IDF.
from jieba import lcut
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
cv = CountVectorizer(
    tokenizer=lcut,
    analyzer="word",
    min_df=2,
    max_df=0.5,
    # scikit-learn ignores token_pattern whenever a custom tokenizer is given
    # and warns about the unused parameter; None states that explicitly and
    # silences the warning without changing the resulting vocabulary.
    # NOTE(review): this also means no regex filters the tokens jieba emits
    # (e.g. punctuation) — confirm that is intended.
    token_pattern=None,
)
tt = TfidfTransformer()
count_vector_lst = cv.fit_transform(texts)
tfidf_vector_lst = tt.fit_transform(count_vector_lst)
print(f"vocabulary size: {len(cv.vocabulary_)}")
print(f"count_vector shape: {count_vector_lst.shape}")
print(f"tfidf_vector shape: {tfidf_vector_lst.shape}")
print("samples:")
# Show the first three documents: their tokens, raw counts and TF-IDF weights.
for i in range(3):
    # Slice (rather than index) so each row stays a 2-D sparse matrix.
    count_row = count_vector_lst[i:i+1]
    tfidf_row = tfidf_vector_lst[i:i+1]
    print(cv.inverse_transform(count_row))
    print(f"count_vector[{i}]:\n{count_row}")
    print(f"tfidf_vector[{i}]:\n{tfidf_row}")
# Dense copy of the TF-IDF matrix, kept under its own name for later cells.
tfidf_vector_mat = tfidf_vector_lst.toarray()
print(f"tfidf_vector_mat shape: {tfidf_vector_mat.shape}")

vocabulary size: 1376
count_vector shape: (8718, 1376)
tfidf_vector shape: (8718, 1376)
samples:
[array(['就是', '干', '我', '撒'], dtype='<U6')]
count_vector[0]:
  (0, 615)	1
  (0, 639)	1
  (0, 743)	1
  (0, 834)	1
tfidf_vector[0]:
  (0, 834)	0.6709030285312676
  (0, 743)	0.17848773142722263
  (0, 639)	0.5927760787201011
  (0, 615)	0.4082251542160986
[array(['一下', '了', '可以', '想', '我', '的'], dtype='<U6')]
count_vector[1]:
  (0, 43)	1
  (0, 189)	1
  (0, 413)	1
  (0, 721)	1
  (0, 743)	1
  (0, 1041)	1
tfidf_vector[1]:
  (0, 1041)	0.291936102756941
  (0, 743)	0.24022903956964178
  (0, 721)	0.5805006212919459
  (0, 413)	0.4209627626052835
  (0, 189)	0.2944391785381247
  (0, 43)	0.5061405652343262
[array(['不', '为什么', '我', '要', '还'], dtype='<U6')]
count_vector[2]:
  (0, 107)	1
  (0, 173)	1
  (0, 743)	1
  (0, 1155)	1
  (0, 1253)	2
tfidf_vector[2]:
  (0, 1253)	0.7515948508668725
  (0, 1155)	0.41331267980299924
  (0, 743)	0.17956048874066058
  (0, 173)	0.4075070205777472
  (0, 107)	0.2568537870609066


#### 使用词向量进行文本表征

In [47]:
# Load the pre-trained word vectors (binary word2vec format).
from gensim.models.keyedvectors import load_word2vec_format
wv_file = "data/light_Tencent_AILab_ChineseEmbedding.bin"
wv = load_word2vec_format(wv_file, binary=True)

import numpy as np
# Represent each text as the mean of the vectors of its in-vocabulary tokens.
# lcut already returns a list, so no extra list() wrapper is needed.
tokens_lst = [lcut(text) for text in texts]
wv_vector_lst = []
for tokens in tokens_lst:
    known_vectors = [wv.get_vector(token) for token in tokens if token in wv]
    if known_vectors:
        wv_vector = np.asarray(known_vectors).mean(axis=0)
    else:
        # No token is in the embedding vocabulary: fall back to a zero vector.
        # Take the width from the loaded model instead of hard-coding 200 so
        # that swapping the embedding file cannot produce ragged rows.
        wv_vector = np.zeros(wv.vector_size)
    wv_vector_lst.append(wv_vector)
wv_vector_mat = np.asarray(wv_vector_lst)
print(f"wv_vector_mat shape: {wv_vector_mat.shape}")

wv_vector_mat shape: (8718, 200)


#### KMeans聚类

In [48]:
# Concatenate the TF-IDF and word-embedding features column-wise.
vector_mat = np.concatenate([tfidf_vector_mat, wv_vector_mat], axis=1)
print(f"vector_mat shape: {vector_mat.shape}")

from sklearn.cluster import MiniBatchKMeans
# Single source of truth for the cluster count; it is needed both to fit the
# model and to draw the cluster ids previewed below.
N_CLUSTERS = 200
# Number of clusters to print.
N_SHOWN = 10
# NOTE(review): no random_state is set, so cluster ids differ between runs —
# pass one if reproducible output is wanted.
km = MiniBatchKMeans(n_clusters=N_CLUSTERS)
result = km.fit_predict(vector_mat)
print(f"result: {result}")
# (label, text) pairs ordered by label; the sort is stable, so texts keep
# their original relative order within each cluster.
lst = sorted(zip(result, texts), key=lambda pair: pair[0])

# Group the sorted pairs once instead of rescanning the full list for every
# previewed cluster. Keys are cast to int so they match the ids drawn below.
from itertools import groupby
clusters = {
    int(label): list(members)
    for label, members in groupby(lst, key=lambda pair: pair[0])
}

# Show a random selection of clusters.
import random
print("result sample:")
for idx in random.sample(range(N_CLUSTERS), min(N_SHOWN, N_CLUSTERS)):
    print(f"{idx}th cluster show: ")
    # A cluster with no assigned text simply prints nothing.
    for sample in clusters.get(idx, []):
        print(sample)
    print("="*50)

vector_mat shape: (8718, 1576)
result: [  9 137 136 ...  25 197 151]
result sample:
66th cluster show: 
(66, '费是如何算计算的')
(66, '保险费费用')
(66, '咋算为什么费用')
(66, '为何费用怎么说')
(66, '可怕费用高不高')
(66, '费用有多高')
(66, '费用怎么说')
(66, '费用太多了')
(66, '费用有889527多少')
(66, '费用怎么算')
(66, '费是怎么计算的')
(66, '你们计算费是怎么的的')
(66, '咋算的费用')
(66, '如何费用算')
(66, '费是如何算的')
(66, '费用多！少')
(66, '费用是怎么计算的')
(66, '手续费费用咋算')
(66, '的是怎么计算费用')
(66, '费用左右多吗')
(66, '费用违约金太多了')
(66, '多少费用')
(66, '费用怎么收')
(66, '费用手续费怎么收')
(66, '费用是不是很高')
(66, '怎么计算费用')
(66, '费用太高了')
(66, '？的费用有？多少')
(66, '怎么费用弄')
(66, '费是多少')
(66, '费用是。咋算的')
(66, '费用')
(66, '费用是多少')
(66, '费用是怎么组成的')
(66, '如何算的费用')
(66, '费是的计算怎么')
(66, '之后承担的费用是多少')
(66, '之后产生的费用是多少')
(66, '费用多不多')
(66, '费用怎么弄')
(66, '费用有多少')
(66, '费用是咋算七费八费的')
(66, '费用保险费怎么算')
(66, '的费用有多少')
(66, '的咋算费用')
(66, '费用高吗')
(66, '费用多吗')
(66, '费用如何计算')
(66, '的费用有会员费多少')
(66, '：怎么算费用')
(66, '费用怎么收函')
(66, '吗高费用')
(66, '费用怎；么收')
(66, '费用是！多少')
(66, '如何算可否的费用')
(66, '费用咋算')
(66, '！费减免一点')
(66, '怎么算费用')
(66, '费用。