# 数据处理
使用 BERT 模型为短文本数据生成 embedding（向量表示）。

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the raw title data; later cells read its `text` column
title_data = pd.read_csv("../data/title.csv") 

## 初步去重
- 简单dropna/drop_duplicates
- 保留长度小于512的数据
- 微博数据清洗：去除@xxx等等
- 正则表达式去除非汉字
- 最后只保留非空的处理数据

In [None]:
print("original data shape:",title_data.shape)

# Initial de-duplication: drop every row that has a missing value, then keep the
# first occurrence of each distinct `text`.
# (A bare `title_data.dropna(...)` statement used to sit here; dropna returns a
# new frame instead of modifying in place, so that call had no effect and was
# removed.)
unique_title_data = title_data.dropna(axis=0, how='any').drop_duplicates(subset='text')
print("drop_duplicates data shape:",unique_title_data.shape)
#unique_title_data["text"].str.len().hist(bins=200)

# Filter out very long entries: keep only rows whose text has fewer than 512 characters
text_length = unique_title_data['text'].str.len()
short_unique_title_data = unique_title_data[text_length < 512]
print("short drop_duplicates data shape:",short_unique_title_data.shape)
short_unique_title_data["text"].str.len().hist(bins=512)

# for idx in short_unique_title_data["text"].str.len().sort_values().index.tolist()[-100:]:
#     print(idx,short_unique_title_data["text"][idx])

In [None]:
from multiprocessing import Pool
from pandarallel import pandarallel
import os, time, random
from weibo_preprocess_toolkit import WeiboPreprocess
from joblib import Parallel, delayed

def text_preprocess(data):
    """Return `data` with every space character (' ') removed.

    The previous version called `data.replace(' ', '')` but discarded the
    result and returned None, so it never had any effect.

    Args:
        data: the text to strip spaces from (a `str`).

    Returns:
        A new string equal to `data` without any ' ' characters.
    """
    return data.replace(' ', '')
    
# Weibo text pre-processing
def data_preprocess(data):
    """Clean the `text` column of `data` with `WeiboPreprocess.clean`.

    A fresh `WeiboPreprocess` instance is created and its `clean` method is
    mapped over `data['text']` through `parallel_map`; the elapsed wall-clock
    time of that map is printed.

    Args:
        data: object whose `['text']` entry exposes `parallel_map`
            (presumably a DataFrame after `pandarallel.initialize()` has been
            called — confirm against the caller).

    Returns:
        Whatever `parallel_map` returns for the cleaned texts.
    """
    cleaner = WeiboPreprocess()
    started_at = time.time()
    cleaned = data['text'].parallel_map(cleaner.clean)
    elapsed = time.time() - started_at
    print('Task runs %0.2f seconds.' %(elapsed))
    return cleaned

if __name__ == '__main__':
    # pandarallel has to be initialised before data_preprocess is called
    pandarallel.initialize()
    # Work on a copy so the filtered frame from the earlier cell stays untouched
    psutd = short_unique_title_data.copy()
    cleaned_text = data_preprocess(psutd)
    psutd['clean'] = cleaned_text
    
#     psutd['clean'] = psutd['clean'].parallel_map(replace(' ',''))
    

In [None]:
# 正则表达式只保留汉字
%%time
import re

# \s
psutd['clean'] = [re.sub("[^\u4e00-\u9fa5]",'',ctext) for ctext in psutd['clean'].tolist()]
psutd = psutd[psutd['clean'].str.len()>1]
psutd = psutd.drop_duplicates(subset='clean')
print("clean data shape:",psutd.shape)

下面是simhash文本去重环节
> 由于用 Python 计算这一部分比较慢，该环节没有继续进行下去。

In [None]:
# 多进程结巴分词
%%time
import jieba
jieba.enable_parallel(8)
seg_list = [jieba.lcut(text) for text in psutd['clean']]

In [None]:
# 计算simhash值
%%time
from simhash import Simhash as SH
SH(seg_list[0]).value
simhash_list = [SH(seg) for seg in seg_list]

simhash 两两距离矩阵用 Python 计算过于缓慢，之后可能考虑改为调用 C++/CUDA 实现。

In [None]:
# 过于缓慢

# %%time
# uset={}
# sim_list_len = len(simhash_list)
# flag_list = [range(sim_list_len)]
# pair_list = []
# for idx in range(sim_list_len):
#     for pair in range(idx,sim_list_len):
#         if (simhash_list[idx].distance(simhash_list[pair])<5):
#             pair_list.append((idx,pair))

## 数据分析
- 数值特征分析
- bert生成embedding
- 并查集分析&相似矩阵分析

In [None]:
# Character-length distribution of the cleaned texts, plus mean / median and a sample row
clean_length = psutd['clean'].str.len()
clean_length.hist(bins=512)
print(clean_length.mean())
print(clean_length.median())
print(psutd.iloc[0])

# for idx in psutd["clean"].str.len().sort_values().index.tolist()[-10:]:
#     print(idx,psutd["clean"][idx])

### 载入bert-as-service
这里选择的是 Google BERT-Base 模型；bert-as-service 服务端需要先在命令行启动，然后再运行下面的客户端代码。

In [None]:
# Report the TensorFlow version in use, then create the bert-as-service client.
import tensorflow as tf
print("TF version is",tf.__version__)
from bert_serving.client import BertClient
# NOTE(review): presumably this needs the bert-serving server to be running
# already (the notebook says it is started from the command line) — confirm.
bc = BertClient()
# print(bc.encode(['First do it', '今天天气不错', 'then do it better']))

测试bert模型

In [None]:
# BERT smoke test: encode a few sample sentences and inspect their pairwise distances
from sklearn.metrics.pairwise import pairwise_distances as PD
# NOTE(review): the first two string literals below are adjacent with no comma,
# so Python concatenates them into ONE sentence (4 inputs, not 5) — confirm
# whether a comma was intended.
vec = bc.encode(['外交部召见美国驻华大使提出严正交涉敦促美方纠正错误停止利用涉港问题干涉中国内政中国外交部副部''今天天气不错今天天气不错今天天气不错今天天气不错今天天气不错今天天气不错','今天天气不错','亚洲球员在多重看这在上之后武磊二个赛季遭遇前所级区发机会 但是前 轮联赛颗粒无收 当然 这也与西甲联赛一属性有关 历史上能够真正立足西甲联赛的亚洲球员屈指可数 目前西甲联赛也只有中日韩 名球员效力 其馀三大亚洲球星更是只能委身西乙联赛 △目前 从西班牙职业联赛的亚洲球员看 日本球员还是占据主流 名国脚都在西甲或是西乙联赛效力 从球员基数看 日本球员整体适应能力确实了得 良好的职业态度和扎实的基本功 让他们在西班牙联','亚洲球员在西甲分量有多重在上赛季初试身手之后武磊在留洋西甲的第二个赛季遭遇前所未有的困难西班牙人队深陷降级区武磊虽然获得不少首发机会 但是前 轮联赛颗粒无收 当然 这也与西甲联赛一属性有关 历史上能够真正立足西甲联赛的亚洲球员屈指可数 目前西甲联赛也只有中日韩 名球员效力 其馀三大亚洲球星更是只能委身西乙联赛 △目前 从西班牙职业联赛的亚洲球员看 日本球员还是占据主流 名国脚都在西甲或是西乙联赛效力 从球员基数看 日本球员整体适应能力确实了得 良好的职业态度和扎实的基本功 让他们在西班牙联赛获'])

print(vec)
test_distances = PD(vec, vec, n_jobs=8)
print(test_distances)
# `ED` was never defined in this notebook (NameError). Plot the PD matrix
# instead; pairwise_distances uses the Euclidean metric by default.
plt.matshow(test_distances)

调用bert-service服务计算，可能会花费10分钟甚至更久
> 300K数据，max_seq_len=64，双P40耗时10分钟左右

In [None]:
%%time
# Send every cleaned text to the BERT service; clean_vec holds what bc.encode returns
clean_vec = bc.encode(psutd["clean"].tolist())

In [None]:
# Sanity check: dimensions of the encoded result
print(clean_vec.shape)

将向量保存为二进制数据

In [None]:
# Dump the embedding array to disk with ndarray.tofile (raw values only)
hk_nodes_path = "../data/hk_nodes"
with open(hk_nodes_path, 'wb') as nodes_file:
    clean_vec.tofile(nodes_file)

对全体向量进行二维PCA分析

In [None]:
from sklearn.decomposition import PCA
# Project all embeddings onto their first two principal components and scatter-plot them
pca = PCA(n_components=2)
clean_pca2 = pca.fit_transform(clean_vec)
first_pc, second_pc = clean_pca2[:, 0], clean_pca2[:, 1]
matplotlib.pyplot.scatter(first_pc, second_pc, alpha=0.2)

调用邻接边计算程序，同时得到并查集

In [None]:
%%time
node_num = clean_vec.shape[0]
node_dim = clean_vec.shape[1]
threshold = 18.0
os.system(' '.join(["cd ../Kluster; cd bin; ./linker ../data/hk_nodes ../data/hk_edges.csv",str(node_num),str(node_dim),str(threshold)]))
hk_edge = pd.read_csv("../Kluster/data/hk_edges..csv") 

In [None]:
# Display the edge table read from the linker's CSV output
hk_edge

In [None]:
# Histogram of the `distance` column of the edge table
hk_edge['distance'].hist(bins=200)

分析向量的相似程度

In [None]:
%%time
edm = PD(clean_vec[:1000],clean_vec[:1000],n_jobs=8)
print(edm)
matplotlib.pyplot.matshow(edm)

读取&分析并查集结果

In [None]:
def read_set(path):
    """Parse a disjoint-set dump file into a dict.

    The first line of the file is skipped. Every remaining line has its last
    two characters removed and is then read as `<set_id>:<node>,<node>,...`.

    Args:
        path: path of the text file to read.

    Returns:
        dict mapping each integer set id to the list of integer node ids
        found on its line.
    """
    with open(path, 'r') as set_file:
        body_lines = set_file.readlines()[1:]

    disjoint_set = {}
    for raw_line in body_lines:
        # Drop the final two characters before parsing
        # (presumably a trailing separator plus the newline — confirm against the file format)
        fields = raw_line[:-2].split(':')
        set_id = int(fields[0])
        disjoint_set[set_id] = list(map(int, fields[1].split(',')))
    return disjoint_set

In [None]:
%%time
# Parse the disjoint-set dump with read_set.
# NOTE(review): this cell reads "../data/set.txt" whereas the later cells read
# "../Kluster/data/set.txt" — confirm which path is intended.
disjoint_set = read_set("../data/set.txt")

In [None]:
# Number of sets that were parsed
len(disjoint_set)

找出最大的并查集

In [None]:
%%time
disjoint_set = read_set("../Kluster/data/set.txt")
biggest_set = 0
bs_len = 1
for set_id,node_list in disjoint_set.items():
    if len(node_list)>bs_len:
        biggest_set = set_id
        bs_len = len(node_list)

print(bs_len)
print(disjoint_set[biggest_set])

找到最大并查集中的项，分析其相似性

In [None]:
# Collect the embeddings that belong to the largest set
set_vec = [clean_vec[vec_id] for vec_id in disjoint_set[biggest_set]]
# `ED` is not defined anywhere in this notebook (NameError). Use PD
# (sklearn's pairwise_distances, imported in the BERT test cell), whose
# default metric is Euclidean. Only the first 1000 members are compared.
edm = PD(set_vec[:1000], set_vec[:1000])
print(edm)
plt.matshow(edm)

对比双十一数据

In [None]:
# Load the Double-11 table, replace missing values with 0.0 and scale every value by 100
csv_data = pd.read_csv("../data/double11_1020_1120.csv").fillna(0.0)
csv_data *= 100.0

# Round to 5 decimals, then drop rows that repeat an earlier row on every
# column except the first one
feature_columns = csv_data.columns[1:]
csv_data_u = csv_data.round(5).drop_duplicates(subset=feature_columns, keep='first')

# csv_data_u = csv_data_u.sample(n=65536, frac=None, replace=False, weights=None, random_state=None, axis=0)
# Drop the first column and cast the rest to float32
csv_data_u_cut = csv_data_u.iloc[:, 1:]
csv_data_u_float = csv_data_u_cut.astype('float32')
print(csv_data_u_float.shape)

# for x in csv_data_u_float.duplicated():
#     if (x is True):
#         print("duplication exist")
#         break

# Dump the float32 values to a binary file with ndarray.tofile
eco_nodes_path = "../data/eco_nodes"
with open(eco_nodes_path, 'wb') as nodes_file:
    csv_data_u_float.values.tofile(nodes_file)

# with open("../Kluster/data/eco_nodes.csv",'w') as csv_output:
#     csv_data_u.to_csv(csv_output)

In [None]:
%%time
node_num_c = csv_data_u_float.shape[0]
node_dim_c = csv_data_u_float.shape[1]
threshold_c = 0.1
os.system(' '.join(["cd ..; cd bin; ./linker ../data/eco_nodes ../data/eco_edges.csv",str(node_num_c),str(node_dim_c),str(threshold_c)]))
eco_edge = pd.read_csv("../Kluster/data/eco_edges.csv") 

In [None]:
# Histogram of the `distance` column of the Double-11 edge table
eco_edge['distance'].hist(bins=200)

In [None]:
%%time
disjoint_set = read_set("../Kluster/data/set.txt")
biggest_set = 0
bs_len = 1
for set_id,node_list in disjoint_set.items():
    if len(node_list)>bs_len:
        biggest_set = set_id
        bs_len = len(node_list)

print(bs_len)
print(disjoint_set[biggest_set])

In [None]:
# Collect the (positionally indexed) rows that belong to the largest set
set_vec = [csv_data_u_float.iloc[vec_id] for vec_id in disjoint_set[biggest_set]]
# `ED` is not defined anywhere in this notebook (NameError). Use PD
# (sklearn's pairwise_distances, imported in the BERT test cell), whose
# default metric is Euclidean. Only the first 1000 members are compared.
edm = PD(set_vec[:1000], set_vec[:1000])
print(edm)
plt.matshow(edm)