In [1]:
from __future__ import unicode_literals, print_function, division
import pandas as pd
import os
from io import open
import glob
import unicodedata
import string
import torch
import numpy as np
import math
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings
import pickle

warnings.filterwarnings(action = 'ignore')

import gensim 
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader as api
from gensim import utils

print(gensim.__version__)

3.6.0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
cwd = '/content/drive/My Drive/nlp_project/'

In [None]:
def load_embeddings(path):
    embedding_map = {}
    with open(path) as f:
        for line in f:
            try:
                pieces = line.rstrip().split()
                embedding_map[pieces[0]] = [float(weight) for weight in pieces[1:]]
            except:
                pass
    return embedding_map

def my_save_word2vec_format(fname, vocab, vectors, binary=True, total_vec=2):
    if not (vocab or vectors):
        raise RuntimeError("no input")
    if total_vec is None:
        total_vec = len(vocab)
    vector_size = vectors.shape[1]
    assert (len(vocab), vector_size) == vectors.shape
    with utils.smart_open(fname, 'wb') as fout:
        print(total_vec, vector_size)
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, row in vocab.items():
            if binary:
                row = row.astype(REAL)
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row))))

d = load_embeddings(cwd + 'sgns.sogou.bigram.txt')
print('model loaded')
m = gensim.models.keyedvectors.Word2VecKeyedVectors(vector_size=300)
m.vocab = d
m.vectors = np.array(list(d.values()))
print('write to file')
my_save_word2vec_format(binary=False, fname=cwd + 'chinese_words.txt', total_vec=len(d), vocab=m.vocab, vectors=m.vectors)
print('finish')


model loaded
write to file
364171 300
finish


# Evaluation

## get word set

In [4]:
data = pd.read_excel(cwd + '2018 payroll.xlsx', header = None)

In [5]:
print(len(data))

36951


In [6]:
word_set = set()
for i in range(len(data)):
  situation = str(data[31][i]).split() + str(data[32][i]).split()
  for w in situation:
    word_set.add(w)
print(word_set)
print(len(word_set))

{'没下', '洪先生', '没什么', '注册', 'One', 'CRM', '配合', '国庆节', '要以', '拓', '热点', '每月', '泰科', '明天', '孙鹏', '机电工程', '净增', '经常出现', '潮州市', '信号', '有人', '成其', '推销', '方应', '688', '合同条款', '536', 'Q', '0762', '拍照', '没记', '太贵', '动态', '运', '逻辑', '从业', '定', '发不', '前置', '签订合同', 'us', '代办', '人和', '设备', '换人', '规律', '用品', '两端', '1.7', '先包', '低下', '移动网', '惠州市', '不安', '群发', '绩', '周导', '转给', '积分', '暂停', '下班', '只', '周三前', '吴伟', '711', 'es', '灰白色', '虹', '都选', '新能源', 'KEY', '单打', '街道', '艾', '346', '药膏', '全新', '案例', '录制', '96', '王先生', '063', '内', '勿扰', 'F', '回绝', '片区', '转成', '审批权', '交接', '打通', '交出来', 'NM', '列车', '收起', '移到', '317', '拦截', '缺少', '媒体业', '未上市', '先要', '备货', '声音', '此', '558', '470', '项目', 'S-', '时自', '点选', '服务期', '嘉银', '地板', '没听', '支支吾吾', '555', '很久', '招聘', '绿康', '润滑油', '黄双', '胡', '花生', '畅', '讲清楚', '翼', '使用指南', '据信', '了账', '家家', '演示', '348', '*', '取到', '一团乱', '么', '及时', '延长', '杜先生', '详细', '未来', '前台', '奔腾', '郝', '范小姐', '在线', '减低', '技术', '一早', '100', '消息', '观察', '无业', '莹莹', '转达', '标注', '67', '包装', '×', '网络界', '

## evaluate similarities

In [7]:
model = KeyedVectors.load_word2vec_format(cwd + 'chinese_words.txt', binary=False)

In [9]:
eval_words = ['网络','手机', '定位', '服务', '错误', '数据', '设置', '华为', '查看', '客户', '文件', '苹果','模块']

eval_words_map = {}

for word in eval_words:
  print(word)
  eval_words_map[word] = set(model.wv.similar_by_word(word, topn=200))
  sim_word_set = set()
  for sim_word in eval_words_map[word]:
    if sim_word[0] in word_set:
      print(sim_word)
      sim_word_set.add(sim_word[0])
  eval_words_map[word] = sim_word_set

网络
('视频', 0.49045369029045105)
('用户账号', 0.4880825877189636)
('网络连接', 0.4905589818954468)
('网路', 0.5477644205093384)
('移动网', 0.48785609006881714)
('网络界', 0.506811797618866)
('CRM', 0.4880991280078888)
('互联', 0.51678466796875)
('QQ', 0.5266445279121399)
手机
('刷机', 0.5005427002906799)
('新手机', 0.5542560815811157)
('重拨', 0.5123473405838013)
('电脑', 0.5064799785614014)
('部手机', 0.5981699228286743)
('笔记本', 0.5356241464614868)
('平板', 0.5096657276153564)
('兼容问题', 0.5106188058853149)
('手机卡', 0.5094220638275146)
('诺基亚', 0.49413013458251953)
('移动电话', 0.5167263150215149)
('白屏', 0.494381308555603)
('机时', 0.4934317469596863)
('手机软件', 0.5104949474334717)
('PC', 0.5169134736061096)
('IP', 0.5196048617362976)
('终端', 0.5033129453659058)
('QQ', 0.5067571401596069)
定位
('时间轴', 0.4591441750526428)
('网络状', 0.4284420907497406)
('导入到', 0.42671823501586914)
('客户群', 0.43275028467178345)
('GPS', 0.4544619619846344)
('定位问题', 0.44625309109687805)
('WEB', 0.4273630380630493)
('web', 0.43357449769973755)
服务
('付款方', 0.552

In [11]:
# for word in eval_words:
#   print(word)
#   dist = model.distances(word, eval_words_map[word])
#   for i in range(len(dist)):
#     print(list(eval_words_map[word])[i], dist[i])
#   print()

In [13]:
sim_words = ['网路', '互联', 'QQ', '部手机', '新手机', '笔记本', 
             '时间轴', 'GPS', '定位问题', '付款方', 'CRM','咨询服务', 
             '正确', '错', '有误', '统计数据', '统计', '显示',
             '设立', '设', '可设', '中兴', '宇龙', '诺基亚',
             '点击', '仔细', '核查', '客户群', '短信通','用户',
             '规定', '抄送', '程序有', '谷歌', '黑莓', '诺基亚',
             '功能模块', 'CA', 'CRM']
eval_words

['网络', '手机', '定位', '服务', '错误', '数据', '设置', '华为', '查看', '客户', '文件', '苹果', '模块']

In [15]:
for word in eval_words:
  print(word)
  for sim in sim_words:
    print(sim, model.similarity(word, sim))
  print()

网络
网路 0.5477645
互联 0.51678467
QQ 0.5266445
部手机 0.21766092
新手机 0.29048124
笔记本 0.22005607
时间轴 0.42260474
GPS 0.37178415
定位问题 0.30445313
付款方 0.33003592
CRM 0.48809916
咨询服务 0.2239828
正确 0.18185362
错 0.13899451
有误 0.17791264
统计数据 0.183779
统计 0.19009843
显示 0.21561338
设立 0.20277989
设 0.1803557
可设 0.2286203
中兴 0.18474887
宇龙 0.2967091
诺基亚 0.27126616
点击 0.16547534
仔细 0.16047184
核查 0.19476423
客户群 0.23241821
短信通 0.41876405
用户 0.32762933
规定 0.21374558
抄送 0.21360153
程序有 0.3312874
谷歌 0.27065572
黑莓 0.25440145
诺基亚 0.27126616
功能模块 0.34067437
CA 0.3264036
CRM 0.48809916

手机
网路 0.27799645
互联 0.32165334
QQ 0.5067572
部手机 0.59816986
新手机 0.55425614
笔记本 0.5356241
时间轴 0.35846925
GPS 0.40034872
定位问题 0.2844387
付款方 0.39532447
CRM 0.42827758
咨询服务 0.15267555
正确 0.106189534
错 0.1684431
有误 0.11905158
统计数据 0.11687635
统计 0.12575659
显示 0.16806346
设立 0.07833815
设 0.085539654
可设 0.2137481
中兴 0.29563206
宇龙 0.45245203
诺基亚 0.49413013
点击 0.12713341
仔细 0.16700518
核查 0.13106717
客户群 0.21492335
短信通 0.42468518
用户 0.36716282
规定 0.18