In [1]:
from __future__ import unicode_literals, print_function, division
import pandas as pd
import os
from io import open
import glob
import unicodedata
import string
import torch
import numpy as np
import math
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings
import pickle

warnings.filterwarnings(action = 'ignore')

import gensim 
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader as api
from gensim import utils

print(gensim.__version__)

3.6.0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:
cwd = '/content/drive/My Drive/nlp_project/'

In [None]:
def load_embeddings(path):
    embedding_map = {}
    with open(path) as f:
        for line in f:
            try:
                pieces = line.rstrip().split()
                embedding_map[pieces[0]] = [float(weight) for weight in pieces[1:]]
            except:
                pass
    return embedding_map

def my_save_word2vec_format(fname, vocab, vectors, binary=True, total_vec=2):
    if not (vocab or vectors):
        raise RuntimeError("no input")
    if total_vec is None:
        total_vec = len(vocab)
    vector_size = vectors.shape[1]
    assert (len(vocab), vector_size) == vectors.shape
    with utils.smart_open(fname, 'wb') as fout:
        print(total_vec, vector_size)
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, row in vocab.items():
            if binary:
                row = row.astype(REAL)
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row))))

d = load_embeddings(cwd + 'sgns.sogou.bigram.txt')
print('model loaded')
m = gensim.models.keyedvectors.Word2VecKeyedVectors(vector_size=300)
m.vocab = d
m.vectors = np.array(list(d.values()))
print('write to file')
my_save_word2vec_format(binary=False, fname=cwd + 'chinese_words.txt', total_vec=len(d), vocab=m.vocab, vectors=m.vectors)
print('finish')


model loaded
write to file
364171 300
finish


# Evaluation

## get word set

In [5]:
data = pd.read_excel(cwd + '2018 payroll.xlsx', header = None)

In [6]:
print(len(data))

36951


In [7]:
word_set = set()
for i in range(len(data)):
  situation = str(data[31][i]).split() + str(data[32][i]).split()
  for w in situation:
    word_set.add(w)
print(word_set)
print(len(word_set))

{'单页', 'PP', '通考', '出示', '促销员', '看', '中联', '泽', '830', '估计', '冲', '长虹', '描述', '关于', '111', '个字符', '发展', '804', '个例', '剩', '县城', '中下', '微云', '15万', '返回', 'LA', '730', '冠', '路线', '表述', '内侧', '五一节', '周口', '出差', '分配', '下属', '常发', '观海', '不停', '锦州', '具体内容', '跟上', '345', '说不清楚', '送货', '售价', '重复', '建', '根', '金立', '骂', '天水市', '影响', '电瓶', '吉林省', '痴呆', '发发', '靠谱', '宣讲', '没关', '经销商', '报销', '咨询员', '拼音', '时尚', '再次出现', '母女', '1887', '端', '待审', '遵义', '2014', '很早', '接电话', '询问', '袁浩', '耗费', '绿色', '序', '开关', '换位', '回手', '玩', '郭爽', '欧阳', '您', '每月', '汪玉', '文档', '养护', '常用', '量过大', '并', '报', 'st', '刚用', '第', '周五', '大格局', '策划', '转回', '缺少', '536', 'xp', '市里', '叱', '张爱', '季度', '新乡市', '地址', '唐盛', '光电', '经', '由', '侯女士', '进在', '为人', '人定', '电脑操作', '昊', '宜静', '白纸黑字', '插入', '洁', '加到', '阶段', '交尾', '九', '天下', '538', '显', '以使', '拨', '全天', '没测', '马女士', '123', '谱', '体型', '临时', '想开', '熟悉', '萍', '秦', '再加', '源', '故障', '贵阳市', 'it', '293', '想尽', '机电工程', '龙翔', 'C', '免费', '要账', '家教', '赞', '题图', '进不了', '泰州市', '王慧', '王亚', '涂', '加人

## evaluate similarities

In [8]:
model = KeyedVectors.load_word2vec_format(cwd + 'chinese_words.txt', binary=False)

In [11]:
eval_words = ['网络','手机', '定位', '服务', '错误', '数据', '锤子', '设置', '华为', '查看', '客户', '文件', '苹果','模块']

eval_words_map = {}

for word in eval_words:
  print(word)
  eval_words_map[word] = set(model.wv.similar_by_word(word, topn=200))
  sim_word_set = set()
  for sim_word in eval_words_map[word]:
    if sim_word[0] in word_set:
      print(sim_word)
      sim_word_set.add(sim_word[0])
  eval_words_map[word] = sim_word_set

网络
('网络连接', 0.4905589818954468)
('网路', 0.5477644205093384)
('QQ', 0.5266445279121399)
('视频', 0.49045369029045105)
('用户账号', 0.4880825877189636)
('网络界', 0.506811797618866)
('互联', 0.51678466796875)
('CRM', 0.4880991280078888)
('移动网', 0.48785609006881714)
手机
('笔记本', 0.5356241464614868)
('移动电话', 0.5167263150215149)
('刷机', 0.5005427002906799)
('机时', 0.4934317469596863)
('QQ', 0.5067571401596069)
('兼容问题', 0.5106188058853149)
('IP', 0.5196048617362976)
('白屏', 0.494381308555603)
('重拨', 0.5123473405838013)
('电脑', 0.5064799785614014)
('手机软件', 0.5104949474334717)
('诺基亚', 0.49413013458251953)
('终端', 0.5033129453659058)
('PC', 0.5169134736061096)
('手机卡', 0.5094220638275146)
('新手机', 0.5542560815811157)
('平板', 0.5096657276153564)
('部手机', 0.5981699228286743)
定位
('web', 0.43357449769973755)
('GPS', 0.4544619619846344)
('WEB', 0.4273630380630493)
('客户群', 0.43275028467178345)
('时间轴', 0.4591441750526428)
('网络状', 0.4284420907497406)
('定位问题', 0.44625309109687805)
('导入到', 0.42671823501586914)
服务
('咨询服务', 0.56

In [14]:
for word in eval_words:
  print(word)
  dist = model.distances(word, eval_words_map[word])
  for i in range(len(dist)):
    print(list(eval_words_map[word])[i], dist[i])
  print()

网络
QQ 0.4733554
网络界 0.4931882
视频 0.5095462
用户账号 0.5119174
CRM 0.5119009
互联 0.48321533
移动网 0.51214385
网络连接 0.509441
网路 0.45223558

手机
重拨 0.48765278
手机软件 0.4895051
笔记本 0.46437585
QQ 0.49324286
刷机 0.49945736
电脑 0.49352008
平板 0.49033433
机时 0.5065683
移动电话 0.48327363
兼容问题 0.48938125
手机卡 0.49057788
白屏 0.5056187
部手机 0.40183026
新手机 0.44574398
终端 0.49668705
PC 0.48308653
诺基亚 0.50586987
IP 0.48039508

定位
时间轴 0.5408558
定位问题 0.55374694
网络状 0.57155794
导入到 0.57328176
WEB 0.57263696
GPS 0.54553807
web 0.5664255
客户群 0.5672498

服务
短信通 0.48183674
付款方 0.44769323
咨询服务 0.43342173
CRM 0.47013187

错误
it 0.55127764
说错 0.53170055
偏差 0.5454315
选错 0.5429124
用错 0.53095376
有错 0.5611312
弄错 0.5590298
失败 0.5566236
错 0.4898368
出错 0.52867526
正确 0.44975203
有误 0.48970658
错误信息 0.5437404

数据
数据报 0.5361957
统计数字 0.4586829
旬报 0.5253856
数字 0.5339682
栏位 0.5440171
数据量 0.50616
统计 0.3457935
统计数据 0.34330255
薛静 0.54263175
显示 0.3978607
历史数据 0.42059743
CRM 0.5390059
数据表 0.4544922

锤子
钉钉 0.43636328

设置
划分 0.5218867
设定 0.45938337
弹窗 0.53

In [None]:
with open(cwd + 'tokenized_word_set.pickle', 'rb') as f:
    word_set = pickle.load(f)

In [None]:
print(type(word_set))

<class 'set'>
