In [None]:
from __future__ import unicode_literals, print_function, division
import pandas as pd
import os
from io import open
import glob
import unicodedata
import string
import torch
import numpy as np
import math
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings
import pickle

warnings.filterwarnings(action = 'ignore')

import gensim 
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader as api
from gensim import utils

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# project files referenced by later cells can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Project directory on the mounted Drive. Later cells build file paths with
# plain string concatenation (cwd + 'name'), so the trailing slash is required.
cwd = '/content/drive/My Drive/nlp_project/'

In [None]:
def load_embeddings(path):
    """Read a whitespace-separated text embedding file into a dict.

    Every line is split on whitespace; the first field is used as the token
    and the remaining fields are parsed as floats.

    Args:
        path: Location of the text file to read (decoded as UTF-8).

    Returns:
        dict mapping token -> list of floats. If a token occurs on several
        lines, the last occurrence wins. Blank lines and lines whose weights
        cannot be parsed as floats are skipped.
    """
    embedding_map = {}
    # Decode explicitly as UTF-8 rather than relying on the platform default,
    # since the tokens are not ASCII.
    with open(path, encoding='utf-8') as f:
        for line in f:
            pieces = line.rstrip().split()
            if not pieces:
                # Blank line: nothing to store.
                continue
            try:
                embedding_map[pieces[0]] = [float(weight) for weight in pieces[1:]]
            except ValueError:
                # Malformed row (e.g. a token containing whitespace): skip it,
                # but let unrelated errors such as KeyboardInterrupt propagate.
                continue
    return embedding_map

def my_save_word2vec_format(fname, vocab, vectors, binary=True, total_vec=2):
    """Write word vectors to ``fname`` in word2vec text or binary format.

    Args:
        fname: Output path, opened for binary writing via ``utils.smart_open``.
        vocab: Mapping of word -> vector (any sequence of numbers). Rows are
            written in the mapping's iteration order.
        vectors: 2-D array of shape ``(len(vocab), vector_size)``. It is used
            for the header's vector size and for the shape check only; the
            values written come from ``vocab``.
        binary: If true, write each row as raw float32 bytes; otherwise write
            space-separated ``repr`` values followed by a newline.
        total_vec: Vector count written in the header line. ``None`` means
            ``len(vocab)``.
            NOTE(review): the default of 2 is kept only for backward
            compatibility and is almost never the right header value; callers
            should pass ``total_vec`` explicitly (or ``None``).

    Raises:
        RuntimeError: If ``vocab`` is empty or ``vectors`` is missing/empty.
        AssertionError: If ``vectors.shape`` is not ``(len(vocab), vector_size)``.
    """
    # Avoid `vocab or vectors`: evaluating the truth value of a multi-element
    # ndarray raises ValueError instead of reporting "no input".
    if not vocab or vectors is None or len(vectors) == 0:
        raise RuntimeError("no input")
    if total_vec is None:
        total_vec = len(vocab)
    vector_size = vectors.shape[1]
    assert (len(vocab), vector_size) == vectors.shape
    with utils.smart_open(fname, 'wb') as fout:
        print(total_vec, vector_size)
        # Header line: "<number of vectors> <vector size>".
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # Rows are stored in the vocab mapping's iteration order.
        for word, row in vocab.items():
            if binary:
                # Rows may be plain Python lists, so coerce to float32 here.
                # tobytes() replaces the removed ndarray.tostring().
                row_bytes = np.asarray(row, dtype=np.float32).tobytes()
                fout.write(utils.to_utf8(word) + b" " + row_bytes)
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row))))

# Parse the raw embedding text file into a {token: [floats]} dict.
d = load_embeddings(cwd + 'sgns.sogou.bigram.txt')
print('model loaded')
# In the cells visible here, `m` is only used as a holder for the vocab dict
# and the stacked vector matrix that are handed to the writer below.
m = gensim.models.keyedvectors.Word2VecKeyedVectors(vector_size=300)
m.vocab = d
# Stack the per-token lists into one array; row order follows d's iteration order.
# NOTE(review): assumes every token has the same number of weights (300) — TODO confirm.
m.vectors = np.array(list(d.values()))
print('write to file')
# Re-export as word2vec text (binary=False) with the header count set to len(d).
my_save_word2vec_format(binary=False, fname=cwd + 'chinese_words.txt', total_vec=len(d), vocab=m.vocab, vectors=m.vectors)
print('finish')


model loaded
write to file
364171 300
finish


# Evaluation

## Get word set

In [None]:
# Load the spreadsheet. header=None means no row is used as column names, so
# columns are labelled by integer position (later cells index columns 31 and 32).
data = pd.read_excel(cwd + '2018 payroll.xlsx', header = None)

In [None]:
# Number of rows read from the spreadsheet.
print(len(data))

36951


In [None]:
# Collect every whitespace-separated token found in columns 31 and 32.
word_set = set()
for column in (31, 32):
    # dropna() skips empty spreadsheet cells; without it str(NaN) would add
    # the bogus token 'nan' to the set.
    for cell in data[column].dropna():
        word_set.update(str(cell).split())
# NOTE(review): this prints the entire set, which is very large for this data.
print(word_set)
print(len(word_set))

{'等', '555', '一点', '危', '私人', '486', '1959', '见', '33', '彭', '完成', '菌', '时是', '投票', '佑', '王波', '制式', '陈惠', '会歌', '梅江', '聋', '漏', 'IT', '149', '电子科技', '1981', '住宿', '春酒', '山东', '工作日', '不分', '赠送', '尤洋', '种植', '张照', '990', '白', '字段', 'og', '会通', '闲', 'th', '385', '或', '未加', '缓和', '大部分', '小一', '程', '臣', '引导', '告', '哈', '通都', '张鑫', '过', '妍', '化', '姜海', '天安门', '审核', '铃声', '城市', '盖上', '一手', '教育科', '周一就', '34', '编码', '改变', '收起', '压缩', '筵', '流出', '修改权', '87', '控制员', '陵', '网络软件', '730', '损失', '讲', '喀什市', '那期', '华利', '用户', '算在内', 'qq', '路由器', '化学试剂', '都反', '申请号', '到时', '381', '下属', '艺', '杂音', '地方', '签完', '407', '空号', '114', '选', '；未', '英文版', '102', '731', '经销处', '时实', '木业', '长虹', '准', '中指', '仲', '费电', '负一', '种类', '上划', '打车', '都认', '回拨', 'g', '正常', '批示', '鸟', '王丽萍', '215', '让', '永康', '差距', '出错', '设立', '222', '以使', '百科', '历史记录', '昕', '额', '都加', '记录人', '51', '旧', 'as', '下回', '分别', '破解', '包装', '森', '酒店', '宾馆', '水印', '李想', '检测', '放在', '494', '悍', '发邮件', '延', '发错', '文', '大连', '十月', '试探', '涌泉', '清楚', '李

## Evaluate similarities

In [None]:
# Load the re-exported vectors as gensim KeyedVectors; binary=False because
# chinese_words.txt was written in the word2vec text format.
model = KeyedVectors.load_word2vec_format(cwd + 'chinese_words.txt', binary=False)

In [None]:
# Probe words whose nearest neighbours are inspected below.
eval_words = ['网络','手机', '定位', '服务', '错误', '响应', '处理', '已', '华为', '在线']

# word -> set of (neighbour, similarity) pairs for its 100 nearest neighbours.
eval_words_map = {}

for word in eval_words:
    # `model` is already a KeyedVectors instance, so query it directly; the
    # `.wv` indirection is deprecated on KeyedVectors and removed in gensim 4.
    eval_words_map[word] = set(model.similar_by_word(word, topn=100))
    # Keep only the neighbours that also occur in word_set.
    sim_word_set = {
        sim_word for sim_word in eval_words_map[word] if sim_word[0] in word_set
    }

    print(sim_word_set)
    print()

{('互联', 0.51678466796875), ('网络界', 0.506811797618866), ('QQ', 0.5266445279121399), ('网路', 0.5477644205093384)}

{('部手机', 0.5981699228286743), ('新手机', 0.5542560815811157), ('IP', 0.5196048617362976), ('笔记本', 0.5356241464614868)}

{('时间轴', 0.4591441750526428), ('定位问题', 0.44625309109687805), ('GPS', 0.4544619619846344)}

{('付款方', 0.5523066520690918), ('咨询服务', 0.5665782690048218)}

{('出错', 0.47132471203804016), ('说错', 0.4682994484901428), ('错误信息', 0.45625966787338257), ('选错', 0.4570876359939575), ('有误', 0.5102933645248413), ('偏差', 0.4545685052871704), ('正确', 0.5502479076385498), ('错', 0.5101631879806519), ('用错', 0.46904629468917847)}

{('启动', 0.4724695682525635), ('反应', 0.40596288442611694)}

{('解决', 0.4550190567970276), ('未处理', 0.4568314552307129)}

{('尚未', 0.5858912467956543), ('并', 0.4743204116821289), ('已有', 0.511445939540863), ('现已', 0.7123315334320068), ('正在', 0.46389561891555786), ('一旦', 0.4568527936935425), ('都已', 0.5848924517631531), ('已将', 0.5623600482940674), ('全部', 0.4469055533

In [None]:
# Load a pickled object from the project directory and bind it to word_set.
# NOTE(review): this rebinds the word_set name that an earlier cell also
# assigns — confirm which version later cells are meant to use.
# NOTE(security): pickle.load can execute arbitrary code; only load pickle
# files that come from a trusted source.
with open(cwd + 'tokenized_word_set.pickle', 'rb') as f:
    word_set = pickle.load(f)

In [None]:
# Sanity check: show the type of the object that was just unpickled.
print(type(word_set))

<class 'set'>
