In [23]:
import json
import jieba as jb

In [24]:
def preprocess(data):
    """Turn a raw text string into a list of jieba tokens.

    The text first loses its spaces (``remove_space``), is then reduced to
    the characters accepted by ``is_chinese`` (``format_str``), and is
    finally segmented with ``jb.cut``.

    Returns:
        list: the tokens produced by jieba, in order.
    """
    cleaned = format_str(remove_space(data))
    return list(jb.cut(cleaned))

def remove_space(data):
    """Return ``data`` with every ' ' (U+0020) character deleted.

    Other whitespace such as tabs or newlines is left untouched.
    """
    return ''.join(data.split(' '))

def is_chinese(uchar):
    """Report whether ``uchar`` falls in the range U+4E00..U+9FA5, inclusive."""
    return u'\u4e00' <= uchar <= u'\u9fa5'

def format_str(content):
    """Return the characters of ``content`` that ``is_chinese`` accepts.

    Everything else (digits, Latin letters, punctuation, ...) is dropped;
    the relative order of the kept characters is unchanged.
    """
    return ''.join(char for char in content if is_chinese(char))

In [25]:
# dev.txt: one raw sentence per line.

dev = []
with open('./data/dev.txt', encoding='utf-8') as f:
    # Drop the newline characters; the rest of each line is kept verbatim.
    dev.extend(raw_line.replace('\n', '') for raw_line in f)

In [26]:
# Number of sentences read from ./data/dev.txt.
len(dev)

10000

In [27]:
# Peek at the first dev sentence while it is still a raw string.
dev[0]

'全身瘦产品，泰国DC。泰国yanhee一对一定制，提供视频包邮'

In [28]:
# Replace every raw dev sentence with its token list, in place.
# Not idempotent: a second run would pass token lists (not strings) back
# into preprocess(), whose first step calls str.replace on its argument.
for idx, raw_text in enumerate(dev):
    dev[idx] = preprocess(raw_text)

In [29]:
# Same sentence after preprocessing: dev now holds token lists.
dev[0]

['全身', '瘦', '产品', '泰国', '泰国', '一对一', '定制', '提供', '视频', '包邮']

In [30]:
# entity_kb.txt: the knowledge base, one JSON object per line.
entity = []
# Decode explicitly as UTF-8, exactly like dev.txt and train.txt. Without
# `encoding`, open() uses the platform's locale encoding, which on systems
# whose default is not UTF-8 either raises UnicodeDecodeError or silently
# garbles the Chinese text.
with open('./data/entity_kb.txt', encoding='utf-8') as f:
    for line in f:
        # json.loads ignores the trailing newline of each line.
        entity.append(json.loads(line))

In [31]:
# Number of knowledge-base records loaded from ./data/entity_kb.txt.
len(entity)

277296

In [32]:
# Inspect the first knowledge-base record to see its structure.
entity[0]

{'type': 'Publication',
 'subject_id': 242847,
 'subject': '新出吐鲁番文书及其研究',
 'data': [{'predicate': '出版社', 'object': '新疆人民出版社'},
  {'predicate': '作者', 'object': '柳洪亮 著'},
  {'predicate': '出版时间', 'object': '1997'},
  {'predicate': '卷数', 'object': None},
  {'predicate': '装帧', 'object': '精装'}]}

In [33]:
# Flatten each knowledge-base record into one string: the subject followed
# by every non-null attribute value, concatenated without separators.
entity_sentences = []
for record in entity:
    subject = record['subject']
    if subject is None:
        # Records without a subject are dropped entirely.
        continue

    sentence = subject
    for attribute in record['data']:
        value = attribute['object']
        if value is None:
            continue
        sentence += str(value)

    entity_sentences.append({
        'subject_id': record['subject_id'],
        'sentence': sentence,
    })

In [34]:
# Records kept after skipping those whose 'subject' is None.
len(entity_sentences)

277292

In [35]:
# First flattened record: subject_id plus the concatenated sentence string.
entity_sentences[0]

{'subject_id': 242847, 'sentence': '新出吐鲁番文书及其研究新疆人民出版社柳洪亮 著1997精装'}

In [36]:
# Tokenise every entity sentence in place and also collect the token lists
# in `entities`, in the same order as `entity_sentences`.
entities = []
for record in entity_sentences:
    tokens = preprocess(record['sentence'])
    record['sentence'] = tokens
    entities.append(tokens)

In [37]:
# Token list of the first entity sentence after preprocessing.
entities[0]

['新出', '吐鲁番', '文书', '及其', '研究', '新疆人民出版社', '柳', '洪亮', '著', '精装']

In [38]:
# train.txt: one JSON object per line.

train = []

with open('./data/train.txt', encoding='utf-8') as f:
    # Remove the newline characters, then parse the line as JSON.
    train.extend(json.loads(raw_line.replace('\n', '')) for raw_line in f)

In [39]:
# Number of training samples read from ./data/train.txt.
len(train)

83192

In [40]:
# Inspect the first training sample to see its structure.
train[0]

{'text_id': 137590,
 'text': '香港代购 金油万软膏 新加坡虎牌万金油19.4g克',
 'implicit_entity': [{'subject': '虎标万金油', 'subject_id': 140025}]}

In [None]:
# Token lists for the 'text' field of every training sample.
train_text = [preprocess(sample['text']) for sample in train]

In [None]:
from gensim.models import word2vec
import numpy as np

# Embedding corpus: every token list seen so far (train texts, entity
# sentences and dev sentences) concatenated into one list of sentences.
all_data = train_text + entities + dev

# Train word vectors on the combined corpus.
# NOTE(review): `size=` is the gensim 3.x spelling of the vector-dimension
# keyword (gensim 4 calls it `vector_size`) — confirm the pinned gensim version.
# NOTE(review): no `seed` is passed, so the learned vectors presumably differ
# from run to run — confirm whether reproducibility matters here.
w2v_model = word2vec.Word2Vec(all_data, size=256, min_count=1, window=5)

In [None]:
def sentence2vec(sentence, model=None):
    """Look up the embedding of every token in ``sentence``.

    Vectors are read through ``model.wv`` (membership test and indexing)
    instead of indexing the model object itself or reaching into
    ``wv.vocab``; both of those older access paths are deprecated in gensim.

    Args:
        sentence: sequence of tokens.
        model: object exposing ``wv`` (supports ``token in wv`` and
            ``wv[token]``) and ``vector_size``. When omitted, the
            notebook-level ``w2v_model`` is used.

    Returns:
        list: one vector per token, in input order. A token that ``model.wv``
        does not contain is represented by a zero vector of length
        ``model.vector_size``.
    """
    if model is None:
        model = w2v_model
    vectors = model.wv
    return [
        vectors[token] if token in vectors else np.zeros(model.vector_size)
        for token in sentence
    ]

In [None]:
# Collapse every dev sentence into a single vector: the mean of its token
# embeddings, accumulated in float64. dev is overwritten in place.
for idx, tokens in enumerate(dev):
    token_vectors = sentence2vec(tokens)

    if not token_vectors:
        # A sentence with no tokens has nothing to average. Mark it with an
        # explicit all-NaN vector instead of computing 0/0, which yields the
        # same values but only as a side effect of a RuntimeWarning.
        # NaN rows cannot be scored by the matching step, so such sentences
        # end up without a match.
        dev[idx] = np.full(w2v_model.vector_size, np.nan)
        continue

    pooled = np.zeros(w2v_model.vector_size)
    for vector in token_vectors:
        pooled += vector
    dev[idx] = pooled / len(token_vectors)

In [None]:
# Collapse every entity sentence into a single vector: the mean of its token
# embeddings, accumulated in float64. The 'sentence' field is overwritten.
for record in entity_sentences:
    token_vectors = sentence2vec(record['sentence'])

    if not token_vectors:
        # An entity with no tokens has nothing to average. Mark it with an
        # explicit all-NaN vector instead of computing 0/0, which yields the
        # same values but only as a side effect of a RuntimeWarning.
        # NaN rows cannot be scored by the matching step, so such entities
        # are never selected.
        record['sentence'] = np.full(w2v_model.vector_size, np.nan)
        continue

    pooled = np.zeros(w2v_model.vector_size)
    for vector in token_vectors:
        pooled += vector
    record['sentence'] = pooled / len(token_vectors)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
from tqdm import tqdm

# Nearest-entity search: each dev sentence gets the subject_id of the entity
# whose pooled vector has the highest cosine similarity, or -1 when nothing
# can be scored. Ties go to the entity that appears first.
#
# Cosine similarity is computed as a dot product of unit-length rows, one
# batch of dev rows against all entities at a time, rather than one library
# call per (dev, entity) pair. Rows containing NaN/inf (sentences that had no
# tokens to average) are filtered out explicitly, so no blanket `except` is
# needed: genuine errors surface and Ctrl-C still interrupts the run.
SCORE_BATCH = 64  # dev rows per matrix product; bounds the score buffer to SCORE_BATCH x n_entities


def _row_norms(matrix):
    """Return each row's L2 norm as a column; zero norms become 1 so all-zero rows stay zero."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return norms


ans = [-1] * len(dev)

if len(dev) > 0 and len(entity_sentences) > 0:
    dev_matrix = np.asarray(dev, dtype=np.float64)
    entity_matrix = np.asarray(
        [item['sentence'] for item in entity_sentences], dtype=np.float64
    )

    # Only rows made entirely of finite numbers can take part in scoring.
    dev_usable = np.isfinite(dev_matrix).all(axis=1)
    entity_rows = np.flatnonzero(np.isfinite(entity_matrix).all(axis=1))

    # Normalise the usable entity rows once; boolean/fancy indexing copies,
    # so the in-place division does not touch entity_sentences.
    entity_unit = entity_matrix[entity_rows]
    del entity_matrix
    entity_unit /= _row_norms(entity_unit)

    with tqdm(total=len(dev)) as progress:
        for start in range(0, len(dev), SCORE_BATCH):
            stop = min(start + SCORE_BATCH, len(dev))
            rows = np.arange(start, stop)
            rows = rows[dev_usable[rows]]

            if rows.size > 0 and entity_rows.size > 0:
                queries = dev_matrix[rows]
                queries /= _row_norms(queries)
                scores = queries @ entity_unit.T
                best = scores.argmax(axis=1)  # first maximum wins on ties

                for local, (row, col) in enumerate(zip(rows, best)):
                    # A match must beat the initial score of -1 strictly.
                    if scores[local, col] > -1:
                        matched = entity_sentences[int(entity_rows[col])]
                        ans[int(row)] = matched['subject_id']

            progress.update(stop - start)

In [None]:
# Sanity check: one prediction is appended per dev sentence.
len(ans)

In [None]:
# Emit the predicted subject_id of every dev sentence, one per line, in dev order.
for subject_id in ans:
    print(subject_id)