In [2]:
import os
import jieba
import pprint
import re, string
from collections import defaultdict
from gensim.models.keyedvectors import KeyedVectors
from smart_open import open # for transparently opening compressed files

In [3]:
class MyCorpus:
    """Streaming corpus that yields one tokenised line at a time.

    Files are read lazily, line by line, so the whole data set never has to
    fit in memory. Each yielded item is the token generator returned by
    ``jieba.cut`` for one cleaned line of text.
    """

    # ASCII letters are dropped so only non-English text reaches the tokeniser.
    _LATIN_LETTERS = re.compile(r'[a-zA-Z]')
    # ASCII punctuation and digits. re.escape is required: string.punctuation
    # contains ``\``, ``]``, ``^`` and ``-``, which are special inside a
    # character class (unescaped, the backslash would never be removed).
    _PUNCT_AND_DIGITS = re.compile('[%s]' % re.escape(string.punctuation + string.digits))
    # Lines starting with these prefixes are skipped entirely
    # (presumably WARC record / HTTP header lines -- confirm against the data).
    _SKIPPED_PREFIXES = ('WARC', 'Content')

    def __init__(self, root_path, file_list):
        """
        Args:
            root_path - root path to the files
            file_list - list of website file names, relative to root_path
        """
        self.root_path = root_path
        self.file_list = file_list

    def __iter__(self):
        """Yield a ``jieba.cut`` token generator for every non-empty cleaned line.

        Raises whatever ``open`` raises when a listed file cannot be opened.
        """
        for filename in self.file_list:
            # Use the instance attribute (not a notebook global) and let
            # os.path.join supply the separator between directory and file.
            path = os.path.join(self.root_path, filename)
            with open(path, encoding='utf-8') as f:
                for line in f:
                    if line.startswith(self._SKIPPED_PREFIXES):
                        continue
                    line = self._LATIN_LETTERS.sub('', line)      # remove English characters
                    line = self._PUNCT_AND_DIGITS.sub('', line)   # remove ASCII punctuation and digits
                    # Lines read from a file keep their '\n' / '\r\n' terminator,
                    # so strip before testing for emptiness.
                    line = line.strip()
                    if not line:
                        continue
                    yield jieba.cut(line, cut_all=False)

In [4]:
# Directory holding the crawled web-data shards; named once so the listing
# below and the corpus root cannot drift apart.
webdata_dir = '../webdata'
# os.listdir returns entries in arbitrary order, so sort before slicing to make
# the choice of the first two 'part-' shards reproducible across runs.
file_list = sorted(f for f in os.listdir(webdata_dir) if f.startswith('part-'))[:2]
my_corpus = MyCorpus(webdata_dir, file_list)

FileNotFoundError: [Errno 2] No such file or directory: '../webdata'

In [6]:
# Location of the pre-trained embedding in word2vec text format.
embedding_file = '../embedding/Tencent_AILab_ChineseEmbedding.txt'
# binary=False: the file is parsed as plain text rather than the binary format.
wv_from_text = KeyedVectors.load_word2vec_format(
    fname=embedding_file,
    binary=False,
)

In [7]:
# Analogy-style query: rank words by similarity to the positive words,
# with the negative word counted against them.
wv_from_text.most_similar(
    positive=['女人', '皇帝'],
    negative=['男人'],
)

[('皇后', 0.8758628368377686),
 ('太后', 0.8436370491981506),
 ('皇太后', 0.8422078490257263),
 ('妃子', 0.8377931118011475),
 ('嫔妃', 0.8314578533172607),
 ('后宫嫔妃', 0.8293792009353638),
 ('后宫妃子', 0.8252508044242859),
 ('后妃', 0.8230723738670349),
 ('贵妃', 0.8157583475112915),
 ('妃嫔', 0.8150229454040527)]

In [8]:
# Nearest neighbours of a single word in the embedding space.
wv_from_text.similar_by_word(word='体育')

[('及体育', 0.745394229888916),
 ('以及体育', 0.7444100379943848),
 ('体育传媒', 0.7442153096199036),
 ('体育科技', 0.7429567575454712),
 ('包括体育', 0.7428886294364929),
 ('体育方面', 0.7395599484443665),
 ('体育领域', 0.7245116233825684),
 ('体育产业', 0.7185760736465454),
 ('体育相关', 0.7166426181793213),
 ('娱乐体育', 0.7142459750175476)]

In [9]:
# Similarity score between the vectors of two words.
wv_from_text.similarity(
    '体育',
    '金融',
)

0.5286123