In [86]:
import os
import time
import jieba
import pprint
import re, string
import numpy as np
from gensim import corpora
from sklearn.cluster import KMeans
from collections import defaultdict
from gensim.models.keyedvectors import KeyedVectors
from smart_open import open # for transparently opening compressed files

### Corpus class for getting one document at a time from the whole dataset

In [43]:
class MyCorpus:
    """Streaming corpus that yields one processed document (one input line) at a time.

    Each line of each file is cleaned, segmented with jieba and split into
    ordinary words and city names. Only lines that mention at least two
    cities are yielded, because a city link needs two endpoints.
    """

    # ASCII letters, digits and punctuation stripped before segmentation.
    # re.escape keeps characters such as the backslash, ']' and '^' literal
    # inside the character class, so every punctuation character is removed.
    _DROP_PATTERN = re.compile('[a-zA-Z%s]' % re.escape(string.punctuation + string.digits))
    # Line prefixes treated as meta-information rather than page content.
    _META_PREFIXES = ('\r', 'WARC', 'Content')

    def __init__(self, root_path, file_list, dictionary, stop_list, city_list):
        """
        Args:
            root_path - root path to the website files
            file_list - list of website file names, relative to root_path
            dictionary - mapping from words to ids (stored, but not used by this class)
            stop_list - set of Chinese stop words
            city_list - set of Chinese city names
        """
        self.root_path = root_path
        self.file_list = file_list
        self.dictionary = dictionary
        self.stop_list = stop_list
        self.city_list = city_list

    def __iter__(self):
        """Yield ``{'words': [...], 'cities': [...]}`` for every usable line."""
        for filename in self.file_list:
            path = os.path.join(self.root_path, filename)
            with open(path, encoding='utf-8') as f:
                for line in f:
                    words = self._process(line)
                    # fewer than 2 tokens cannot contain 2 cities
                    if words is None or len(words) < 2:
                        continue
                    words, cities = self._retrieve_cities(words)
                    # fewer than 2 cities cannot form a link
                    if len(cities) < 2:
                        continue
                    yield {'words': words, 'cities': cities}

    def _process(self, line):
        """Clean and segment one raw line.

        Returns:
            A list of tokens with stop words and whitespace-only tokens
            removed, or None when the line is blank, is meta-information, or
            contains nothing but ASCII letters, digits and punctuation.
        """
        # drop blank lines and meta-info
        if not line.strip() or line.startswith(self._META_PREFIXES):
            return None
        # drop alphabetic characters, digits and punctuation
        line = self._DROP_PATTERN.sub('', line)
        # nothing but whitespace left
        if not line.strip():
            return None
        # segment with jieba; skip whitespace tokens (spaces, the trailing
        # newline) and stop words
        return [
            word
            for word in jieba.cut(line, cut_all=False)
            if word.strip() and word not in self.stop_list
        ]

    def _retrieve_cities(self, words):
        """Split tokens into non-city words and city names.

        Caution: cities are removed from the document because cities are not
        supposed to be related to any category.

        Returns:
            (words_without_cities, cities), both in original order. The input
            list is left unmodified.
        """
        remaining = []
        cities = []
        for word in words:
            if word in self.city_list:
                cities.append(word)
            else:
                remaining.append(word)
        return remaining, cities

### Class that calculates the frequency of every category in a document

In [52]:
class FrequencyCalculator:
    """Calculate the frequency of every category in a document by comparing
    the similarity between each word and each category keyword.
    """

    def __init__(self, wv, nclass, keywords, th):
        """
        Args:
            wv - word vectors; must support ``word in wv`` and ``wv.similarity(w1, w2)``
            nclass - number of categories
            keywords - nested list holding one keyword list per category
                (culture, economy, sports, ...), e.g.
                [[economy, finance, ...], [technology, internet, ...], ...];
                nclass lists in total
            th - similarity threshold. Similarities that do not exceed the
                threshold are discarded.
        """
        self.wv = wv
        self.nclass = nclass
        self.keywords = keywords
        self.th = th

    def calc(self, words):
        """Calculate the per-category frequency of a document and return it.

        Args:
            words - iterable of tokens

        Returns:
            dict mapping category number (1..nclass) to the number of
            (word occurrence, keyword) pairs whose similarity exceeds ``th``.
            Words missing from ``wv`` are ignored.
        """
        freq = {i: 0 for i in range(1, self.nclass + 1)}

        # Count in-vocabulary occurrences first so that the similarity
        # computations run once per distinct word instead of once per
        # occurrence; the totals are identical.
        occurrences = defaultdict(int)
        for word in words:
            if word in self.wv:
                occurrences[word] += 1

        for word, count in occurrences.items():
            for i, category in enumerate(self.keywords, 1):
                hits = sum(1 for key in category if self.wv.similarity(word, key) > self.th)
                if hits:
                    freq[i] += hits * count
        return freq

### Class that converts the per-document frequencies into the frequency of each two-city link across the whole corpus

In [None]:
class FrequencyConverter:
    """Placeholder for converting per-document category frequencies into
    per-city-link frequencies over the whole corpus.

    TODO: not implemented yet.
    """

### Load the dictionary

In [3]:
# Build the vocabulary from the first column of the embedding file once and
# cache it on disk; later runs only load the cached copy.
dict_path = '../dict/dict.dict'
if os.path.exists(dict_path):
    dictionary = corpora.Dictionary.load(dict_path)  # already processed from embedding file
else:
    with open('../embedding/Tencent_AILab_ChineseEmbedding.txt', encoding='utf-8') as f:
        next(f, None)  # skip the header line
        texts = [line.split(' ')[0] for line in f]
    dictionary = corpora.Dictionary([texts])
    # make sure the cache directory exists before saving
    os.makedirs(os.path.dirname(dict_path), exist_ok=True)
    dictionary.save(dict_path)
print(dictionary)

Dictionary(8824330 unique tokens: ['汪儒', '老李太太熏酱', '名侦探柯南剧场版15', '安佩佩', '雲端計算']...)


### Load the word embeddings

In [8]:
# Parse the text-format (non-binary) word2vec embedding file into KeyedVectors.
embedding_file = '../embedding/Tencent_AILab_ChineseEmbedding.txt'
wv_from_text = KeyedVectors.load_word2vec_format(fname=embedding_file, binary=False)

### Load the stopwords, city names and keywords

In [26]:
# Stop words: one per line. Strip only the line terminator so that a final
# line without a trailing newline keeps its last character and CRLF files do
# not leave a stray '\r'.
with open('resources/stopwords_zh.txt', encoding='utf-8') as f:
    stop_list = {line.rstrip('\r\n') for line in f}

# City names: first column of the CSV, header row skipped.
with open('resources/China_Cities_Coordinates_CHN_ENG.csv', encoding='utf-8') as f:
    next(f, None)  # skip the header row
    city_list = {line.split(',')[0] for line in f}

# Keywords: column i of every row belongs to category i. Empty cells and
# keywords that are missing from the embedding are skipped.
nclass = 7
keywords = [[] for _ in range(nclass)]
with open('resources/keywords.csv', encoding='utf-8') as f:
    for line in f:
        for i, keyword in enumerate(line.rstrip('\r\n').split(',')):
            if keyword != '' and keyword in wv_from_text:
                keywords[i].append(keyword)

### Instantiate the corpus and frequency calculator

In [53]:
# os.listdir returns entries in arbitrary order, so sort before slicing to
# make the selected file reproducible across runs and machines.
file_list = sorted(f for f in os.listdir('../webdata') if f.startswith('part-'))[:1]
my_corpus = MyCorpus('../webdata/', file_list, dictionary, stop_list, city_list)
freq_calc = FrequencyCalculator(wv_from_text, nclass=nclass, keywords=keywords, th=0.8)

### Main run part

In [54]:
max_docs = 21  # number of documents processed in this timed trial run

# NOTE(review): `frequency` is initialised here but never updated in this cell.
frequency = {i: 0 for i in range(1, nclass + 1)}  # final frequency

start = time.time()

cnt = 0
for document in my_corpus:
    _freq = freq_calc.calc(document["words"])
    print(_freq)
    cnt += 1
    # Stop right after the last wanted document so the generator does not
    # parse one extra document inside the timed region.
    if cnt >= max_docs:
        break

end = time.time()

{1: 17, 2: 5, 3: 1, 4: 4, 5: 6, 6: 2, 7: 0}
{1: 13, 2: 6, 3: 4, 4: 7, 5: 5, 6: 5, 7: 0}
{1: 11, 2: 7, 3: 4, 4: 2, 5: 8, 6: 12, 7: 0}
{1: 4, 2: 12, 3: 1, 4: 2, 5: 3, 6: 1, 7: 16}
{1: 20, 2: 12, 3: 0, 4: 1, 5: 0, 6: 8, 7: 0}
{1: 15, 2: 9, 3: 0, 4: 1, 5: 0, 6: 5, 7: 0}
{1: 8, 2: 3, 3: 3, 4: 2, 5: 2, 6: 4, 7: 0}
{1: 3, 2: 3, 3: 3, 4: 7, 5: 13, 6: 5, 7: 0}
{1: 41, 2: 80, 3: 1, 4: 19, 5: 7, 6: 80, 7: 14}
{1: 19, 2: 48, 3: 0, 4: 0, 5: 6, 6: 63, 7: 4}
{1: 26, 2: 48, 3: 0, 4: 4, 5: 30, 6: 63, 7: 4}
{1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 1, 7: 0}
{1: 68, 2: 75, 3: 3, 4: 11, 5: 8, 6: 23, 7: 2}
{1: 66, 2: 80, 3: 2, 4: 7, 5: 50, 6: 25, 7: 1}
{1: 5, 2: 2, 3: 4, 4: 2, 5: 1, 6: 0, 7: 0}
{1: 6, 2: 0, 3: 0, 4: 0, 5: 1, 6: 8, 7: 0}
{1: 2, 2: 4, 3: 4, 4: 0, 5: 4, 6: 3, 7: 0}
{1: 18, 2: 5, 3: 0, 4: 5, 5: 15, 6: 8, 7: 0}
{1: 5, 2: 3, 3: 2, 4: 2, 5: 3, 6: 3, 7: 0}
{1: 0, 2: 1, 3: 1, 4: 5, 5: 38, 6: 0, 7: 0}
{1: 2, 2: 3, 3: 17, 4: 9, 5: 4, 6: 0, 7: 0}


In [55]:
# Average over the number of documents actually processed (guarding against an empty run).
print('{} documents (websites) in total. {} (avg: {}) seconds elapsed.'.format(cnt, end - start, (end - start) / cnt if cnt else 0.0))

21 documents (websites) in total. 128.19871544837952 (avg: 6.1047007356371195) seconds elapsed.


### Cluster the keywords with k-means

In [98]:
# Flatten the per-category keyword lists into one list (duplicates are kept).
keys = [key for cate in keywords for key in cate]
print(keys)

# One embedding row per keyword.
X = np.array([wv_from_text[k] for k in keys])
# Use as many clusters as there are categories instead of a hard-coded 7
# (presumably the clusters are meant to be compared with the categories -- TODO confirm).
kmeans = KMeans(n_clusters=nclass, random_state=0, verbose=1).fit(X)
kmeans.labels_

['经济', '金融', '公司', '产业', '金融', '证券', '理财', '银行', '基金', '保险', '融资', '财政', '投资', '服务', '钱庄', '会计', '管理', '审计', '消费', '市场', '利率', '财富', '信托', '外汇', '期货', '债券', '商品', '价值', '货币', '成本', '市场', '信用', '边际', '搭便车', '马太效应', '帕累托', '供求', '红利', '房地产', '媒体', '科技', '互联网', '通信', '区块链', '人工智能', '创业', '创新', '数码', '技术', '生产', '效率', '理工', '信息', '电信', '计算机', '量子', '数据结构', '软件', '流量', '神经网络', '大数据', '识别', '计算机图像', '计算机视觉', '图像处理', '数据处理', '设备', '媒体', '腾讯', '阿里', '字节跳动', '谷歌', '研发', '华为', '苹果', '爱立信', '智能', '芯片', '云服务', '在线服务', '电信', '法律', '政治', '政府', '共产', '资本主义', '政党', '权力', '权利', '义务', '主义', '民主', '否决权', '世俗主义', '乱世', '列强', '合法性', '社会', '宪法', '宪政', '右派', '左派', '盛世', '中国梦', '三个代表', '科学发展观', '小康', '毛泽东', '邓小平', '江泽民', '胡锦涛', '习近平', '官员', '阶级', '马克思', '列宁', '选角', '治理', '改革', '开放', '领导', '主席', '总统', '奥巴马', '特朗普', '希拉里', '文学', '教育', '艺术', '收藏', '思想', '民俗', '农家乐', '出国', '留学', '公开课', '学校', '大学', '高考', '中小学', '商学院', '考研', '博士', '科研', '英语', '高校', '博物馆', '展览', '社会', '文化', '传统', '戏剧', '地理', '自由行', '背包客', '领队', '携程'

array([4, 1, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 6, 1, 6, 6, 1, 6,
       1, 1, 1, 1, 0, 6, 1, 6, 6, 1, 1, 4, 6, 6, 1, 6, 1, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 0, 6, 4, 6, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 2, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 5, 2, 4, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 6, 4, 4, 6, 4, 4, 4, 4, 4, 4, 6, 5, 4, 4,
       4, 3, 6, 4, 4, 4, 4, 4, 4, 4, 6, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5,
       3, 3, 3, 3, 3, 3, 3, 4, 6, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       6, 6, 5, 4, 4, 5, 5, 5, 5, 6, 2, 2, 2, 0, 6, 0, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 0, 0, 2,
       2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 4, 4, 0, 0, 0], dtype=int32)

#### Scratch (not important)

<!-- 1. Dictionary -->
<!-- 2. Stop words -->
<!-- 3. Remove without city -->
<!-- 4. Store city links -->
5. Count the words related to each category

In [8]:
# Scratch: list the words most similar to '体育' together with their similarity scores.
wv_from_text.similar_by_word('体育')

[('及体育', 0.745394229888916),
 ('以及体育', 0.7444100379943848),
 ('体育传媒', 0.7442153096199036),
 ('体育科技', 0.7429567575454712),
 ('包括体育', 0.7428886294364929),
 ('体育方面', 0.7395599484443665),
 ('体育领域', 0.7245116233825684),
 ('体育产业', 0.7185760736465454),
 ('体育相关', 0.7166426181793213),
 ('娱乐体育', 0.7142459750175476)]

In [9]:
# Scratch: similarity score between the words '体育' and '金融'.
wv_from_text.similarity('体育', '金融')

0.5286123