In [1]:
import os
import jieba
import pprint
import re, string
import time
from collections import defaultdict
from gensim import corpora
from gensim.models.keyedvectors import KeyedVectors
from smart_open import open # for transparently opening compressed files

### Corpus class for getting one document at a time from the whole dataset

In [43]:
class MyCorpus:
    """Corpus that handles one document at a time
    """
    
    def __init__(self, root_path, file_list, dictionary, stop_list, city_list):
        """
        Args:
            root_path - root path to the website files
            file_list - list of website files
            dictionary - mapping from words to ids
            stop_list - chinese stopwords set
            city_list - chinese city set
        """
        self.root_path = root_path
        self.file_list = file_list
        self.dictionary = dictionary
        self.stop_list = stop_list
        self.city_list = city_list
    
    def __iter__(self):
        for filename in self.file_list:
            with open(self.root_path + filename, encoding='utf-8') as f:
                for line in f:
                    words = self._process(line)
                    if not words or len(words) < 2:  # less than 2 words won't contain 2 cities
                        continue
                    words, cities = self._retrieve_cities(words)
                    if len(cities) < 2:  # less than 2 cities won't composite a link
                        continue
#                     yield {'words': self.dictionary.doc2bow(words), 'cities': cities}
                    yield {'words': words, 'cities': cities}
                    
    def _process(self, line):
        # drop meta-info
        if line == '' or line.startswith('\r') or line.startswith('WARC') or line.startswith('Content'):
            return
        # drop alphabetic characters
        line = re.sub(r'[a-zA-Z]', '', line)
        # drop digits and punctuations
        line = re.sub('[%s]' % (string.punctuation + string.digits), '', line)
        # drop empty line
        if line == '\r':
            return
        # segment the sentence using jieba
        words = ' '.join(jieba.cut(line, cut_all=False)).split(' ')
        # drop stopwords
        words = [word for word in words if word not in self.stop_list]
        return words
    
    def _retrieve_cities(self, words):
        """Caution: cities are removed from the document because cities are not supposed to related to any category
        """
        cities = []
        indices = []
        for idx, word in enumerate(words):
            if word in self.city_list:
                cities.append(word)
                indices.append(idx)
        # remove cities from the document
        for idx in indices[::-1]:
            del words[idx]
        return words, cities

### Class that calculates the frequency of every category in a document

In [45]:
class FrequencyCalculator:
    """Calculate frequency of every category in a document by compare the similarities betweeen each word and the keyword
    """
    
    def __init__(self, wv, nclass, keywords, th):
        """
        Args:
            wv - word vectors
            nclass - number of categories
            keywords - 文化，经济，体育等. Nested list. [[经济,金融...],[科技,互联网...]...], 一共nclass类
            th - similarity threshold. Similarites Under the threshold will be discarded.
        """
        self.wv = wv
        self.nclass = nclass
        self.freq = {i: 0 for i in range(1, nclass + 1)}
        self.keywords = keywords
        self.th = th
    
    def calc(self, words):
        """Calculate frequency and return it"""
        for word in words:
            if word not in self.wv:
                continue
            for i, category in enumerate(self.keywords, 1):
                for key in category:
                    if self.wv.similarity(word, key) > self.th:
                        self.freq[i] += 1
        return self.freq

### Class that converts frequency of every document to frequency of two city link in the whole corpus

In [None]:
class FrequencyConverter

### Load the dictionary

In [3]:
if 'dict.dict' in os.listdir('../dict'):
    dictionary = corpora.Dictionary().load('../dict/dict.dict')  # already processed from embedding file
else:
    texts = []
    with open('../embedding/Tencent_AILab_ChineseEmbedding.txt') as f:
        skip_head = True
        for line in f:
            if skip_head:
                skip_head = False
                continue
            else:
                texts.append(line.split(' ')[0])
    dictionary = corpora.Dictionary([texts])
    dictionary.save('../dict/dict.dict')
print(dictionary)

Dictionary(8824330 unique tokens: ['汪儒', '老李太太熏酱', '名侦探柯南剧场版15', '安佩佩', '雲端計算']...)


### Load the word embeddings

In [8]:
embedding_file = '../embedding/Tencent_AILab_ChineseEmbedding.txt'
wv_from_text = KeyedVectors.load_word2vec_format(embedding_file, binary=False)

### Load the stopwords, city names and keywords

In [26]:
stop_list = []
with open('resources/stopwords_zh.txt') as f:
    for line in f:
        stop_list.append(line[:-1])
stop_list = set(stop_list)

city_list = []
with open('resources/China_Cities_Coordinates_CHN_ENG.csv') as f:
    skip_head = True
    for line in f:
        if skip_head:
            skip_head = False
            continue
        else:
            city_list.append(line.split(',')[0])
city_list = set(city_list)

nclass = 7
keywords = [[] for _ in range(nclass)]
with open('resources/keywords.csv') as f:
    for line in f:
        line = line.replace('\n', '')
        for i, category in enumerate(line.split(',')):
            if category != '' and category in wv_from_text:
                keywords[i].append(category)

### Instantiate the corpus and frequency calculator

In [48]:
file_list = [f for f in os.listdir('../webdata') if f.startswith('part-')][:1]
my_corpus = MyCorpus('../webdata/', file_list, dictionary, stop_list, city_list)
freq_calc = FrequencyCalculator(wv_from_text, nclass=nclass, keywords=keywords, th=0.8)

### Main run part

In [None]:
frequency = {i: 0 for i in range(1, nclass + 1)}  # final frequency

start = time.time()

cnt = 0
for document in my_corpus:
    if cnt > 20:
        break
    cnt += 1
    _freq = freq_calc.calc(document["words"])
    print(_freq)
    
end = time.time()
print('{} documents (websites) in total. {} (avg: {}) seconds elapsed.'.format(cnt, end - start), (end - start) / 20)

{1: 133, 2: 95, 3: 17, 4: 32, 5: 45, 6: 73, 7: 20}
{1: 146, 2: 101, 3: 21, 4: 39, 5: 50, 6: 78, 7: 20}
{1: 157, 2: 108, 3: 25, 4: 41, 5: 58, 6: 90, 7: 20}
{1: 161, 2: 120, 3: 26, 4: 43, 5: 61, 6: 91, 7: 36}
{1: 181, 2: 132, 3: 26, 4: 44, 5: 61, 6: 99, 7: 36}
{1: 196, 2: 141, 3: 26, 4: 45, 5: 61, 6: 104, 7: 36}
{1: 204, 2: 144, 3: 29, 4: 47, 5: 63, 6: 108, 7: 36}
{1: 207, 2: 147, 3: 32, 4: 54, 5: 76, 6: 113, 7: 36}
{1: 248, 2: 227, 3: 33, 4: 73, 5: 83, 6: 193, 7: 50}
{1: 267, 2: 275, 3: 33, 4: 73, 5: 89, 6: 256, 7: 54}
{1: 293, 2: 323, 3: 33, 4: 77, 5: 119, 6: 319, 7: 58}
{1: 293, 2: 323, 3: 33, 4: 77, 5: 119, 6: 320, 7: 58}
{1: 361, 2: 398, 3: 36, 4: 88, 5: 127, 6: 343, 7: 60}
{1: 427, 2: 478, 3: 38, 4: 95, 5: 177, 6: 368, 7: 61}
{1: 432, 2: 480, 3: 42, 4: 97, 5: 178, 6: 368, 7: 61}
{1: 438, 2: 480, 3: 42, 4: 97, 5: 179, 6: 376, 7: 61}
{1: 440, 2: 484, 3: 46, 4: 97, 5: 183, 6: 379, 7: 61}
{1: 458, 2: 489, 3: 46, 4: 102, 5: 198, 6: 387, 7: 61}
{1: 463, 2: 492, 3: 48, 4: 104, 5: 201, 6: 

## Scratch (not important)

<!-- 1. Dictionary -->
<!-- 2. Stop words -->
<!-- 3. Remove without city -->
<!-- 4. Store city links -->
5. Count words related with different category

In [8]:
wv_from_text.similar_by_word('体育')

[('及体育', 0.745394229888916),
 ('以及体育', 0.7444100379943848),
 ('体育传媒', 0.7442153096199036),
 ('体育科技', 0.7429567575454712),
 ('包括体育', 0.7428886294364929),
 ('体育方面', 0.7395599484443665),
 ('体育领域', 0.7245116233825684),
 ('体育产业', 0.7185760736465454),
 ('体育相关', 0.7166426181793213),
 ('娱乐体育', 0.7142459750175476)]

In [9]:
wv_from_text.similarity('体育', '金融')

0.5286123