In [1]:
import re
import jieba
import gensim
import numpy as np
from gensim import corpora, models, similarities
from gensim.models import KeyedVectors
from pprint import pprint
from collections import defaultdict
from tqdm import tqdm

# Send log records at INFO level and above to the notebook output, prefixed with
# a timestamp and the level name (the Word2Vec cell below reports its progress this way).
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
def load_stopwords(stopwords_path):
    """Read a UTF-8 stop-word file and return its lines as a list.

    Every line is stripped of leading and trailing whitespace. Blank lines
    are kept as empty strings; order and duplicates are preserved.

    Args:
        stopwords_path: Path of the stop-word file, one entry per line.

    Returns:
        list[str]: The stripped lines, in file order.
    """
    stopword_list = []
    with open(stopwords_path, mode='r', encoding='utf-8') as handle:
        for raw_line in handle:
            stopword_list.append(raw_line.strip())
    return stopword_list
    
def preprocess_data(corpus_path, stopwords):
    """Clean and tokenise a UTF-8 text file, one document per line.

    For every line: strip surrounding whitespace, replace each run of
    characters matched by the noise pattern with a single space, segment the
    result with ``jieba.lcut``, drop space/tab tokens and stop words, and join
    the surviving tokens with single spaces.

    Args:
        corpus_path: Path of the text file to read.
        stopwords: Iterable of stop-word strings to remove.

    Returns:
        list[str]: One space-joined token string per input line, in file
        order. A line with no surviving tokens yields an empty string.
    """
    # Hash-based lookup: membership is tested once per token, and scanning a
    # list each time is linear in the number of stop words.
    stopword_set = frozenset(stopwords)

    # NOTE(review): inside the first character class, `:-【` is a range
    # (U+003A..U+3010), so ASCII letters and everything else in that range are
    # replaced as well, not only the punctuation that is spelled out. The
    # pattern is kept unchanged here; confirm whether removing Latin text is
    # intended before tightening it.
    noise_pattern = re.compile(r"[0-9\s+\.\!\/_,$%^*()?;；:-【】+\"\']+|[+——！，;:。？、~@#￥%……&*（）]+")
    ignored_tokens = (" ", "\t")

    corpus = []
    with open(corpus_path, 'r', encoding='utf-8') as f:
        for line in f:
            cleaned = noise_pattern.sub(" ", line.strip())
            tokens = [
                word
                for word in jieba.lcut(cleaned)
                if word not in ignored_tokens and word not in stopword_set
            ]
            corpus.append(' '.join(tokens))
    return corpus

def cut_low_freq_word(documents, thre):
    """Split documents on whitespace and drop rare tokens.

    Token counts are taken over the whole collection. A token is kept only
    when its total count is strictly greater than ``thre``.

    Args:
        documents: Iterable of whitespace-separated token strings.
        thre: Count threshold; tokens occurring ``thre`` times or fewer are removed.

    Returns:
        list[list[str]]: One token list per document, in input order.
    """
    tokenized = [doc.split() for doc in documents]

    counts = {}
    for tokens in tokenized:
        for tok in tokens:
            counts[tok] = counts.get(tok, 0) + 1

    kept = []
    for tokens in tokenized:
        kept.append([tok for tok in tokens if counts[tok] > thre])
    return kept

def cut_low_freq_word2(documents, thre):
    """Drop rare tokens and return each document re-joined as a string.

    Documents are split on whitespace and token counts are taken over the
    whole collection. A token survives only when its total count is strictly
    greater than ``thre``.

    Args:
        documents: Iterable of whitespace-separated token strings.
        thre: Count threshold; tokens occurring ``thre`` times or fewer are removed.

    Returns:
        list[str]: One single-space-joined string of surviving tokens per
        document, in input order.
    """
    token_lists = [doc.split() for doc in documents]

    totals = {}
    for token_list in token_lists:
        for tok in token_list:
            totals[tok] = totals.get(tok, 0) + 1

    def _keep_frequent(token_list):
        # Re-join only the tokens whose collection-wide count exceeds the threshold.
        return " ".join(tok for tok in token_list if totals[tok] > thre)

    return [_keep_frequent(token_list) for token_list in token_lists]

In [3]:
# Input files, relative to the notebook's working directory: the stop-word list
# passed to load_stopwords and the two text files passed to preprocess_data.
stopwords_path = "./data/stop_words.txt"
doc_title_east = "./data/title_east.txt"
doc_title_apple= "./data/title_apple.txt"

In [4]:
# Load the stop words once, then clean and segment both files.
# Each corp_* value is a list with one space-joined token string per input line.
stopwords = load_stopwords(stopwords_path)
corp_title_east = preprocess_data(doc_title_east, stopwords)
corp_title_apple = preprocess_data(doc_title_apple, stopwords)

Building prefix dict from the default dictionary ...
2020-07-27 15:52:00,163 : DEBUG : Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/m3/4yh806w92fdgcn0bk16ql7nw0000gn/T/jieba.cache
2020-07-27 15:52:00,166 : DEBUG : Loading model from cache /var/folders/m3/4yh806w92fdgcn0bk16ql7nw0000gn/T/jieba.cache
Loading model cost 0.632 seconds.
2020-07-27 15:52:00,796 : DEBUG : Loading model cost 0.632 seconds.
Prefix dict has been built successfully.
2020-07-27 15:52:00,797 : DEBUG : Prefix dict has been built successfully.


In [5]:
# Convert each document to a token list and keep only tokens whose count within
# their own corpus is greater than 3 (tokens seen 3 times or fewer are dropped).
texts_title_east = cut_low_freq_word(corp_title_east, 3)
texts_title_apple = cut_low_freq_word(corp_title_apple, 3)

In [6]:
# Create a 300-dimensional Word2Vec model from the filtered title tokens.
# Supplying the corpus to the constructor already builds the vocabulary and
# runs a first round of training (see the log output of this cell).
# NOTE(review): `size=` is the gensim 3.x spelling of this argument; gensim 4
# renamed it to `vector_size`.
model_w2v = gensim.models.Word2Vec(texts_title_east, size=300)
# Replace the vectors of words that also occur in the pretrained file with the
# pretrained values; words found only in the file are not added to the vocabulary.
# lockf=1.0 leaves the imported vectors trainable in the next step.
model_w2v.intersect_word2vec_format('./data/sgns.financial.char.bz2',
                                lockf=1.0,
                                encoding='utf8',
                                )
# Continue training on the same titles for 10 more epochs; the cell displays the
# tuple returned by train().
model_w2v.train(texts_title_east,total_examples=model_w2v.corpus_count, epochs=10)

2020-07-27 15:52:21,747 : INFO : collecting all words and their counts
2020-07-27 15:52:21,748 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-07-27 15:52:21,761 : INFO : PROGRESS: at sentence #10000, processed 67573 words, keeping 7657 word types
2020-07-27 15:52:21,776 : INFO : PROGRESS: at sentence #20000, processed 137486 words, keeping 9275 word types
2020-07-27 15:52:21,796 : INFO : PROGRESS: at sentence #30000, processed 200510 words, keeping 9961 word types
2020-07-27 15:52:21,813 : INFO : PROGRESS: at sentence #40000, processed 262261 words, keeping 10177 word types
2020-07-27 15:52:21,827 : INFO : PROGRESS: at sentence #50000, processed 323506 words, keeping 10250 word types
2020-07-27 15:52:21,837 : INFO : collected 10262 word types from a corpus of 365311 raw words and 57222 sentences
2020-07-27 15:52:21,838 : INFO : Loading a fresh vocabulary
2020-07-27 15:52:21,852 : INFO : effective_min_count=5 retains 8804 unique words (85% of original 10

2020-07-27 15:55:30,028 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-07-27 15:55:30,028 : INFO : EPOCH - 10 : training on 365311 raw words (305573 effective words) took 0.3s, 1125331 effective words/s
2020-07-27 15:55:30,029 : INFO : training on a 3653110 raw words (3054868 effective words) took 2.7s, 1124149 effective words/s


(3054868, 3653110)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# The documents are already token lists, so the analyzer hands each one back
# unchanged and the vectorizer does no tokenisation of its own.
pass_through = lambda x:x
tfidf = TfidfVectorizer(analyzer=pass_through)
# Fit on the filtered titles and keep the resulting document-term TF-IDF matrix.
title_vectorizer = tfidf.fit_transform(texts_title_east)

# Expand the matrix into a dense (n_documents, n_vocabulary) array so that rows
# can be indexed by vocabulary id.
# NOTE(review): the dense copy grows with documents x vocabulary and may not fit
# in memory for larger corpora — consider indexing title_vectorizer directly.
title_east_tfidf = title_vectorizer.toarray()

In [8]:
# Per-document TF-IDF weights, aligned token-by-token with texts_title_east.
# Documents have different lengths, so the rows are ragged: store them in an
# explicit 1-D object array. Letting np.array() infer this from a list of
# unequal-length arrays is deprecated since NumPy 1.19 and raises ValueError
# from NumPy 1.24.
title_east_weight = np.empty(len(texts_title_east), dtype=object)
for doc_index, (tfidf_row, doc_tokens) in enumerate(zip(title_east_tfidf, texts_title_east)):
    # A token that repeats within a document gets the same document-level
    # TF-IDF value at every position where it occurs.
    title_east_weight[doc_index] = np.array(
        [tfidf_row[tfidf.vocabulary_[word]] for word in doc_tokens]
    )

In [9]:
# Build one array of word vectors per title, row-aligned with the title's tokens.
# Vectors are read through model_w2v.wv: indexing the model object itself
# (model_w2v[word]) is deprecated in gensim 3.x and removed in gensim 4.
word_vectors = model_w2v.wv
embedding_dim = word_vectors.vector_size
title_east_w2v = []
for doc_tokens in tqdm(texts_title_east):
    # Tokens missing from the word2vec vocabulary get an all-zero vector. This
    # can happen because the frequency filter keeps tokens seen more than 3
    # times while the training log reports effective_min_count=5.
    sentence_w2v = np.array([
        word_vectors[word] if word in word_vectors else np.zeros((embedding_dim,))
        for word in doc_tokens
    ])
    title_east_w2v.append(sentence_w2v)

  sentence_w2v.append(model_w2v[word])
100%|██████████| 57222/57222 [00:01<00:00, 29361.49it/s]


In [29]:
#compute sentence embedding
def compute_weighted_centroids(X, weights, dim=300):
    """Compute one weighted-average vector per document.

    Args:
        X: Sequence with one entry per document, each an array of shape
            (n_tokens, dim) holding that document's word vectors.
        weights: Sequence parallel to ``X``, each entry an array of
            ``n_tokens`` weights.
        dim: Length of the zero vector emitted for a document whose weights
            sum to zero (which includes documents with no tokens). Should
            equal the word-vector dimensionality; defaults to 300.

    Returns:
        numpy.ndarray: The stacked centroids, one row per document.
    """
    centroids = []
    # zip() always yields a 2-tuple, so each pair can be unpacked directly;
    # the only fallback that is ever needed is the zero-weight case.
    for question_vectors, question_weights in zip(X, weights):
        if np.sum(question_weights) != 0:
            centroids.append(np.average(question_vectors, weights=question_weights, axis=0))
        else:
            # np.average cannot normalise by a zero weight sum.
            centroids.append(np.zeros(dim,))
    return np.array(centroids)

# One TF-IDF-weighted average word vector per title.
title_east_vec = compute_weighted_centroids(title_east_w2v, title_east_weight)
# compute_weighted_centroids already returns an ndarray; np.stack re-joins its rows.
# NOTE(review): this looks like a no-op when every centroid has the same length — confirm and drop.
title_east_vec = np.stack(title_east_vec)

In [1]:
# Needs title_east_vec from the previous cell in the same kernel session. The saved
# NameError and the out-of-order execution counts show this cell was last run on a
# fresh kernel — re-run with Restart & Run All.
title_east_vec.shape

NameError: name 'title_east_vec' is not defined