In [1]:
import pandas as pd
import numpy as np
import gensim



# Load the training and test sets

In [2]:
# Read the training split and the test split from the dataset directory.
# Both frames must expose the title1_zh / title2_zh columns used in the next cells.
train_df, test_df = (
    pd.read_csv("../data/dataset/train.csv"),
    pd.read_csv("../data/dataset/test.csv"),
)

In [3]:
# Distinct title strings taken from both title columns of the training set.
# Non-string entries (e.g. NaN for a missing title) are dropped, and np.unique
# returns the remaining titles sorted and de-duplicated across the two columns.
train_corpus = np.unique([
    title
    for title in np.concatenate([train_df.title1_zh.unique(), train_df.title2_zh.unique()])
    if isinstance(title, str)  # isinstance rather than an exact type comparison: str subclasses count too
])

In [4]:
# Distinct title strings taken from both title columns of the test set.
# Non-string entries (e.g. NaN for a missing title) are dropped, and np.unique
# returns the remaining titles sorted and de-duplicated across the two columns.
test_corpus = np.unique([
    title
    for title in np.concatenate([test_df.title1_zh.unique(), test_df.title2_zh.unique()])
    if isinstance(title, str)  # isinstance rather than an exact type comparison: str subclasses count too
])

In [5]:
# Stack the training titles and the test titles into one array (training titles first).
# No de-duplication happens here: a title present in both splits appears twice.
all_corpus = np.concatenate((train_corpus, test_corpus), axis=0)

In [6]:
# Write one title per line with every character followed by a single space,
# so each character becomes its own whitespace-separated token in the corpus file.
with open('../data/corpus.txt', 'w', encoding='utf-8') as corpus_file:
    for title in all_corpus:
        corpus_file.write(''.join(ch + ' ' for ch in title))
        corpus_file.write('\n')

# Word2Vec

In [7]:
import logging
from gensim.models import word2vec

# Route gensim's progress messages (INFO and above) to the notebook output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Iterate over the corpus file written above: one sentence per line,
# tokens separated by whitespace.
sentences = word2vec.LineSentence('../data/corpus.txt')

# CBOW model (sg=0, hs=0): 50-dimensional vectors, context window of 5,
# tokens seen fewer than 5 times are ignored.
model = word2vec.Word2Vec(
    sentences,
    size=50,
    window=5,
    min_count=5,
    sg=0,
    hs=0,
)

# Persist the full model; the relative path resolves against the current working directory.
model.save("word2vec.model")

2018-12-26 20:52:04,554 : INFO : collecting all words and their counts
2018-12-26 20:52:04,554 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-12-26 20:52:04,603 : INFO : PROGRESS: at sentence #10000, processed 268913 words, keeping 3122 word types
2018-12-26 20:52:04,651 : INFO : PROGRESS: at sentence #20000, processed 527375 words, keeping 3764 word types
2018-12-26 20:52:04,697 : INFO : PROGRESS: at sentence #30000, processed 776732 words, keeping 4033 word types
2018-12-26 20:52:04,744 : INFO : PROGRESS: at sentence #40000, processed 1026248 words, keeping 4173 word types
2018-12-26 20:52:04,792 : INFO : PROGRESS: at sentence #50000, processed 1274561 words, keeping 4291 word types
2018-12-26 20:52:04,837 : INFO : PROGRESS: at sentence #60000, processed 1509703 words, keeping 4441 word types
2018-12-26 20:52:04,884 : INFO : PROGRESS: at sentence #70000, processed 1758393 words, keeping 4532 word types
2018-12-26 20:52:04,930 : INFO : PROGRESS: at sen

In [8]:
# Export the CBOW word vectors in the plain-text word2vec format.
# NOTE(review): the file name says "windowsize50", but the model is trained with
# window=5 (size=50). The path is left unchanged because other code may load it —
# confirm and rename together with any consumers.
model.wv.save_word2vec_format(
    '../data/wordvec/zh-wordvec-50-cbow-windowsize50.vec',
    binary=False,
)

2018-12-26 20:52:15,163 : INFO : storing 4113x50 projection weights into ../data/wordvec/zh-wordvec-50-cbow-windowsize50.vec


## Skip-gram

In [9]:
# Skip-gram model (sg=1) with hierarchical softmax (hs=1), 50-dimensional vectors and a
# context window of 7, trained on the same `sentences` as the CBOW model.
# It is bound to its own name (like `fasttext_model`) instead of rebinding `model`:
# otherwise the CBOW model is lost, and re-running the CBOW export cell afterwards
# would write skip-gram vectors into the CBOW file.
# NOTE(review): `negative` is left at gensim's default, so negative sampling may be
# active alongside hs=1 — confirm that this combination is intended.
skipgram_model = word2vec.Word2Vec(sentences, sg=1, hs=1, window=7, size=50, min_count=5)

# Export the skip-gram word vectors in the plain-text word2vec format.
skipgram_model.wv.save_word2vec_format('../data/wordvec/zh-wordvec-50-skipgram-windowsize7.vec', binary=False)

2018-12-26 20:52:15,267 : INFO : collecting all words and their counts
2018-12-26 20:52:15,267 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-12-26 20:52:15,316 : INFO : PROGRESS: at sentence #10000, processed 268913 words, keeping 3122 word types
2018-12-26 20:52:15,364 : INFO : PROGRESS: at sentence #20000, processed 527375 words, keeping 3764 word types
2018-12-26 20:52:15,412 : INFO : PROGRESS: at sentence #30000, processed 776732 words, keeping 4033 word types
2018-12-26 20:52:15,459 : INFO : PROGRESS: at sentence #40000, processed 1026248 words, keeping 4173 word types
2018-12-26 20:52:15,505 : INFO : PROGRESS: at sentence #50000, processed 1274561 words, keeping 4291 word types
2018-12-26 20:52:15,551 : INFO : PROGRESS: at sentence #60000, processed 1509703 words, keeping 4441 word types
2018-12-26 20:52:15,598 : INFO : PROGRESS: at sentence #70000, processed 1758393 words, keeping 4532 word types
2018-12-26 20:52:15,644 : INFO : PROGRESS: at sen

2018-12-26 20:52:52,059 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-12-26 20:52:52,102 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-12-26 20:52:52,113 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-12-26 20:52:52,114 : INFO : EPOCH - 2 : training on 5701379 raw words (4976156 effective words) took 17.8s, 279562 effective words/s
2018-12-26 20:52:53,134 : INFO : EPOCH 3 - PROGRESS: at 5.01% examples, 264095 words/s, in_qsize 5, out_qsize 0
2018-12-26 20:52:54,147 : INFO : EPOCH 3 - PROGRESS: at 10.65% examples, 273185 words/s, in_qsize 5, out_qsize 0
2018-12-26 20:52:55,166 : INFO : EPOCH 3 - PROGRESS: at 16.44% examples, 276571 words/s, in_qsize 5, out_qsize 0
2018-12-26 20:52:56,184 : INFO : EPOCH 3 - PROGRESS: at 22.15% examples, 278048 words/s, in_qsize 5, out_qsize 0
2018-12-26 20:52:57,203 : INFO : EPOCH 3 - PROGRESS: at 28.09% examples, 278838 words/s, in_qsize 5, out_qsize 0
2018-12-26 20:52:58,207 :

# FastText

In [10]:
from gensim.models import FastText

# FastText embeddings trained on the same `sentences` iterable as the word2vec models:
# 50-dimensional vectors, context window of 3, tokens seen fewer than 5 times ignored.
fasttext_model = FastText(
    sentences,
    size=50,
    window=3,
    min_count=5,
)

2018-12-26 20:53:45,690 : INFO : collecting all words and their counts
2018-12-26 20:53:45,691 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-12-26 20:53:45,739 : INFO : PROGRESS: at sentence #10000, processed 268913 words, keeping 3122 word types
2018-12-26 20:53:45,787 : INFO : PROGRESS: at sentence #20000, processed 527375 words, keeping 3764 word types
2018-12-26 20:53:45,834 : INFO : PROGRESS: at sentence #30000, processed 776732 words, keeping 4033 word types
2018-12-26 20:53:45,881 : INFO : PROGRESS: at sentence #40000, processed 1026248 words, keeping 4173 word types
2018-12-26 20:53:45,927 : INFO : PROGRESS: at sentence #50000, processed 1274561 words, keeping 4291 word types
2018-12-26 20:53:45,972 : INFO : PROGRESS: at sentence #60000, processed 1509703 words, keeping 4441 word types
2018-12-26 20:53:46,018 : INFO : PROGRESS: at sentence #70000, processed 1758393 words, keeping 4532 word types
2018-12-26 20:53:46,065 : INFO : PROGRESS: at sen

2018-12-26 20:54:13,430 : INFO : EPOCH 5 - PROGRESS: at 15.43% examples, 791455 words/s, in_qsize 0, out_qsize 0
2018-12-26 20:54:14,432 : INFO : EPOCH 5 - PROGRESS: at 31.08% examples, 782357 words/s, in_qsize 0, out_qsize 0
2018-12-26 20:54:15,438 : INFO : EPOCH 5 - PROGRESS: at 46.93% examples, 780850 words/s, in_qsize 0, out_qsize 0
2018-12-26 20:54:16,440 : INFO : EPOCH 5 - PROGRESS: at 62.68% examples, 779129 words/s, in_qsize 0, out_qsize 0
2018-12-26 20:54:17,441 : INFO : EPOCH 5 - PROGRESS: at 78.36% examples, 779186 words/s, in_qsize 0, out_qsize 0
2018-12-26 20:54:18,442 : INFO : EPOCH 5 - PROGRESS: at 93.57% examples, 774114 words/s, in_qsize 0, out_qsize 0
2018-12-26 20:54:18,838 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-12-26 20:54:18,839 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-12-26 20:54:18,848 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-12-26 20:54:18,848 : INFO : EPOCH - 5 : trai

In [11]:
# Export the FastText word vectors in the plain-text word2vec format.
# The log for this cell reports a 4113x50 matrix, i.e. one vector per vocabulary
# token; presumably subword (n-gram) information is not part of this file — verify
# if out-of-vocabulary lookups are needed downstream.
fasttext_model.wv.save_word2vec_format(
    '../data/wordvec/fasttext-50-win3.vec',
    binary=False,
)

2018-12-26 20:54:18,875 : INFO : storing 4113x50 projection weights into ../data/wordvec/fasttext-50-win3.vec
