<a href="https://colab.research.google.com/github/takada-at/sep_crawl/blob/master/sep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# @title クローラーの準備
% cd /content
! if [ ! -d sep_crawl ]; then git clone https://github.com/takada-at/sep_crawl.git; fi
% cd sep_crawl
! git pull
! pip install -r requirements-colab.txt
! mkdir -p data/sep

In [0]:
# @title SEP記事のダウンロード
# @markdown 80分程度かかります。
% cd /content/sep_crawl/sep/
! if [ -d ../data/sep ]; then rm -r ../data/sep; fi
! time scrapy crawl entry --loglevel=INFO
! cat ../data/sep/text/*.txt > ../data/sep/sep-entries.txt

In [0]:
# @title Check the downloaded files
# Line count of the combined corpus file, followed by its first lines
! wc -l /content/sep_crawl/data/sep/sep-entries.txt 
! head /content/sep_crawl/data/sep/sep-entries.txt 
# Check the downloaded articles: first and last file names, then the file count
! ls /content/sep_crawl/data/sep/text | head
! ls /content/sep_crawl/data/sep/text | tail
! ls /content/sep_crawl/data/sep/text | wc -l

In [0]:
# @title Google Driveにバックアップ
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

% cd /content
! if [ -d sep-entries ]; then rm -r sep-entries; fi
! mkdir sep-entries
! if [ -f sep-entries.zip ]; then rm sep-entries.zip; fi
! cp /content/sep_crawl/data/sep/sep-entries.txt sep-entries/
! zip /content/sep-entries.zip sep-entries/*
! cp /content/sep-entries.zip "/content/drive/My Drive/"

In [0]:
# @title Google Driveからインポート

from google.colab import drive
drive.mount('/content/drive',  force_remount=True)
! cp "/content/drive/My Drive/sep-entries.zip" /content/sep-entries.zip 
% cd /content
! unzip sep-entries.zip
! mv sep-entries/sep-entries.txt /content/sep_crawl/data/sep/sep-entries.txt

In [0]:
# @title Build the word2vec model
# Trains a 300-dimensional word2vec model on the combined corpus file and
# prints how long training took. The trained model is bound to `w2v`.
from datetime import datetime

import gensim
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec

# Per the gensim docs, LineSentence streams the file and treats each line as
# one whitespace-tokenised sentence.
sentences = LineSentence('/content/sep_crawl/data/sep/sep-entries.txt')

# gensim 4 renamed the embedding-dimension keyword from `size` to
# `vector_size` (passing `size` raises TypeError there), so pick the spelling
# that the installed release accepts.
gensim_major = int(gensim.__version__.split('.')[0])
dim_keyword = 'vector_size' if gensim_major >= 4 else 'size'

started_at = datetime.now()
w2v = Word2Vec(sentences, **{dim_keyword: 300})
finished_at = datetime.now()
print('create word2vec model: {}'.format(finished_at - started_at))

In [0]:
# @title Inspect the word vectors
# Prints the vocabulary size, the nearest neighbours of "kant", and the result
# of an analogy query. Requires `w2v` from the model-building cell.

import numpy as np

# gensim 4 removed `KeyedVectors.vocab` in favour of `key_to_index`; both are
# mappings keyed by word, so use whichever the installed release provides.
try:
  vocab = w2v.wv.key_to_index
except AttributeError:
  vocab = w2v.wv.vocab

print(len(vocab))
words = np.array(tuple(vocab.keys()))
print(w2v.wv.most_similar_cosmul(positive=["kant"], topn=3))
# Example output from an earlier run (exact scores will likely differ):
# [('hegel', 0.7388843894004822),
# ('hume', 0.7110885381698608),
# ('spinoza', 0.6922929286956787)]

# Analogy query: paris - france + germany
print(w2v.wv.most_similar_cosmul(positive=["paris", "germany"], negative=['france'], topn=3))

In [0]:
# Projects the neighbourhood of "kant" to 2-D with t-SNE and plots it.
# Requires `w2v` from the model-building cell.
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.cm as cm
from sklearn.manifold import TSNE

# Grow the term list outwards from the seed word: each round asks for the `n`
# terms most similar to everything collected so far and appends them.
# `clusters` records the round in which each term was added (0 = seed) and is
# used as the point colour.
terms = ['kant']
clusters = [0]
n = 5
for i in range(15):
  result = w2v.wv.most_similar_cosmul(positive=terms, topn=n)
  terms += [word for word, _score in result]
  # Use the number of hits actually returned so that `clusters` stays aligned
  # with `terms` even if a query yields fewer than `n` results.
  clusters += [i + 1] * len(result)
vectors = np.vstack([w2v.wv[t] for t in terms])
print(terms)

# scikit-learn requires perplexity to be smaller than the number of samples, so
# cap it; a fixed random_state makes the layout repeatable for a given model.
tsne = TSNE(n_components=2,
            perplexity=min(50.0, len(terms) - 1),
            random_state=0)
v2d = tsne.fit_transform(vectors)
x = v2d[:, 0]
y = v2d[:, 1]

fig, ax = plt.subplots(figsize=(12, 10))
points = ax.scatter(x, y, c=clusters, cmap='viridis')
fig.colorbar(points, ax=ax, label='expansion round')
ax.grid(True)
ax.set_title("t-SNE projection of terms related to 'kant'")
# Label every point with its term; a dedicated loop variable is used so the
# `n` setting above is not overwritten.
for label, px, py in zip(terms, x, y):
  ax.annotate(label, xy=(px, py),
              xytext=(5, 2),
              textcoords='offset points',
              ha='right',
              va='bottom')
plt.show()