In [5]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from tqdm import tqdm_notebook as tqdm
import re
import math
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'IPAexGothic'

from scipy.cluster.hierarchy import linkage, dendrogram

from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel, CoherenceModel, TfidfModel

import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

from wordcloud import WordCloud

In [6]:
import glob
from bs4 import BeautifulSoup

files = glob.glob('./work/raw/*html')

def parse(fileName):
    with open(fileName) as f:
        soup = BeautifulSoup(f, 'html.parser')

    title      = soup.select_one('article.blog-entry-article h1.blog-title').get_text()
    date       = soup.select_one('article.blog-entry-article div.blog-date').get_text()
    category   = soup.select_one('article.blog-entry-article li.blog-category').get_text()
    text       = soup.select_one('article.blog-entry-article div.content').get_text()
    
    
    return [fileName, title, date, category, text]


data = [parse(fileName) for fileName in tqdm(files)]

  0%|          | 0/68 [00:00<?, ?it/s]

In [7]:
df = pd.DataFrame(data, columns=['file', 'title', 'date', 'category', 'text'])
df.head()

Unnamed: 0,file,title,date,category,text
0,./work/raw/23.html,ソフトウェアエンジニアの採用にルーブリックを導入した話,2019.12.22,TECH,\n\n\nソフトウェアエンジニアの hota です。今回はソフトウェアエンジニアの採用につ...
1,./work/raw/35.html,Kubernetes + Fluentd + CloudWatch Logs,2019.12.8,TECH,ソフトウェアエンジニアのskirinoです。\n最近ではコンテナ化したアプリケーションの設定...
2,./work/raw/62.html,event timeとprocessing timeについて,2018.12.4,TECH,\nこんにちは。ソフトウェアエンジニアの田中伸弥です。\n\n\n時系列データのevent ...
3,./work/raw/9.html,フライウィール・データプラットフォームの紹介,2020.8.27,MARKETING,プロダクトマネージャーの横井啓介です。\n前回の投稿では、デジタルトランスフォーメーション戦...
4,./work/raw/19.html,渋谷オフィスへの引越し前に認証を引っ越した話 Part2-実践編,2019.12.25,TECH,こんにちは。FLYWHEELでソフトウェアエンジニアをしてますsaoiです。前回の投稿の投稿...


In [8]:
## 前処理
import MeCab
m = MeCab.Tagger('-d /usr/local/lib/mecab/dic/ipadic')

HINSHI = ['名詞']
STOPWORDS = ['フライウィール', 'flywheel', 'var', 'main', 'test', 'time', 'src', 'com', 'jp', 'れる', 'これ', 'なっ', 'それ', 'もの', 'たち', 'さん']

def parseText(text):
    node = m.parseToNode(text)
    words = []
    while node:
        fields = node.feature.split(",")
        word = node.surface.lower() # 小文字化
        word = re.sub(r'\d+', '0', word) # 数字置き換え        
        word = re.sub(r'[\.\/\(\){}\[\]:,?!;\*=_\-\'"@<>#\^%]+', '', word) # 記号除去
        if fields[0] in HINSHI and word not in STOPWORDS and len(word) > 1:
            words.append(word)
        node = node.next
    
    return words

In [9]:
df['words'] = df['text'].map(lambda text: parseText(text))
# df['words'] = df['title'].map(lambda text: parseText(text))
df['words'].head()

0    [ソフトウェア, エンジニア, hota, 今回, ソフトウェア, エンジニア, 採用, お...
1    [ソフトウェア, エンジニア, skirino, 最近, コンテナ, アプリケーション, 設...
2    [ソフトウェア, エンジニア, 田中, 系列, データ, event, processing...
3    [プロダクト, マネージャー, 横井, 啓介, 前回, 投稿, デジタルトランスフォーメーシ...
4    [ソフトウェア, エンジニア, saoi, 前回, 投稿, 投稿, 広告, 配信, プラット...
Name: words, dtype: object

In [10]:
## 辞書とコーパスの作成
dictionary = Dictionary(df['words'])
dictionary.filter_extremes(no_below=3, no_above=0.7)
print(len(dictionary))

# BoWコーパス
corpus = [dictionary.doc2bow(words) for words in df['words']]

# tfidfコーパス
tfidf = TfidfModel(corpus)
corpus = tfidf[corpus]

1388


In [11]:
from collections import defaultdict
from gensim.models.keyedvectors import KeyedVectors
from sklearn.cluster import KMeans

In [12]:
# 以下から最新の学習済みモデルをダウンロード
# https://github.com/singletongue/WikiEntVec/releases
# 今回利用したのは20190520のjawiki.all_vectors.100d.txt.bz2

model = KeyedVectors.load_word2vec_format('work/jawiki.all_vectors.100d.txt')

In [13]:
word = '理科'
results = model.wv.most_similar(word)
print(word, "と類似度の高い単語") 
for result in results:
    print(result)

理科 と類似度の高い単語
('##理科##', 0.849571943283081)
('図画工作', 0.796046257019043)
('家庭科', 0.7760507464408875)
('算数', 0.771944522857666)
('##物理##', 0.7701616883277893)
('技術・家庭', 0.7691288590431213)
('数学科', 0.7614375352859497)
('社会科', 0.7590746879577637)
('教室', 0.7588263750076294)
('##社会科##', 0.756842851638794)


In [14]:
data = [[word, model.wv[word]] for word in dictionary.values() if word in model.wv]
df = pd.DataFrame(data, columns=['word', 'vectors'])

In [15]:
df.head()

Unnamed: 0,word,vectors
0,docs,"[1.2660389, 0.34894383, 0.06823624, -0.2455865..."
1,google,"[0.23882352, -0.2745437, 0.7677373, -0.0346424..."
2,slack,"[0.39975637, 0.25642583, 0.061183497, -0.16204..."
3,step,"[0.18174207, 0.32680133, 0.090867065, 0.041584..."
4,yuku,"[0.3732086, 0.019794274, 0.23433594, -0.034450..."


In [16]:
df.shape

(1358, 2)

In [17]:
distortions = []

for i  in tqdm(range(1,21)):
    km = KMeans(n_clusters=i, verbose=1, random_state=42, n_jobs=-1)
    km.fit(list(df['vectors']))
    distortions.append(km.inertia_)

  0%|          | 0/20 [00:00<?, ?it/s]

9352518656
Iteration 15, inertia 7429.0377874400865
Iteration 16, inertia 7427.880095724072
Iteration 17, inertia 7427.0444719371535
Iteration 18, inertia 7426.168181539817
Iteration 19, inertia 7425.891861685699
Converged at iteration 19: strict convergence.
Initialization complete
Iteration 0, inertia 11188.105388510137
Iteration 1, inertia 7813.7541908799685
Iteration 2, inertia 7623.760169408297
Iteration 3, inertia 7541.725255607942
Iteration 4, inertia 7508.449894760235
Iteration 5, inertia 7485.869012846227
Iteration 6, inertia 7470.621683805539
Iteration 7, inertia 7459.528007913799
Iteration 8, inertia 7449.892731687623
Iteration 9, inertia 7442.114326037677
Iteration 10, inertia 7435.845543721089
Iteration 11, inertia 7432.5823029202775
Iteration 12, inertia 7429.35458674993
Iteration 13, inertia 7427.164611714131
Iteration 14, inertia 7425.731373957678
Iteration 15, inertia 7424.5065613226225
Iteration 16, inertia 7422.405625433067
Iteration 17, inertia 7421.585962328431
Ite

In [18]:
plt.figure(figsize=(6, 6))
plt.plot(range(1,21),distortions,marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()
plt.savefig('work/word_kmeans_sse.png')

In [19]:
n_clusters = 6
kmeans_model = KMeans(n_clusters=n_clusters, verbose=1, random_state=42, n_jobs=-1)
kmeans_model.fit(list(df['vectors']))

Initialization complete
Iteration 0, inertia 13364.914030932356
Iteration 1, inertia 8568.89295146249
Iteration 2, inertia 8431.99489032674
Iteration 3, inertia 8389.704071283459
Iteration 4, inertia 8371.265384138382
Iteration 5, inertia 8363.042380375007
Iteration 6, inertia 8358.770672255894
Iteration 7, inertia 8357.473219079942
Iteration 8, inertia 8356.906599641734
Iteration 9, inertia 8356.630958098234
Iteration 10, inertia 8356.261238944006
Iteration 11, inertia 8356.06704274175
Iteration 12, inertia 8355.983165400727
Iteration 13, inertia 8355.896862146235
Converged at iteration 13: strict convergence.
Initialization complete
Iteration 0, inertia 14487.943100157521
Iteration 1, inertia 8871.798353055467
Iteration 2, inertia 8671.087220100588
Iteration 3, inertia 8591.056782228023
Iteration 4, inertia 8546.67823298141
Iteration 5, inertia 8515.323213433136
Iteration 6, inertia 8493.59207379496
Iteration 7, inertia 8485.042367293669
Iteration 8, inertia 8479.576400272537
Iterati

KMeans(n_clusters=6, n_jobs=-1, random_state=42, verbose=1)

In [20]:
df['cluster'] = kmeans_model.labels_

In [21]:
for i in range(n_clusters):
    print('## Label '+str(i))
    print(','.join(df[df['cluster']==i]['word']))
    print()


## Label 0
slack,step,bash,create,filter,first,format,from,group,head,in,it,json,key,log,logs,name,or,out,path,pull,remove,ruby,run,runtime,size,system,to,true,values,analytics,and,azure,bing,development,notes,of,play,processing,the,version,英語,excel,as,identity,java,management,part,service,single,token,ui,do,en,hello,length,map,on,public,request,spring,void,++,add,all,args,array,be,builder,class,code,collect,def,dependency,else,error,for,get,how,if,import,input,integration,is,load,make,master,new,not,note,object,one,output,python,return,spark,sql,storage,that,tools,use,using,value,with,word,you,zip,engineering,pm,project,calendar,function,javascript,null,your,cto,link,amazon,customer,line,next,users,demand,platform,real,by,learning,chrome,join,privacy,require,set,actions,cloud,git,push,uses,timestamp,numpy,channel,clap,emoji,tmp,end,challenges,infrastructure,models,bot,ceo,goal,what,task,manager,engineer,product,standard

## Label 1
カルチャー,フィット,フォーカス,レビュー,人材,今後,会社,体制,個人,全社,分野,募集,大学,完成,実

In [22]:
### 主成分分析
from sklearn.decomposition import PCA

pca = PCA()
pca.fit(list(df['vectors']))

PCA()

In [23]:
color = list(matplotlib.colors.TABLEAU_COLORS.items())[:10]
color = [x[0] for x in color]

feature = pca.transform(list(df['vectors']))
plt.figure(figsize=(6, 6))
for x, y, name, cluster in zip(feature[:, 0], feature[:, 1], df['word'], df['cluster']):
    plt.text(x, y, name, alpha=0.8, size=5, color=color[cluster])
plt.scatter(feature[:, 0], feature[:, 1], alpha=0.8)
plt.title("Principal Component Analysis")
plt.xlabel("The first principal component score")
plt.ylabel("The second principal component score")
plt.show()
plt.savefig('work/word_kmeans.png')