In [28]:
import os
import pickle
import random
from itertools import combinations

import networkx as nx
import pandas as pd
import requests
from gensim.models import Word2Vec
from numpy.random import choice
from tqdm import tqdm

In [2]:
# Read the Qiita API token from the QIITA_ACCESS_TOKEN environment variable so
# the real secret never has to be saved inside the notebook. The placeholder is
# kept only as a fallback for readers who prefer to paste a token in by hand.
ACCESS_TOKEN = os.environ.get(
    "QIITA_ACCESS_TOKEN", "xxxxxxxx YOUR ACCESS TOKEN xxxxxxxxxxxxx"
)

In [3]:
# Qiita API v2 expects "Authorization: Bearer <token>". Add the scheme unless
# ACCESS_TOKEN already carries it, so both token styles keep working.
auth_header = (
    ACCESS_TOKEN
    if ACCESS_TOKEN.lower().startswith("bearer ")
    else f"Bearer {ACCESS_TOKEN}"
)

# `result` ends up holding one list of lower-cased tag names per fetched article.
result = []
with requests.Session() as session:  # reuse one connection for all pages
    session.headers["Authorization"] = auth_header
    for page in tqdm(range(100)):
        # The API's page numbers are 1-based, hence `page + 1`.
        url = f"https://qiita.com/api/v2/items?page={page + 1}&per_page=100"
        response = session.get(url, timeout=30)
        # Fail with a clear HTTP error (bad token, rate limit, ...) instead of
        # an obscure TypeError while indexing an error payload below.
        response.raise_for_status()

        # extend() appends in place; `result = result + ...` re-copied the
        # whole list on every page.
        result.extend(
            [tag["name"].lower() for tag in item["tags"]]
            for item in response.json()
        )

100%|██████████| 100/100 [02:20<00:00,  1.41s/it]


In [5]:
# Peek at the tag lists of the first five fetched articles.
result[:5]

[['aws', 'adventcalendar', 'apigateway', 'chalice', 'adventcalendar2020'],
 ['atcoder', 'elixir', '競技プログラミング', 'atcoderbeginnersselection'],
 ['dart'],
 ['linux', 'setting'],
 ['python']]

In [19]:
# Undirected tag co-occurrence graph: two tags are linked when they appear on
# the same article. An article with a single tag yields no pairs and therefore
# contributes nothing here.
graph = nx.Graph()

for article_tags in result:
    graph.add_edges_from(combinations(article_tags, 2))

In [31]:
 def make_random_walks(G: nx.Graph, num_of_walk: int, length_of_walk: int):
    """
    ランダムウォークによってタグのシーケンスを取得するよ
    Args:
        G (nx.Graph): 同一記事のタグをつないだ無向グラフ
        num_of_walk (int): 各ノードに関して何回サンプリングを行うかの数
        length_of_walk (int): 1回のサンプリングの際にどれだけ歩かせるか
    """
    walks = list()
    for _ in tqdm(range(num_of_walk), total=num_of_walk):
        node_list = list(G.nodes())
        for node in node_list:
            now_node = node
            walk = list()
            walk.append(str(node))
            for _ in range(length_of_walk):
                try:
                    neighbors = list(G.neighbors(now_node))
                    if not neighbors:
                        break
                    next_node = choice(neighbors)
                except IndexError:
                    break
                walk.append(str(next_node))
                now_node = next_node
            walks.append(walk)
    return walks

In [32]:
# 100 walks from every tag, each at most 20 steps long.
walks = make_random_walks(graph, num_of_walk=100, length_of_walk=20)

100%|██████████| 100/100 [04:16<00:00,  2.56s/it]


# word2vecの学習

In [33]:
# Train skip-gram (sg=1) embeddings, treating each random walk as a "sentence"
# of tags. min_count=1 so that rarely seen tags are still given a vector.
# NOTE(review): `size` looks like the gensim 3.x keyword (later releases call
# it `vector_size`) - confirm against the installed gensim version.
model = Word2Vec(
    walks,
    sg=1,
    size=100,
    window=5,
    min_count=1,
    workers=8,
)

In [55]:
# Tags closest to 'python' in the embedding; keep the names, drop the scores.
[tag for tag, _score in model.wv.most_similar('python')]

['pymagnitude',
 'randomforest',
 'shade3d',
 '自分用',
 'magicpod',
 'robotframework',
 '内包表記',
 'リスト',
 'youtube-dl',
 'pythonで作る対話システム']

In [60]:
# Tags closest to 'azure' in the embedding; keep the names, drop the scores.
[tag for tag, _score in model.wv.most_similar('azure')]

['azuremonitor',
 'azurepurview',
 'azurepowershell',
 'applicationinsights',
 'webジョブ',
 'azuremediaservices',
 'synapseanalytics',
 'botframeworkcomposer',
 'managedidentity',
 'luis']

In [57]:
# Tags closest to '機械学習' (machine learning); keep the names, drop the scores.
[tag for tag, _score in model.wv.most_similar('機械学習')]

['確率補正',
 'calibration',
 'isotonicregression',
 'pavアルゴリズム',
 '交互作用',
 '特徴量エンジニアリング',
 'statsmodels',
 '拡張分析',
 'augmentedanalitics',
 '論文解説']

In [56]:
# Tags closest to 'elm', shown as (tag, similarity score) pairs.
model.wv.most_similar('elm')

[('functionalprogramming', 0.8857020139694214),
 ('関数合成', 0.8789885640144348),
 ('internetexplorer', 0.8745540976524353),
 ('pipe', 0.8676517605781555),
 ('elm-review', 0.8639591336250305),
 ('elmer', 0.7788057327270508),
 ('caniuse', 0.7356892824172974),
 ('ie11', 0.7347599864006042),
 ('animation', 0.684481680393219),
 ('frontend', 0.6611098051071167)]

In [70]:
# Query with two positive terms at once: tags similar to 'python' and '機械学習' (machine learning) together.
model.wv.most_similar(positive=['python', "機械学習"])

[('音声処理', 0.8429374694824219),
 ('ランダムフォレスト', 0.8065807819366455),
 ('formula1', 0.8005763292312622),
 ('勾配ブースティング', 0.7914688587188721),
 ('交互作用', 0.7776237726211548),
 ('音声強調', 0.7754710912704468),
 ('scipy', 0.7659810781478882),
 ('googleadwords', 0.7613731622695923),
 ('超解像', 0.7584062218666077),
 ('sklearn', 0.7562590837478638)]

In [74]:
# Query with two positive terms at once: tags similar to 'javascript' and 'functionalprogramming' together.
model.wv.most_similar(positive=['javascript', "functionalprogramming"])

[('elm-review', 0.8832534551620483),
 ('関数合成', 0.7859140634536743),
 ('pipe', 0.7728371024131775),
 ('webworker', 0.7684996724128723),
 ('suncalc', 0.7666928768157959),
 ('実況中継', 0.7608648538589478),
 ('elm', 0.7596915364265442),
 ('stimulus', 0.7511614561080933),
 ('作曲', 0.7468655109405518),
 ('webview2', 0.743710994720459)]

In [36]:
# Tags closest to 'scala', shown as (tag, similarity score) pairs.
model.wv.most_similar('scala')

[('cats', 0.956039309501648),
 ('http4s', 0.9476245641708374),
 ('sbt', 0.9147008061408997),
 ('pac4j', 0.9019726514816284),
 ('bigdata', 0.8832607865333557),
 ('エラーハンドリング', 0.8705583810806274),
 ('akka', 0.8525885939598083),
 ('playframework', 0.7916499376296997),
 ('openjdk', 0.7824023365974426),
 ('spark', 0.7186453342437744)]

In [37]:
# Tags closest to 'php', shown as (tag, similarity score) pairs.
model.wv.most_similar('php')

[('twig', 0.924010157585144),
 ('宇宙船演算子', 0.9136295318603516),
 ('クイズ', 0.904780924320221),
 ('realpath', 0.903666615486145),
 ('smarty', 0.9019123315811157),
 ('pdo', 0.8918749690055847),
 ('larabel', 0.8899241089820862),
 ('シリアライズ', 0.8846566081047058),
 ('mcrypt', 0.8843409419059753),
 ('入力フォーム', 0.8840829133987427)]

In [40]:
# Tags closest to 'react', shown as (tag, similarity score) pairs.
model.wv.most_similar('react')

[('component', 0.9182481169700623),
 ('webvitals', 0.9172523021697998),
 ('react-draggable', 0.916003406047821),
 ('react-infinite-scroller', 0.9066747426986694),
 ('react-color', 0.9012477397918701),
 ('無限スクロール', 0.8961509466171265),
 ('immer', 0.8947430849075317),
 ('immutable-js', 0.8842179179191589),
 ('react-dnd', 0.8682639598846436),
 ('react-dnd-html5-backend', 0.8657857775688171)]

# 中心性

In [48]:
# Eigenvector centrality of every tag in the co-occurrence graph, then the ten
# highest-scoring tags as (tag, score) pairs.
centrality_dict = nx.eigenvector_centrality(graph)
sorted(centrality_dict.items(), key=lambda pair: pair[1], reverse=True)[:10]

[('python', 0.28000768071879917),
 ('初心者', 0.2320177123993606),
 ('javascript', 0.21932383647891893),
 ('aws', 0.178129586501316),
 ('ruby', 0.1487758407289078),
 ('docker', 0.1469139551312583),
 ('rails', 0.12914160411633122),
 ('php', 0.12821083523259985),
 ('node.js', 0.1281337032843693),
 ('react', 0.11173280219079293)]