In [41]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import json
import time
import pickle
import requests

In [90]:
class WikipediaAPIHandler(object):
    """Small client for the Japanese Wikipedia ``list=categorymembers`` query.

    Only ``get_page_in_category`` is meant to be called from outside; the
    underscore-prefixed methods are helpers.
    """

    # Namespace ids found in the ``ns`` field of each category member.
    NS_ARTICLE = 0
    NS_CATEGORY = 14

    def __init__(self):
        self.base_query = "https://ja.wikipedia.org/w/api.php?action=query&format=json"

    def get_page_in_category(self, category):
        """Fetch all direct members of ``Category:<category>``.

        Follows the API's ``continue`` block so that categories with more
        members than ``cmlimit`` are not silently truncated.

        Args:
            category: Category title without the ``Category:`` prefix.

        Returns:
            ``{"article": [titles], "subcategory": [names without prefix]}``,
            or ``None`` if any request needed to build the result failed.
        """
        # Function-scope import keeps this class self-contained in the notebook.
        from urllib.parse import quote

        # Percent-encode the title so characters such as '&', '#' or '+'
        # cannot break or truncate the query string.
        encoded_category = quote(category, safe="")
        query = f"{self.base_query}&cmlimit=500&list=categorymembers&cmtitle=Category:{encoded_category}"

        articles = []
        subcategories = []
        page_query = query
        while True:
            resp_json = self._response(page_query)
            if resp_json is None:
                return None

            articles.extend(self._get_member_article(resp_json))
            subcategories.extend(self._get_subcategory(resp_json))

            continuation = resp_json.get("continue")
            if not continuation:
                break
            # Echo every continuation parameter back on top of the original query.
            page_query = query + "".join(
                f"&{quote(str(key), safe='')}={quote(str(value), safe='')}"
                for key, value in continuation.items()
            )

        return {"article": articles, "subcategory": subcategories}

    def _response(self, query, error_wait=10, max_trial=10):
        """GET ``query`` and return the decoded JSON body.

        Args:
            query: Full request URL.
            error_wait: Seconds to sleep between failed attempts.
            max_trial: Maximum number of attempts.

        Returns:
            The decoded JSON, or ``None`` when every attempt failed or the
            body was not valid JSON.
        """
        for attempt in range(max_trial):
            if attempt > 0:
                # Sleep only between attempts, not after the final failure.
                time.sleep(error_wait)

            try:
                # A timeout prevents one stalled connection from hanging the crawl.
                resp = requests.get(query, timeout=30)
                # Without this call requests never raises HTTPError.
                resp.raise_for_status()
                return resp.json()
            except (requests.ConnectionError, requests.HTTPError,
                    requests.Timeout, requests.TooManyRedirects) as e:
                print(e.args)
            except ValueError as e:
                # JSON decode errors from json, simplejson and requests all
                # derive from ValueError; retrying a malformed body is pointless.
                print(e.args)
                return None

        print("ERROR.")
        return None

    @staticmethod
    def _category_members(response):
        """Return the ``categorymembers`` list, or ``[]`` if the payload lacks it."""
        return (response.get('query') or {}).get('categorymembers') or []

    def _get_member_article(self, response):
        """Return titles of members whose ``ns`` equals ``NS_ARTICLE``."""
        return [member['title'] for member in self._category_members(response)
                if member['ns'] == self.NS_ARTICLE]

    def _get_subcategory(self, response):
        """Return names of members whose ``ns`` equals ``NS_CATEGORY``, prefix stripped."""
        return [re.sub(r'^Category:', '', member['title'])
                for member in self._category_members(response)
                if member['ns'] == self.NS_CATEGORY]

In [80]:
# Read the pickled noun collection and keep it as a set, since later cells
# intersect it with category names and article titles.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted dumps.
with open("../dump/noun_list.pickle", 'br') as noun_file:
    loaded_nouns = pickle.load(noun_file)
noun_set = set(loaded_nouns)

In [91]:
# Shared API client instance used by the category crawl in the next cell.
wiki_handler = WikipediaAPIHandler()

In [94]:
# Crawl the category tree starting from the seed categories, following only
# subcategories whose names are in `noun_set`.
category_list = pd.read_csv("../data/category_list.csv").category.tolist()
category_list = list(set(category_list) & noun_set)

i = 0
article_list = []        # one list of article titles per crawled category
searched_category = []   # categories in the order they were requested (kept as a list)
# Every name ever queued. The set makes the "already handled?" check O(1)
# instead of scanning `searched_category`, and avoids rebuilding the whole
# pending queue after each category.
queued_category = set(category_list)
while category_list:
    category = category_list.pop()
    searched_category.append(category)

    c_info = wiki_handler.get_page_in_category(category)
    time.sleep(1)  # stay polite to the API between categories

    if c_info is None:
        # Best-effort crawl: a failed category is skipped and not retried.
        continue

    print(i, "\tcategory:", category)

    if c_info['article']:
        article_list.append(c_info['article'])

    # Queue only subcategories that are nouns and have never been queued before.
    new_categories = (set(c_info['subcategory']) & noun_set) - queued_category
    queued_category |= new_categories
    category_list.extend(new_categories)

    i += 1

0 	category: 着色料
1 	category: アンドロゲン
2 	category: バニロイド
3 	category: ウイスキー
4 	category: 通貨記号
5 	category: 塩化物
6 	category: 解毒剤
7 	category: 錯体化学
8 	category: アミノグリコシド系抗生物質
9 	category: 肥料
10 	category: タンパク質
11 	category: アゾ染料
12 	category: 酒
13 	category: 受容体
14 	category: 細胞接着分子
15 	category: インドールアルカロイド
16 	category: 薬理学
17 	category: 有毒ガス
18 	category: 光学材料
19 	category: ピレスロイド
20 	category: 高分子
21 	category: 栄養素
22 	category: シクリトール
23 	category: 漂白剤
24 	category: 非ステロイド性抗炎症薬
25 	category: 安息香酸エステル
26 	category: ワクチン
27 	category: アストラゼネカ
28 	category: 大腸菌
29 	category: ヘプトース
30 	category: マグネシウム
31 	category: リガンド
32 	category: スタチン
33 	category: 燃料
34 	category: エネルギー貯蔵
35 	category: カチオン
36 	category: 電池
37 	category: 超酸化物
38 	category: 気分安定薬
39 	category: 置換反応
40 	category: ウラン
41 	category: リチウム
42 	category: 防腐剤
43 	category: 免疫抑制剤
44 	category: アヌレン
45 	category: 向精神薬
46 	category: バイオディーゼル
47 	category: 鋼
48 	category: パーキンソン病
49 	category: シクロアルケン
50 	category: 化学
51 	cat

In [126]:
# Compound names to exclude from the final noun list.
compound_frame = pd.read_csv("../data/compound_list.csv")
compound_set = set(compound_frame['compound'].tolist())

In [132]:
# `article_list` holds one list of titles per crawled category; flatten it one
# level inline, because no `flatten` helper is defined or imported in this
# notebook and the cell would otherwise fail on a fresh kernel.
article_titles = [title for titles in article_list for title in titles]

# Nouns that appear as a crawled category name or article title, excluding
# compounds. Sorted so the exported list is identical across runs (plain set
# iteration order for strings varies between interpreter sessions).
noun_in_category_and_title = sorted(
    (set(article_titles + searched_category) & noun_set) - compound_set
)

In [135]:
# Write the selected nouns to CSV as a single "noun" column, without the index.
noun_frame = pd.DataFrame({"noun": noun_in_category_and_title})
noun_frame.to_csv("../data/noun_list_in_category_and_title.csv", index=False)

In [136]:
# Display how many nouns were selected.
len(noun_in_category_and_title)

2109