In [1]:
import requests
from pprint import pprint
import json
import re
import datetime
import calendar
import math
import warnings

import numpy as np
import pandas as pd
import geopandas as gpd
import geopandas.tools as gts
import folium
from shapely import wkt
# Install a blanket "ignore" filter so no warning of any category is shown
# from this point on.
# NOTE(review): this presumably also hides deprecation warnings emitted by the
# libraries used below; confirm that is intended.
warnings.simplefilter('ignore')


In [352]:
def query_views(name, y_start, m_start, y_end, m_end):
    """Fetch monthly page-view counts for one Japanese Wikipedia article.

    Calls the Wikimedia REST ``metrics/pageviews/per-article`` endpoint for
    ``ja.wikipedia.org`` (``all-access`` / ``all-agents``, ``monthly``
    granularity) from the first day of ``y_start``/``m_start`` to the last
    day of ``y_end``/``m_end``.

    Args:
        name: Article title, e.g. ``"岸田文雄"``.
        y_start: Year in which the range starts.
        m_start: Month (1-12) in which the range starts.
        y_end: Year in which the range ends.
        m_end: Month (1-12) in which the range ends.

    Returns:
        A dict holding ``"name"`` (the title as passed in) followed by one
        ``"YYYYMM"`` key per item returned by the API, mapped to the integer
        view count. ``None`` is returned when the request fails, the body is
        not JSON, or the payload lacks the expected ``"items"`` structure.
    """
    from urllib.parse import quote

    lastday = calendar.monthrange(y_end, m_end)[1]
    headers = {"User-Agent": "***.gmail.com"}

    # The title is a single path segment: spaces become underscores and every
    # reserved character (including "/") is percent-encoded so that it cannot
    # be mistaken for a path separator or query delimiter.
    article = quote(name.replace(" ", "_"), safe="")
    url = (
        "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
        "ja.wikipedia.org/all-access/all-agents/"
        f"{article}/monthly/{y_start}{m_start:02}01/{y_end}{m_end:02}{lastday:02}"
    )

    # One request only, bounded by a timeout so a stalled connection cannot
    # hang a long scraping run. Transport errors and non-JSON bodies are
    # reported the same way as a missing article: by returning None.
    try:
        response = requests.get(url, headers=headers, timeout=30).json()
    except (requests.RequestException, ValueError):
        return None

    views = {"name": name}
    try:
        for item in response["items"]:
            # Timestamps look like "2022010100"; the first six chars are YYYYMM.
            views[item["timestamp"][:6]] = int(item["views"])
    except KeyError:
        # Error payloads (e.g. unknown article) carry no "items" key.
        return None

    return views


In [353]:
# Example call: monthly view counts of the "岸田文雄" article for all of 2022.
query_views(name="岸田文雄", y_start=2022, m_start=1, y_end=2022, m_end=12)


{'name': '岸田文雄',
 '202201': 170556,
 '202202': 113935,
 '202203': 154424,
 '202204': 85190,
 '202205': 180584,
 '202206': 137905,
 '202207': 420028,
 '202208': 261006,
 '202209': 152757,
 '202210': 184160,
 '202211': 130950,
 '202212': 130548}

In [2]:
# Municipality table; the scraping loop reads its "都道府県名（漢字）" and
# "市区町村名（漢字）" columns to list the municipalities of each prefecture.
sityouson_csv = pd.read_csv("./sityouson.csv")

# The 47 prefecture names, in the order in which they are processed.
prefs = [
    "北海道", "青森県", "岩手県", "宮城県", "秋田県", "山形県",
    "福島県", "茨城県", "栃木県", "群馬県", "埼玉県", "千葉県",
    "東京都", "神奈川県", "新潟県", "富山県", "石川県", "福井県",
    "山梨県", "長野県", "岐阜県", "静岡県", "愛知県", "三重県",
    "滋賀県", "京都府", "大阪府", "兵庫県", "奈良県", "和歌山県",
    "鳥取県", "島根県", "岡山県", "広島県", "山口県", "徳島県",
    "香川県", "愛媛県", "高知県", "福岡県", "佐賀県", "長崎県",
    "熊本県", "大分県", "宮崎県", "鹿児島県", "沖縄県",
]

In [None]:
# For every prefecture, download the wikitext of its "<prefecture>出身の人物一覧"
# page, pick out the people whose bullet line mentions a municipality of that
# prefecture, fetch their 2022 page views and checkpoint everything collected
# so far to ./views.csv.

# First "[[...]]" wiki link on a line (raw string: the backslashes belong to
# the regex, not to Python string escapes).
wikilink_pattern = re.compile(r'(?<=\[\[).+?(?=\]\])')

# Rows gathered across all prefectures; the CSV is rebuilt from this list
# instead of growing a DataFrame row by row (DataFrame.append was removed in
# pandas 2.0 and was quadratic anyway).
all_views = []

for pref in prefs:
    views_list = []

    in_pref = sityouson_csv["都道府県名（漢字）"] == pref
    sityousons = list(sityouson_csv.loc[in_pref, "市区町村名（漢字）"])

    response = requests.get(
        f"https://ja.wikipedia.org/wiki/{pref}出身の人物一覧?action=cirrusdump",
        timeout=30,
    )
    # Fail with a clear HTTP error rather than an obscure JSON/KeyError below.
    response.raise_for_status()
    results = json.loads(response.text)[0]["_source"]["source_text"].split("\n")

    for result in results:
        # Keep only bullet lines that list a person: no markup tags, and a
        # bullet directly followed by a wiki link.
        if "<" in result:
            continue
        if ("* [[" not in result) and ("*[[" not in result):
            continue
        result = result.replace("：", ":")

        # Person name = target of the first wiki link (text before any "|").
        match = wikilink_pattern.search(result)
        if match is None:
            # "[[" without a closing "]]": nothing usable on this line.
            continue
        name = match.group(0).split("|")[0]

        # Birthplace = a municipality of this prefecture named on the line.
        # When several names occur, the one latest in `sityousons` wins.
        place = None
        for sityouson in sityousons:
            if sityouson in result:
                place = sityouson
        if place is None:
            continue

        views = query_views(name, 2022, 1, 2022, 12)
        if views is None:
            continue
        views["pref"] = pref
        views["place"] = place

        views_list.append(views)

    # Checkpoint after each prefecture. Nothing is written while no row has
    # been collected yet, so a prefecture with zero hits no longer crashes.
    all_views.extend(views_list)
    if all_views:
        df_views = pd.DataFrame(all_views)
        df_views.to_csv("./views.csv", index=False)

    print(pref, len(views_list))


In [8]:
# Reload the scraped view counts from disk and preview the first five rows.
df_views = pd.read_csv(filepath_or_buffer="./views.csv")
df_views.head(n=5)


Unnamed: 0,name,202201,202202,202203,202204,202205,202206,202207,202208,202209,202210,202211,202212,pref,place
0,田村元,4561.0,3682.0,3021.0,3190.0,3313.0,3386.0,7561.0,4476.0,3963.0,3465.0,2963.0,3794.0,三重県,松阪市
1,浜田国松,2224.0,833.0,612.0,600.0,743.0,649.0,902.0,1003.0,790.0,734.0,775.0,1004.0,三重県,伊勢市
2,斎藤十朗,885.0,747.0,644.0,659.0,783.0,980.0,1364.0,1021.0,844.0,736.0,760.0,685.0,三重県,伊賀市
3,岡田克也,9597.0,9967.0,8750.0,7532.0,17326.0,13613.0,25701.0,50265.0,40057.0,20929.0,18153.0,10838.0,三重県,四日市市
4,川崎二郎,1811.0,1645.0,1437.0,1292.0,1249.0,1658.0,2295.0,2300.0,1592.0,1416.0,1548.0,1481.0,三重県,伊賀市


In [None]:
# Reduce the view data to one row per municipality: the person with the
# largest total number of views over the monthly columns.

# Monthly columns are the ones whose label is exactly six digits ("YYYYMM").
# Selecting them by name keeps the sum correct regardless of column order.
month_cols = df_views.filter(regex=r"^\d{6}$").columns
if len(month_cols) == 0:
    # This cell rebinds `df_views` to a reduced frame, so running it twice
    # without reloading would otherwise sum nothing and report 0 everywhere.
    raise ValueError(
        "df_views has no monthly 'YYYYMM' columns; reload ./views.csv before running this cell"
    )

df_views = df_views.drop_duplicates()
df_views = df_views.fillna(0)
df_views["sum_views"] = df_views[month_cols].sum(axis=1)
# One name per (pref, place, sum_views) combination.
df_views = df_views.groupby(["pref", "place", "sum_views"]).name.first().reset_index()
# Highest total per municipality.
tmp = df_views.groupby(["pref", "place"])["sum_views"].max().reset_index()
# Join on the prefecture as well: municipality names are not unique across
# prefectures, and matching on place + sum_views alone can pair a row with the
# maximum of a same-named municipality elsewhere. Column order of the result
# stays name, place, sum_views, pref.
df_views_group = pd.merge(
    df_views[["name", "place", "sum_views", "pref"]],
    tmp,
    on=["pref", "place", "sum_views"],
)

# Geocode each municipality from the string "<pref> <place>".
df_geo = gts.geocode(df_views_group.apply(lambda x: f'{x["pref"]} {x["place"]}', axis=1),
                     provider='nominatim', user_agent='test')
df_views_group = pd.concat([df_views_group, df_geo], axis=1)

df_views_group.sort_values("sum_views", ascending=False).to_csv("./views_group.csv", index=False)


In [10]:
# Reload the per-municipality table written to disk and preview the first five rows.
df_views_group = pd.read_csv(filepath_or_buffer="./views_group.csv")
df_views_group.head(n=5)


Unnamed: 0,name,place,sum_views,pref,geometry,address
0,安倍晋三,長門市,8012886.0,山口県,POINT (131.1821587 34.3708941),"長門市, 山口県, 日本"
1,安倍晋三,加古川市,8012886.0,兵庫県,POINT (134.8498955 34.786771),"加古川市, 兵庫県, 日本"
2,源頼朝,鴨川市,5507028.0,千葉県,POINT (140.098692 35.1140584),"鴨川市, 千葉県, 日本"
3,上島竜兵,丹波市,4011846.0,兵庫県,POINT (135.0363959 35.1771828),"丹波市, 兵庫県, 日本"
4,源頼家,鎌倉市,3739391.0,神奈川県,POINT (139.54442 35.329564),"鎌倉市, 神奈川県, 日本"


In [11]:
# Wrap the table in a GeoDataFrame, turn the WKT text of the "geometry" column
# into shapely objects, then keep only the rows whose geometry is not empty.
df_views_group = gpd.GeoDataFrame(df_views_group)
df_views_group["geometry"] = df_views_group["geometry"].map(wkt.loads)
df_views_group = df_views_group.loc[~df_views_group["geometry"].is_empty]


In [18]:
# Draw one translucent circle per municipality; the tooltip shows the person's
# name and total view count.
m = folium.Map(location=[35.6589, 139.7306], tiles='cartodbpositron', zoom_start=10)
for row in df_views_group.itertuples():
    total_views = row.sum_views

    # Radius: views / 100000 with a floor of 3, capped by log base 1.1 of the
    # view count divided by 3. max(total_views, 1) keeps math.log defined when
    # the total is 0 (it previously raised "math domain error"); for totals of
    # 1 or more the result is the same as before.
    linear_radius = max(total_views / 100000, 3)
    log_cap = math.log(max(total_views, 1), 1.1) / 3

    folium.CircleMarker(
        (row.geometry.y, row.geometry.x),
        tooltip=f"{row.name}</br>{int(total_views):,}",
        stroke=False,
        fillOpacity=0.2,
        fill_color='#8b0000',
        radius=min(linear_radius, log_cap),
    ).add_to(m)

m
