In [2]:
from urllib import request
import json

import pandas as pd
from wmfdata import mariadb, hive, utils, charting

You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


# Mobile pageviews per country for mobile-heavy wikis

In [17]:
# Download a JSON object that maps country codes to English country names.
# NOTE(review): the URL points at the `master` branch, so the names returned
# can change over time — consider pinning to a specific commit.
iso_3166_url = "https://raw.githubusercontent.com/umpirsky/country-list/master/data/en/country.json"
# Use the response as a context manager so the HTTP connection is always
# closed, even if reading or decoding fails.
with request.urlopen(iso_3166_url) as response:
    country_codes = json.loads(response.read().decode())

In [18]:
# Project domain -> human-readable wiki name for the wikis analysed below.
mh_wikis = {
    'hi.wikipedia': 'Hindi Wikipedia',
    'bn.wikipedia': 'Bangla Wikipedia',
    'id.wikipedia': 'Indonesian Wikipedia',
    'ar.wikipedia': 'Arabic Wikipedia',
    'mr.wikipedia': 'Marathi Wikipedia',
    'fa.wikipedia': 'Persian Wikipedia',
    'sw.wikipedia': 'Swahili Wikipedia',
    'tl.wikipedia': 'Tagalog Wikipedia',
    'zh.wikiquote': 'Chinese Wikiquote',
    'th.wikipedia': 'Thai Wikipedia',
    'arz.wikipedia': 'Egyptian Arabic Wikipedia',
    'ml.wikipedia': 'Malayalam Wikipedia',
    'ta.wikipedia': 'Tamil Wikipedia',
    'kn.wikipedia': 'Kannada Wikipedia',
    'pt.wiktionary': 'Portuguese Wiktionary',
    'az.wikipedia': 'Azerbaijani Wikipedia',
    'gu.wikipedia': 'Gujarati Wikipedia',
    'ky.wikipedia': 'Kyrgyz Wikipedia',
    'sq.wikipedia': 'Albanian Wikipedia',
    'ms.wikipedia': 'Malay Wikipedia'
}

mh_domains = mh_wikis.keys()

# Render the domains as a double-quoted, comma-separated SQL tuple literal,
# e.g. ("hi.wikipedia", "bn.wikipedia", ...), for use in an `in` clause.
quoted_domains = ['"{}"'.format(domain) for domain in mh_domains]
mh_sql_tuple = "({})".format(", ".join(quoted_domains))

In [19]:
# Inspect the SQL tuple literal that is interpolated into the Hive query's
# `project in ...` clause.
mh_sql_tuple

'("ms.wikipedia", "az.wikipedia", "sw.wikipedia", "ar.wikipedia", "bn.wikipedia", "ky.wikipedia", "pt.wiktionary", "sq.wikipedia", "zh.wikiquote", "mr.wikipedia", "fa.wikipedia", "th.wikipedia", "hi.wikipedia", "id.wikipedia", "arz.wikipedia", "tl.wikipedia", "ml.wikipedia", "kn.wikipedia", "gu.wikipedia", "ta.wikipedia")'

In [42]:
# Total mobile (web + app) pageviews per project and country for the twelve
# months from September 2017 through August 2018, restricted to the projects
# listed in `mh_sql_tuple`.
mh_views_sql = """
select project, country_code, sum(view_count) as pageviews
from wmf.projectview_hourly
where
    ((year = 2017 and month >= 9) or (year = 2018 and month < 9)) and
    access_method in ("mobile web", "mobile app") and
    project in {projects}
group by project, country_code
"""
mh_views = hive.run(mh_views_sql.format(projects=mh_sql_tuple))

In [50]:
# Work on a copy: plain assignment would only alias `mh_views`, so the
# in-place column relabelling in the following cells would also overwrite the
# raw query result and force a re-run of the Hive query to recover it.
mh_views_2 = mh_views.copy()

In [53]:
# Replace project domains with their display names from `mh_wikis`.
# Domains missing from `mh_wikis` become None, so running this cell a second
# time on already-relabelled data turns every value into None.
mh_views_2["project"] = mh_views_2["project"].map(mh_wikis.get)

In [81]:
# Replace country codes with the English names from `country_codes`.
# Codes missing from `country_codes` become None.
mh_views_2["country_code"] = mh_views_2["country_code"].map(country_codes.get)

In [95]:
# One row per project, one column per country, cells holding summed pageviews
# (0 where a project/country combination has no rows).
piv = mh_views_2.pivot_table(
    values="pageviews",
    index="project",
    columns="country_code",
    aggfunc="sum",
    fill_value=0,
)

In [97]:
# Order the country columns by their total pageviews across all projects,
# largest first.
column_totals = piv.sum(axis=0)
countries_sorted = column_totals.sort_values(ascending=False).index

piv = piv.reindex(columns=countries_sorted)

In [103]:
# Show the 25 leftmost country columns without pandas truncating rows or
# columns.
unlimited_display = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 500,
)
leading_countries = piv.iloc[:, :25]
with pd.option_context(*unlimited_display):
    display(leading_countries)

country_code,Indonesia,Iran,United States,India,Thailand,Saudi Arabia,Egypt,Morocco,Algeria,Iraq,Malaysia,United Arab Emirates,Jordan,Germany,Azerbaijan,United Kingdom,Kuwait,Netherlands,Bangladesh,Syria,Sudan,Philippines,Tunisia,Palestinian Territories,Lebanon
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
Albanian Wikipedia,15447,8713,3396898,65797,5813,11405,4935,8747,3929,11415,4315,9708,1746,1285392,6849,596130,1903,64473,3807,503,565,4394,1556,347,1102
Arabic Wikipedia,1151769,4006381,167810957,2338849,508856,353895675,203461036,171389017,114022000,89726984,1614239,41121271,69487144,24336244,245688,16465092,50670514,5821996,229254,44536036,36370391,192834,31676239,29754414,27487111
Azerbaijani Wikipedia,43929,194307,5550842,262200,19262,21945,22779,76259,21246,33650,10834,35866,6370,905263,57103803,1070811,5987,21865316,15175,3704,4454,6741,7068,1505,5028
Bangla Wikipedia,109311,8549,62374318,17588054,6768,745487,6779,21187,6046,23141,333094,231710,18053,355154,1372,3827503,195555,50011,45444666,234,4129,2777,892,163,22834
Chinese Wikiquote,2660,662,332425,2734,5047,473,570,1890,653,336,53932,661,111,104393,79,18794,108,4212,724,37,75,2159,151,15,64
Egyptian Arabic Wikipedia,16764,112533,3942348,39869,5301,2453622,6394481,1255516,1188503,1225630,20817,382221,780553,447526,2114,264018,557770,133703,2175,624326,394582,2272,362210,342224,408114
Gujarati Wikipedia,4867,627,2201941,7966028,1177,1723,252,1076,248,178,396,2400,66,102185,62,33215,913,1567,754,30,105,411,67,5,45
Hindi Wikipedia,446175,14139,84437246,407428758,31238,316923,4587,17324,8498,6126,51018,251392,4653,316896,1107,412549,113930,43555,28372,667,1832,4451,2074,597,1686
Indonesian Wikipedia,906065426,90411,279737727,5756010,465882,590141,345732,115953,61941,125470,13131952,154597,51542,1596125,25924,3228175,43659,670726,97645,11890,48414,184358,31134,24471,44318
Kannada Wikipedia,8992,850,4390847,14166202,1003,7653,300,1764,367,262,879,9001,77,103961,70,36684,1990,3881,1052,38,229,354,76,10,39


# Monthly editors per country

In [52]:
# Ordered list of country names used to select and order the rows of the
# per-country tables below (it is passed to `reindex`, so the order here is
# the order of the output rows). Entries must match the names in the
# `country_code` column after `toby_transforms` has been applied; "Unknown"
# is the label `toby_transforms` gives to rows with no country name.
toby_countries = [
    "China", "India", "United States", "Indonesia", "Brazil", "Pakistan", "Nigeria",
    "Bangladesh", "Russia", "Mexico", "Japan", "Philippines", "Ethiopia", "Vietnam",
    "Egypt", "Germany", "Iran", "Turkey", "Democratic Republic of the Congo",
    "Thailand", "United Kingdom", "France", "Italy", "Tanzania", "South Africa",
    "Myanmar", "South Korea", "Colombia", "Kenya", "Spain", "Ukraine", "Argentina",
    "Sudan", "Algeria", "Uganda", "Poland", "Iraq", "Canada", "Morocco",
    "Afghanistan", "Saudi Arabia", "Peru", "Venezuela", "Malaysia", "Uzbekistan",
    "Nepal", "Mozambique", "Ghana", "Yemen", "Angola", "Madagascar", "Australia",
    "Cameroon", "Ivory Coast", "Sri Lanka", "Niger", "Romania", "Burkina Faso",
    "Syria", "Mali", "Chile", "Kazakhstan", "Malawi", "Netherlands", "Zambia",
    "Guatemala", "Ecuador", "Zimbabwe", "Cambodia", "Senegal", "Chad", "Guinea",
    "South Sudan", "Rwanda", "Burundi", "Cuba", "Tunisia", "Belgium", "Benin",
    "Somalia", "Greece", "Bolivia", "Haiti", "Dominican Republic", "Czech Republic",
    "Portugal", "Azerbaijan", "Sweden", "Hungary", "Belarus", "United Arab Emirates",
    "Serbia", "Tajikistan", "Austria", "Switzerland", "Israel",
    "Honduras", "Papua New Guinea", "Jordan", "Togo", "Hong Kong", "Bulgaria",
    "Paraguay", "Sierra Leone", "Libya", "Nicaragua", "El Salvador", "Kyrgyzstan",
    "Lebanon", "Singapore", "Denmark", "Finland", "Turkmenistan", "Slovakia",
    "Eritrea", "Norway", "Central African Republic", "Costa Rica",
    "Republic of the Congo", "Ireland", "Oman", "Liberia", "New Zealand", "Croatia", "Mauritania",
    "Moldova", "Kuwait", "Panama", "Georgia", "Bosnia and Herzegovina", "Puerto Rico",
    "Uruguay", "Armenia", "Mongolia", "Albania", "Lithuania", "Jamaica",
    "Namibia", "Botswana", "Qatar", "Lesotho", "Macedonia", "Slovenia", "Gambia",
    "Latvia", "Guinea-Bissau", "Gabon", "Bahrain", "Trinidad and Tobago", "Estonia",
    "Swaziland", "Mauritius", "East Timor", "Cyprus", "Djibouti", "Fiji",
    "Equatorial Guinea", "Comoros", "Bhutan", "Guyana", "Montenegro", 
    "Solomon Islands", "Luxembourg", "Suriname", "Malta", "Bahamas", "Maldives", "Belize",
    "Iceland", "French Polynesia", "Barbados", "Vanuatu", "New Caledonia", "Samoa",
    "Sao Tome and Principe", "Saint Lucia", "Guam", "Kiribati", "Grenada", "Tonga",
    "United States Virgin Islands", "Aruba", "Seychelles", "Antigua and Barbuda",
    "Dominica", "Andorra", "Bermuda", "Cayman Islands", "Greenland",
    "Saint Kitts and Nevis", "Marshall Islands", "Monaco", "Liechtenstein", "Taiwan", "Unknown"
]

In [45]:
# Per-country editor counts for 2018-01 through 2018-08. The sums are divided
# by 8 (the number of months in that range) to give a monthly average.
co_eds_sql = """
select
    country_code,
    sum(distinct_editors) / 8 as monthly_editors,
    sum(if(users_are_anonymous = false, distinct_editors, 0)) / 8 as monthly_registered_editors
from wmf.geoeditors_monthly
where 
    month >= "2018-01" and
    month < "2018-09"
group by country_code
"""
co_eds_r = hive.run(co_eds_sql)

# Keep the raw result untouched; relabel country codes on a copy.
# Codes missing from `country_codes` become None.
co_eds = co_eds_r.copy()
co_eds["country_code"] = co_eds["country_code"].map(country_codes.get)

co_eds.head()

Unnamed: 0,country_code,monthly_editors,monthly_registered_editors
0,,10109.25,9488.75
1,Andorra,71.0,22.875
2,United Arab Emirates,3204.75,950.125
3,Afghanistan,381.0,144.375
4,Antigua & Barbuda,20.75,5.875


In [48]:
# Rows whose country code had no entry in `country_codes`.
co_eds[co_eds["country_code"].isnull()]

Unnamed: 0,country_code,monthly_editors,monthly_registered_editors
0,,10109.25,9488.75
34,,0.125,0.125


In [49]:
# Several rows can end up with a null country name (codes that are missing
# from `country_codes`). Only one null row can be kept, because the later
# relabelling to "Unknown" followed by `set_index(...).reindex(...)` needs
# unique country names.
#
# Keep the unmapped row with the most editors and drop the rest. Selecting by
# condition instead of a hard-coded index label keeps the cell correct when the
# query returns rows in a different order, and makes it safe to re-run (a
# second run finds at most one unmapped row and drops nothing).
# NOTE(review): assumes the largest unmapped row is the generic "unknown
# country" bucket and the others are negligible — confirm against the raw codes.
unmapped = co_eds[co_eds["country_code"].isnull()]
minor_unmapped = unmapped.sort_values("monthly_editors", ascending=False).index[1:]
co_eds = co_eds.drop(minor_unmapped)

In [69]:
# Maps country names as produced by the `country_codes` lookup (keys) to the
# spellings used in `toby_countries` (values). Keys must match the looked-up
# names exactly, including accents and the typographic apostrophe.
toby_transforms = {
    "Antigua & Barbuda": "Antigua and Barbuda",
    "Bosnia & Herzegovina": "Bosnia and Herzegovina",
    "Myanmar (Burma)": "Myanmar",
    "Trinidad & Tobago": "Trinidad and Tobago",
    "Congo - Kinshasa": "Democratic Republic of the Congo",
    "Congo - Brazzaville": "Republic of the Congo",
    "Côte d’Ivoire": "Ivory Coast",
    "Czechia": "Czech Republic",
    "Hong Kong SAR China": "Hong Kong",
    "São Tomé & Príncipe": "Sao Tome and Principe",
    "Timor-Leste": "East Timor",
    # Rows with no country name are labelled "Unknown".
    None: "Unknown"
}

In [70]:
# Normalise country names to the spellings used in `toby_countries`.
# The replacement is applied to every column of the frame, not only
# `country_code`.
co_eds = co_eds.replace(toby_transforms)

In [71]:
# Countries present in the editor data but absent from `toby_countries`,
# largest first — these rows are lost when the data is reindexed to that list.
unlisted_countries = (
    co_eds
    .query("country_code not in @toby_countries")
    .sort_values("monthly_editors", ascending=False)
)
unlimited_display = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 500,
)
with pd.option_context(*unlimited_display):
    display(unlisted_countries)

Unnamed: 0,country_code,monthly_editors,monthly_registered_editors
145,Macau SAR China,450.375,120.75
180,Palestinian Territories,431.75,141.875
185,Réunion,264.375,95.75
28,Brunei,126.5,27.125
123,Laos,120.5,32.625
87,Guadeloupe,116.75,27.125
147,Martinique,106.5,32.75
102,Isle of Man,104.75,28.125
109,Jersey,77.125,27.75
74,Faroe Islands,66.25,23.125


In [73]:
# Index by country name, then select and order rows to match `toby_countries`.
# Countries in the list with no data get NaN rows.
co_eds_by_country = co_eds.set_index("country_code")
toby_co_eds = co_eds_by_country.reindex(toby_countries)
toby_co_eds.head(10)

Unnamed: 0_level_0,monthly_editors,monthly_registered_editors
country_code,Unnamed: 1_level_1,Unnamed: 2_level_1
China,6213.875,2978.25
India,59107.0,18885.25
United States,199287.625,64144.0
Indonesia,11170.25,3839.25
Brazil,34509.875,7805.375
Pakistan,5712.375,1931.375
Nigeria,1699.5,1025.5
Bangladesh,4348.875,2017.375
Russia,42076.0,12119.5
Mexico,18861.0,3504.25


In [88]:
# Spot-check a single row; "Taiwan" is one of the entries in `toby_countries`.
toby_co_eds.loc["Taiwan"]

monthly_editors               17995.00
monthly_registered_editors     4505.75
Name: Taiwan, dtype: float64

# Monthly unique devices per country

In [90]:
# Per-country unique-device estimates for September 2017 through August 2018.
# The sum is divided by 12 (the number of months in that range) to give a
# monthly average.
co_uds_sql = """
select
    country_code,
    country,
    sum(uniques_estimate) / 12 as monthly_unique_devices
from wmf.unique_devices_per_domain_monthly
where
    ((year = 2017 and month >= 9) or (year = 2018 and month < 9))
group by country_code, country
"""
co_uds_r = hive.run(co_uds_sql)

In [97]:
# Drop the table's own `country` column; names are derived from
# `country_code` instead. `drop` returns a new frame, so `co_uds_r` is
# left unchanged.
co_uds = co_uds_r.drop(columns="country")

In [99]:
# Replace country codes with the English names from `country_codes`.
# Codes missing from `country_codes` become None.
co_uds["country_code"] = co_uds["country_code"].map(country_codes.get)

In [101]:
# Normalise country names to the spellings used in `toby_countries`
# (including None -> "Unknown"). Applied to every column of the frame.
co_uds = co_uds.replace(toby_transforms)

In [122]:
# Sum rows that share a country name (several codes can map to the same
# label), then select and order rows to match `toby_countries`.
uds_by_country = co_uds.groupby("country_code").sum()
toby_co_uds = uds_by_country.reindex(toby_countries)

In [123]:
# Preview the first ten rows of the per-country unique-device table.
toby_co_uds.head(10)

Unnamed: 0_level_0,monthly_unique_devices
country_code,Unnamed: 1_level_1
China,28707960.0
India,117471600.0
United States,343337800.0
Indonesia,35083750.0
Brazil,60444630.0
Pakistan,8057592.0
Nigeria,5066659.0
Bangladesh,4326530.0
Russia,69135040.0
Mexico,44474410.0
