In [99]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

In [18]:
response = requests.get("https://en.wikipedia.org/wiki/List_of_Wikipedias")
soup = BeautifulSoup(response.text)

In [52]:
# Get the header table names
headers = [header.text for header in soup.findAll("table")[2].findAll("th")]
lines = []

# Scrape the table of wikipedia language codes
for line in soup.findAll("table")[2].find("tbody").findAll("tr")[1:]:
    lines.append([td.text for td in line.findAll("td")])
    
len(lines)

In [54]:
wiki_lang_codes_df = pd.DataFrame(lines, columns=headers)
display(wiki_lang_codes_df.head())

Unnamed: 0,Language,Language (local),Wiki,Articles,Total,Edits,Admins,Users,Active users,Images,Depth
0,English,English,en,6185774,51861688,982621746,1123,40253563,130385,891140,1033
1,Cebuano,Cebuano,ceb,5350432,9825317,29958475,6,72896,168,0,2
2,Swedish,svenska,sv,3587709,7632177,48332470,59,749263,2362,0,8
3,German,Deutsch,de,2497237,6962028,203831825,191,3571871,18741,129406,93
4,French,français,fr,2264246,11006226,175823561,158,3924760,18994,62677,238


In [65]:
# Official languages. I recognize that countries have regional languages, but that was beyond the
# scope of the covid data, wikipedia data, and this course
response = requests.get("https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory")
soup = BeautifulSoup(response.text)

In [117]:
lang_country_dict = {}

for line in soup.findAll("table")[1].find("tbody").findAll("tr")[1:]:
    if line.find("a"):
        country = line.find("a").text
        languages = [lang.text for lang in line.findAll("li")]
        for lang in languages:
            clean_language = re.search("(\w+)?", lang).group(1)
            if clean_language in lang_country_dict:
                lang_country_dict[clean_language].append(country)
            else:
                lang_country_dict[clean_language] = [country]


In [122]:
wiki_lang_codes_df["Countries"] = wiki_lang_codes_df["Language"].map(lang_country_dict)

In [123]:
wiki_lang_codes_df

Unnamed: 0,Language,Language (local),Wiki,Articles,Total,Edits,Admins,Users,Active users,Images,Depth,Countries
0,English,English,en,6185774,51861688,982621746,1123,40253563,130385,891140,1033,"[Botswana, Brunei, Burundi, Cameroon, Canada, ..."
1,Cebuano,Cebuano,ceb,5350432,9825317,29958475,6,72896,168,0,2,[Philippines]
2,Swedish,svenska,sv,3587709,7632177,48332470,59,749263,2362,0,8,"[Finland, Finland]"
3,German,Deutsch,de,2497237,6962028,203831825,191,3571871,18741,129406,93,"[Belgium, Belgium, Brazil, Czech Republic, Ger..."
4,French,français,fr,2264246,11006226,175823561,158,3924760,18994,62677,238,"[Andorra, Belgium, Belgium, Burundi, Cameroon,..."
...,...,...,...,...,...,...,...,...,...,...,...,...
309,Nuosu,ꆇꉙ,ii,3,189,11653,1,2028,1,0,——,
310,Afar,Qafár af,aa,1,510,4685,1,4059,2,0,——,[Ethiopia]
311,Muscogee,Mvskoke,mus,1,115,3603,1,2345,1,0,——,
312,Herero,Otsiherero,hz,0,176,4483,1,3759,2,0,——,


In [124]:
wiki_lang_codes_df.to_csv("wiki_lang_codes_df.csv")

In [2]:
wiki_lang_codes = pd.read_csv("data/wiki_lang_codes.csv")

# make into integers
wiki_lang_codes['Users'] = wiki_lang_codes['Users'].apply(atof)


wiki_langs = wiki_lang_codes['Wiki']
langs = wiki_lang_codes['Language']

In [54]:
# did this manually
wiki_pandemic_lang_name = {
    "en": "COVID-19_pandemic",
    "es": "Pandemia_de_COVID-19",
    "fr": "Pandémie_de_Covid-19",
    "de": "COVID-19-Pandemie",
    "zh": "2019冠状病毒病",
    "ru": "Пандемия_COVID-19", 
    "pt": "Pandemia_de_COVID-19", 
    "it": "Pandemia_di_COVID-19_del_2019-2020",
    "ar": "جائحة_فيروس_كورونا_2019–20",
    "ja": "新型コロナウイルス感染症_(2019年)",
    "tr": "COVID-19_pandemisi",
    "id": "Pandemi_COVID-19",
    "nl": "Coronapandemie",
    "pl": "Pandemia_COVID-19",
    "simple": "COVID-19_pandemic",
    "fa": "دنیاگیری_کروناویروس",
    "vi": "Đại_dịch_COVID-19",
    "sv": "Coronaviruspandemin_2019–2020",
    "he": "מגפת_הקורונה",
    "ko": "코로나19_범유행"
}

In [59]:
for wiki_lang, lang in zip(wiki_langs[17:], langs[17:]):
    print("----------" + lang + "----------")
    
    if wiki_lang in wiki_pandemic_lang_name:
        search = wiki_pandemic_lang_name[wiki_lang]
    else:
        continue
        
    offset = None    
    dfs = []
    
    while True:
        endpoint = f"https://{wiki_lang}.wikipedia.org/w/index.php?title=Special:Export"
        if offset: query = f"&pages={search}&offset={offset}&action=submit"
        else: query = f"&pages={search}&action=submit"

        print(endpoint+query)

        response = requests.post(endpoint+query)
        doc = xmltodict.parse(response.text)
        
        if 'page' not in doc['mediawiki']:
            break
            
        df = pd.json_normalize(doc['mediawiki']['page']['revision'])
        dfs.append(df)

        offset = df.iloc[-1].timestamp

        if len(df) < 1000: break

    wiki_df = pd.concat(dfs)
    wiki_df.reset_index(inplace=True)
    wiki_df['timestamp'] = pd.to_datetime(wiki_df['timestamp'])
    wiki_df['wiki_lang'] = wiki_lang
    wiki_df['language'] = lang
    wiki_df.sort_values(by="timestamp", ascending=True, inplace=True)   
    
    print(len(wiki_df))
    
    wiki_df.to_csv(f"data/wiki_edits/wiki_edits_{wiki_lang}.csv")

----------Portuguese----------
https://pt.wikipedia.org/w/index.php?title=Special:Export&pages=Pandemia_de_COVID-19&action=submit
https://pt.wikipedia.org/w/index.php?title=Special:Export&pages=Pandemia_de_COVID-19&offset=2020-10-26T02:25:58Z&action=submit
1000
----------Persian----------
https://fa.wikipedia.org/w/index.php?title=Special:Export&pages=دنیاگیری_کروناویروس&action=submit
542
----------Catalan----------
----------Serbian----------
----------Indonesian----------
https://id.wikipedia.org/w/index.php?title=Special:Export&pages=Pandemi_COVID-19&action=submit
https://id.wikipedia.org/w/index.php?title=Special:Export&pages=Pandemi_COVID-19&offset=2020-05-07T22:14:30Z&action=submit
1099
----------Norwegian (Bokmål)----------
----------Korean----------
https://ko.wikipedia.org/w/index.php?title=Special:Export&pages=코로나19_범유행&action=submit
https://ko.wikipedia.org/w/index.php?title=Special:Export&pages=코로나19_범유행&offset=2020-03-06T10:39:19Z&action=submit
1350
----------Finnish------

In [None]:
sns.kdeplot(data=wiki_df, x="timestamp", bw_adjust=.3)
plt.title(f"Frequency of edits of {search} Wikipedia page")

In [None]:
wiki_df.to_csv("data/wiki_edits/wiki_edits_en.csv")

In [20]:
doc

OrderedDict([('mediawiki',
              OrderedDict([('@xmlns',
                            'http://www.mediawiki.org/xml/export-0.10/'),
                           ('@xmlns:xsi',
                            'http://www.w3.org/2001/XMLSchema-instance'),
                           ('@xsi:schemaLocation',
                            'http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd'),
                           ('@version', '0.10'),
                           ('@xml:lang', 'ceb'),
                           ('siteinfo',
                            OrderedDict([('sitename', 'Wikipedia'),
                                         ('dbname', 'cebwiki'),
                                         ('base',
                                          'https://ceb.wikipedia.org/wiki/Unang_Panid'),
                                         ('generator',
                                          'MediaWiki 1.36.0-wmf.14'),
                                         