In [1]:
# Setup packages and file paths 
import requests
import pandas as pd
import time 

# Directory holding the persisted network data; downloadMultiLangWikiNetwork
# reads and rewrites `path + data_file`.
# NOTE(review): absolute, machine-specific path - the notebook will not run
# elsewhere without editing it.
path = "/home/teijehidde/Documents/Git Blog and Coding/data_dump/"
# Name of the JSON file (inside `path`) that stores all downloaded networks.
data_file = "data_new6.json"

In [4]:
def downloadWikiNetwork(node_title, lang = "en", session = None):
    """Download the ego network of one Wikipedia page via the MediaWiki API.

    Two paginated queries are sent to ``https://<lang>.wikipedia.org/w/api.php``:
    one for the page ``node_title`` itself and one (``generator=links``) for
    the main-namespace pages it links to. Every raw JSON response is kept and
    then folded into a single DataFrame with one row per page id.

    Parameters
    ----------
    node_title : str
        Title of the central ("ego") page; underscores or spaces are accepted.
    lang : str, default "en"
        Wikipedia language edition to query.
    session : object, optional
        Object exposing ``get(url=..., params=...)`` whose result provides
        ``raise_for_status()`` and ``json()`` (e.g. a ``requests.Session``).
        When omitted, a ``requests.Session`` is created and closed here.

    Returns
    -------
    tuple
        ``(network_data_df, wiki_data)``: the DataFrame (indexed by page id as
        string, columns title/lang/pageid/uniqueid/lastrevid/links/langlinks/ego)
        and the list of raw response dicts in download order.

    Raises
    ------
    RuntimeError
        If a response body contains an ``error`` entry.
    """
    api_endpoint = "https://" + lang + ".wikipedia.org/w/api.php" # fr.wikipedia.org; https://en.wikipedia.org
    print("Starting download of network " + node_title + " in language " + lang + ".")

    # 1: query for the node_title wikipage itself.
    params_node_title = {
        "action": "query",
        "titles": node_title,
        "prop": "info|links|langlinks",
        "plnamespace": 0,
        "pllimit": 500,
        "lllimit": 500,
        "format": "json"
    }
    # 2: query for the pages that node_title links to.
    params_network_title = {
        "action": "query",
        "generator": "links",
        "titles": node_title,
        "gplnamespace": 0,
        "gpllimit": 500,
        "prop": "info|links|langlinks",
        "plnamespace": 0,
        "pllimit": 500,
        "lllang": "en",
        "lllimit": 500,
        "format": "json"
    }

    owns_session = session is None
    if owns_session:
        session = requests.Session()
    try:
        wiki_data = _fetch_all_batches(session, api_endpoint, params_node_title)
        wiki_data += _fetch_all_batches(session, api_endpoint, params_network_title)
    finally:
        # Only close what was opened here; a caller-supplied session stays usable.
        if owns_session:
            session.close()

    # returns panda with all data from network, plus the raw API batches.
    return (_build_network_frame(wiki_data, node_title), wiki_data)


def _fetch_all_batches(session, api_endpoint, params):
    """Run one MediaWiki query to completion and return every response batch."""
    batches = []
    continuation = {}
    while True:
        # Each request is the untouched base query plus the *complete* `continue`
        # block of the previous response; sending only some tokens (or reusing
        # stale ones) makes the API skip or repeat results.
        response = session.get(url=api_endpoint, params={**params, **continuation})
        response.raise_for_status()
        batch = response.json()
        if 'error' in batch:
            raise RuntimeError("MediaWiki API error: " + str(batch['error']))
        batches.append(batch)
        if 'continue' not in batch:
            return batches
        continuation = batch['continue']
        for key, value in continuation.items():
            if key != 'continue':
                print(key + ': ' + str(value))


def _build_network_frame(wiki_data, node_title):
    """Merge the raw API batches into one DataFrame row per existing page."""
    columns = ['title', 'lang', 'pageid', 'uniqueid', 'lastrevid', 'links', 'langlinks', 'ego'] # complete list: ['ns', 'title', 'missing', 'contentmodel', 'pagelanguage', 'pagelanguagehtmlcode', 'pageid', 'lastrevid', 'length', 'links', 'langlinks']

    # Titles that identify the ego page: the requested title with underscores
    # replaced, plus whatever the API reports it was normalized to.
    ego_titles = {node_title.replace('_', ' ')}
    for batch in wiki_data:
        for entry in batch.get('query', {}).get('normalized', []):
            if entry.get('from') == node_title:
                ego_titles.add(entry.get('to'))

    records = {}  # page id (str) -> row dict, in first-seen order
    for batch in wiki_data:
        # A batch without results (e.g. a page with no links) has no 'query' key.
        for node, page in batch.get('query', {}).get('pages', {}).items():
            if int(node) <= 0:  # missing/invalid titles are reported with negative ids
                continue
            record = records.setdefault(node, {'links': [], 'langlinks': []})
            # Continuation batches may omit the `info` fields, so copy only what is present.
            for source_key, column in (('title', 'title'), ('pagelanguage', 'lang'),
                                       ('pageid', 'pageid'), ('lastrevid', 'lastrevid')):
                if source_key in page:
                    record[column] = page[source_key]
            record['links'].extend(link['title'] for link in page.get('links', []))
            record['langlinks'].extend(page.get('langlinks', []))

    for record in records.values():
        if 'lang' in record and 'pageid' in record:
            record['uniqueid'] = record['lang'] + str(record['pageid'])
        if record.get('title') in ego_titles:
            record['ego'] = True  # non-ego rows keep a missing value, as before

    # Build the frame once instead of assigning cell by cell.
    return pd.DataFrame(list(records.values()), index=list(records.keys()),
                        columns=columns, dtype=object)

In [3]:
# Function B: Downloading multiple languages of one topic and saving them to json/panda file. 
def downloadMultiLangWikiNetwork(node_title, original_lang = 'en', additional_langs = ["ar", "ja", "es", "zh", "fr", "ru"]): # or: 'available_langs'
    """Download one topic in several languages and merge it into the saved data file.

    The network of ``node_title`` is downloaded in ``original_lang``. For every
    language link of that ego page whose language is in ``additional_langs``
    the corresponding network is downloaded too. The result is combined with
    the frame stored at ``path + data_file`` (if readable), de-duplicated on
    title/lang/ego with freshly downloaded rows taking precedence, and written
    back to the same file.

    Parameters
    ----------
    node_title : str
        Title of the ego page in ``original_lang``.
    original_lang : str, default 'en'
        Language edition of the first download.
    additional_langs : list of str
        Language codes to download as well. When empty, the languages the page
        is available in are printed and nothing extra is downloaded. The
        default list is never mutated.

    Returns
    -------
    None
    """
    # downloadWikiNetwork returns (DataFrame, raw API batches); only the frame is used here.
    network_data_df = downloadWikiNetwork(node_title=node_title, lang=original_lang)[0]

    # Language links of the ego page; empty when no row was flagged as ego.
    ego_langlinks = network_data_df.loc[network_data_df['ego'] == True, 'langlinks'].tolist()
    available_langs = ego_langlinks[0] if ego_langlinks else []

    if not additional_langs:
        print('The wikipedia page is available in the following languages:')
        print(available_langs)
    else:
        # Collect all frames first and concatenate once.
        frames = [network_data_df]
        for item in available_langs:
            if item['lang'] in additional_langs:
                frames.append(downloadWikiNetwork(node_title = item['*'], lang = item['lang'])[0])
        network_data_df = pd.concat(frames, ignore_index=True, sort=False)

    try:
        network_data_saved = pd.read_json((path + data_file), orient='split')
    except (FileNotFoundError, ValueError):
        # No usable file yet (missing or not parseable): start from the download alone.
        network_data_saved = None
    if network_data_saved is not None:
        network_data_df = pd.concat([network_data_df, network_data_saved], ignore_index=True, sort=False)

    # De-duplicate on title/lang/ego. The ego flag is normalised to a boolean
    # because a JSON round-trip may change how True / missing values are stored,
    # which would make identical rows look different when compared as strings.
    dedup_keys = pd.DataFrame({
        'title': network_data_df['title'].astype(str),
        'lang': network_data_df['lang'].astype(str),
        'ego': network_data_df['ego'] == True,
    })
    network_data_df = network_data_df.loc[~dedup_keys.duplicated(keep='first')].reset_index(drop=True)

    network_data_df.to_json((path + data_file), orient='split')
    print("Download of network and additional languages finished. Returning to main menu...")
    time.sleep(5)

In [6]:
# Download the English network around 'Gender_parity'; the result is the
# (DataFrame, raw API responses) tuple returned by downloadWikiNetwork.
wiki_network = downloadWikiNetwork(node_title='Gender_parity')

Starting download of network Gender_parity in language en.
plcontinue: 358|0|Hassiba_Boulmerka
plcontinue: 358|0|Sétif
plcontinue: 3415|0|Glacial_landform
plcontinue: 3415|0|Sport_in_Bulgaria
plcontinue: 3462|0|Inter_Press_Service
plcontinue: 5478|0|Azerbaijan
plcontinue: 5478|0|Protestant
plcontinue: 5488|0|Hippopotamus
plcontinue: 5593|0|Australia_Group
plcontinue: 5593|0|List_of_earthquakes_in_Cyprus
plcontinue: 8620|0|Eko_Eko_Azarak
plcontinue: 10577|0|Bilateral_trade
plcontinue: 10577|0|Lactose_intolerant
plcontinue: 10577|0|Tampere_University
plcontinue: 10796|0|Postcolonial_feminism
plcontinue: 11107|0|Quebec_City
plcontinue: 11185|0|Domestic_violence
plcontinue: 11185|0|Museum_of_Contemporary_Art,_Los_Angeles
plcontinue: 11257|0|Beatrice_the_Sixteenth
plcontinue: 11257|0|Science_fiction
plcontinue: 11408|0|Feminist_rhetoric
plcontinue: 11408|0|Women's_suffrage_in_New_Zealand
plcontinue: 11587|0|Hélène_Cixous
plcontinue: 11587|0|William_Lane_Craig
plcontinue: 11867|0|Economy_of_

In [18]:
# Inspect the first raw API response: element 1 of the returned tuple is the
# list of response dicts, element 0 of that list is the first batch.
wiki_network[1][0]

{'batchcomplete': '',
 'query': {'normalized': [{'from': 'Gender_parity', 'to': 'Gender parity'}],
  'pages': {'65331440': {'pageid': 65331440,
    'ns': 0,
    'title': 'Gender parity',
    'contentmodel': 'wikitext',
    'pagelanguage': 'en',
    'pagelanguagehtmlcode': 'en',
    'pagelanguagedir': 'ltr',
    'touched': '2021-08-19T02:22:55Z',
    'lastrevid': 1035826258,
    'length': 25457,
    'links': [{'ns': 0, 'title': "African-American women's suffrage movement"},
     {'ns': 0, 'title': 'Africana womanism'},
     {'ns': 0, 'title': 'Algeria'},
     {'ns': 0, 'title': 'Analytical feminism'},
     {'ns': 0, 'title': 'Anarcha-feminism'},
     {'ns': 0, 'title': 'Anti-abortion feminism'},
     {'ns': 0, 'title': 'Antifeminism'},
     {'ns': 0, 'title': 'Asian feminist theology'},
     {'ns': 0, 'title': 'Atheist feminism'},
     {'ns': 0, 'title': 'Bicycling and feminism'},
     {'ns': 0, 'title': 'Black feminism'},
     {'ns': 0, 'title': 'Bolivia'},
     {'ns': 0, 'title': 'Bud

In [19]:
# Element 0 of the downloaded tuple is the per-page DataFrame.
wiki_network_df = wiki_network[0]

# Show the row(s) whose title is exactly 'Gender parity'.
wiki_network_df[wiki_network_df['title'].eq('Gender parity')]

Unnamed: 0,title,lang,pageid,uniqueid,lastrevid,links,langlinks,ego
65331440,Gender parity,en,65331440,en65331440,1035826258,"[African-American women's suffrage movement, A...","[{'lang': 'ar', '*': 'التكافؤ بين الجنسين'}, {...",


In [110]:
# Display the combined multi-language network frame.
# NOTE(review): `network_data_df` is not assigned at notebook level by any
# visible cell - presumably loaded from `path + data_file`; confirm before re-running.
network_data_df

Unnamed: 0,title,lang,pageid,uniqueid,lastrevid,links,langlinks,ego
0,Cgroups,en,28942492,en28942492,1032278529,[Adaptive Domain Environment for Operating Sys...,[],
1,Linux kernel,en,21347315,en21347315,1037151397,"[/sys, 0 A.D. (video game), 64-bit, AArch64, A...",[],
2,PearPC,en,656612,en656612,1019446604,"[3Com, 64-bit, Adaptive Domain Environment for...",[],
3,Containerd,en,64656490,en64656490,972183911,[Cloud Native Computing Foundation],[],
4,GLinux,en,56356171,en56356171,1035653157,"[/e/ (operating system), 4MLinux, ALT Linux, A...",[],
...,...,...,...,...,...,...,...,...
582,Solaris Containers,zh,4242669,zh4242669,63145567,"[Chroot, OpenSolaris, SPARC, Solaris 10, Solar...","[{'lang': 'en', '*': 'Solaris Containers'}, {'...",
583,软件版本周期,zh,1048013,zh1048013,10039555,[軟件版本週期],[],
584,Apache许可证,zh,740492,zh740492,64115378,"[AROS Research Operating System, Abdera, Accum...","[{'lang': 'en', '*': 'Apache License'}, {'lang...",
585,Docker (軟體),zh,6322282,zh6322282,58844869,[Docker],[],


In [111]:
selected_network = 'Kubernetes'

# Language links stored on the English ego page of the selected network.
is_ego = network_data_df['ego'] == True
is_selected_en = (network_data_df['lang'] == 'en') & (network_data_df['title'] == selected_network)
selected_langlinks = network_data_df.loc[is_ego & is_selected_en, 'langlinks'].iloc[0]
# Naming the columns keeps the merge below valid even when the page has no language links.
page_langs = pd.DataFrame(selected_langlinks, columns=['lang', '*']).rename(columns={'*': 'title'})

# Ego pages already saved, in any language.
saved_langs = network_data_df.loc[is_ego, ['lang', 'title']]
# Languages of the selected page for which a network has been saved.
options_langs = pd.merge(saved_langs, page_langs, how='inner', on=['lang', 'title'])

# Iterating a DataFrame yields its column names (strings), so walk the row records instead.
language_options = ["{}: {}".format(row['lang'], row['title']) for row in options_langs.to_dict('records')]
[{'label': a, 'value': a} for a in language_options]

In [5]:
# Scratch check: upper-case only the first character, leaving the rest untouched.
input_var = 'teenNick'
''.join((input_var[0].upper(), input_var[1:]))

'TeenNick'

In [112]:
# Inspect the inner join of saved ego pages and the selected page's language links.
options_langs

Unnamed: 0,lang,title
0,ar,كوبيرنيتيس
1,es,Kubernetes
2,fr,Kubernetes
3,ja,Kubernetes
4,ru,Kubernetes
5,zh,Kubernetes


In [102]:
# Inner-join saved ego pages with the selected page's language links on
# (lang, title) and turn each resulting row into a dict.
options_lang = (
    saved_langs
    .merge(page_langs, how='inner', on=['lang', 'title'])
    .to_dict(orient='records')
)

In [113]:
# Display the language options as a list of {'lang', 'title'} records.
# NOTE(review): the previous expression here was not valid Python; it looks like an
# unfinished attempt to append one more record - confirm what was intended.
options_lang

[{'lang': 'ar', 'title': 'كوبيرنيتيس'},
 {'lang': 'es', 'title': 'Kubernetes'},
 {'lang': 'fr', 'title': 'Kubernetes'},
 {'lang': 'ja', 'title': 'Kubernetes'},
 {'lang': 'ru', 'title': 'Kubernetes'},
 {'lang': 'zh', 'title': 'Kubernetes'}]

In [103]:
# Build "<lang>: <title>" labels from the first two values of every record.
language_options = ["{}: {}".format(*list(a.values())[:2]) for a in options_lang]

In [104]:
# One {'label', 'value'} dict per option, with the same text used for both.
[dict(label=option, value=option) for option in language_options]

[{'label': 'ar: كوبيرنيتيس', 'value': 'ar: كوبيرنيتيس'},
 {'label': 'es: Kubernetes', 'value': 'es: Kubernetes'},
 {'label': 'fr: Kubernetes', 'value': 'fr: Kubernetes'},
 {'label': 'ja: Kubernetes', 'value': 'ja: Kubernetes'},
 {'label': 'ru: Kubernetes', 'value': 'ru: Kubernetes'},
 {'label': 'zh: Kubernetes', 'value': 'zh: Kubernetes'}]

In [5]:

# Topics tried: Kubernetes, Vaccine, Oxford. TODO: 'Verkhovna_Rada' seems to trigger a bug (see also 'Consumer_protection') - fix later.
# wiki_data_kubs = downloadWikiNetwork('Kubernetes')
# wiki_data_Verkhovna_Rada = downloadWikiNetwork('Verkhovna_Rada')

In [105]:
# Titles of all saved ego pages in English.
network_data_df.loc[
    (network_data_df['ego'] == True) & (network_data_df['lang'] == 'en'),
    'title',
].tolist()

['Kubernetes', 'Kubernetes', 'Snowmelt', 'Terrorism', 'Secularism', 'Vaccine']

In [25]:
# all_nodes

In [26]:
# Print every raw API batch in which page 2646623 carries a 'links' list.
# `wiki_data` is a list of response dicts, so iterate the list itself rather
# than indexing it with a string key.
# NOTE(review): `wiki_data` is not assigned at notebook level in the visible
# cells - presumably the raw-response list returned by downloadWikiNetwork; confirm.
for item in wiki_data:
    page = item.get('query', {}).get('pages', {}).get('2646623', {})
    if 'links' in page:
        print(item)