In [3]:
# Setup packages and file paths 
import requests
import pandas as pd
import time 

path = "/home/teijehidde/Documents/Git Blog and Coding/data_dump/"
data_file = "data_new4.json"

In [4]:
def downloadWikiNetwork(node_title, lang = "en"):
    api_endpoint = "https://" + lang + ".wikipedia.org/w/api.php" # fr.wikipedia.org; https://en.wikipedia.org
    wiki_data = []
    print("Starting download of network " + node_title + " in language " + lang + ".")
    
    # 1: download data on the node_title wikipage.
    S = requests.Session()
    params_node_title = {
        "action": "query",
        "titles": node_title,
        "prop": "info|links|langlinks", 
        "plnamespace": 0, 
        "pllimit": 500,
        "lllimit": 500, 
        "format": "json"
    }
    response = S.get(url=api_endpoint, params=params_node_title)
    wiki_data.append(response.json())
    
    while 'continue' in wiki_data[-1].keys():
        params_cont = params_node_title
        if 'plcontinue' in wiki_data[-1]['continue']:
            params_cont["plcontinue"] = wiki_data[-1]['continue']['plcontinue'] 
            print('plcontinue: ' + params_cont["plcontinue"])

        elif 'llcontinue' in wiki_data[-1]['continue']:
            params_cont["llcontinue"] = wiki_data[-1]['continue']['llcontinue'] 
            print('llcontinue: ' + params_cont["llcontinue"])

        response = S.get(url=api_endpoint, params=params_cont)
        wiki_data.append(response.json())

    # 2: download data on the links of node_title wikipage.
    S = requests.Session()
    params_network_title = {
        "action": "query",
        "generator": "links",
        "titles": node_title,
        "gplnamespace": 0, 
        "gpllimit": 500, 
        "plnamespace": 0,
        "pllimit": 500, 
        "prop": "info|links|langlinks", 
        "format": "json"
    }
    response = S.get(url=api_endpoint, params=params_network_title)
    wiki_data.append(response.json())

    while 'continue' in wiki_data[-1].keys():
        params_cont = params_network_title
        if 'plcontinue' in wiki_data[-1]['continue']:
            params_cont["plcontinue"] = wiki_data[-1]['continue']['plcontinue'] 
            print('plcontinue: ' + params_cont["plcontinue"])

        elif 'llcontinue' in wiki_data[-1]['continue']:
            params_cont["llcontinue"] = wiki_data[-1]['continue']['llcontinue'] 
            print('llcontinue: ' + params_cont["llcontinue"])

        elif 'gplcontinue' in wiki_data[-1]['continue']: 
            params_cont["gplcontinue"] = wiki_data[-1]['continue']['gplcontinue']
            print('gplcontinue: ' + params_cont["gplcontinue"])

        response = S.get(url=api_endpoint, params = params_cont)
        wiki_data.append(response.json())

    return wiki_data

In [49]:
def tempfunction(wiki_data):     
    # 3: creating list of available nodes in wiki_data. 
    node_title = 'Royston'
    all_nodes = []

    for item in wiki_data:
        all_nodes = all_nodes + list(item['query']['pages'].keys())
    all_nodes = list(set(all_nodes))
    all_nodes = [int(i) for i in all_nodes if int(i) > 0]
    all_nodes = [str(i) for i in all_nodes]
    
    network_data_df = pd.DataFrame(
        columns = ['title', 'lang', 'pageid', 'uniqueid', 'lastrevid', 'links', 'langlinks', 'ego'], # complete list: ['ns', 'title', 'missing', 'contentmodel', 'pagelanguage', 'pagelanguagehtmlcode', 'pageid', 'lastrevid', 'length', 'links', 'langlinks']
        index = all_nodes)

    # 4: Using all_nodes to go through list of raw data from API. 
    for node in all_nodes:
        network_data_df.at[node,'links'] = []
        for item in wiki_data:
            if node in item['query']['pages'].keys(): # possibility:  df_new_wiki_data.update(item) #, errors = 'raise') 
                network_data_df.at[node, 'title'] = item['query']['pages'][node]['title']
                network_data_df.at[node,'lang'] = item['query']['pages'][node]['pagelanguage']
                network_data_df.at[node,'pageid'] = item['query']['pages'][node]['pageid']
                network_data_df.at[node,'uniqueid'] = network_data_df.at[node,'lang'] + str(network_data_df.at[node,'pageid'])
                network_data_df.at[node,'lastrevid'] = item['query']['pages'][node]['lastrevid']
                if network_data_df.at[node,'title'] == node_title: network_data_df.at[node,'ego'] = True

                if 'links' in item['query']['pages'][node].keys():
                    for link in item['query']['pages'][node]['links']:
                        network_data_df.at[node,'links'].append(link['title'])

                if 'langlinks' in item['query']['pages'][node].keys(): 
                    network_data_df.at[node,'langlinks'] = item['query']['pages'][node]['langlinks']

    # returns panda with all data from network. 
    return network_data_df


In [50]:
# Function B: Downloading multiple languages of one topic and saving them to json/panda file. 
def downloadMultiLangWikiNetwork(node_title, original_lang = 'en', additional_langs = ["ar", "de", "fr", "nl"]): # or: 'available_langs'
    network_data_df = downloadWikiNetwork(node_title=node_title, lang=original_lang)
    available_langs = network_data_df.loc[network_data_df['langlinks'].notnull()]['langlinks'].values.tolist()[0]

    if additional_langs == []:
        print('The wikipedia page is available in the following languages:')         
        print(available_langs)
    
    else:
        for item in available_langs: 
            if item['lang'] in additional_langs:
                network_data_df_additional = downloadWikiNetwork(node_title = item['*'], lang = item['lang'])
                network_data_df = pd.concat([network_data_df, network_data_df_additional], ignore_index=True, sort=False)
                
    try: 
        network_data_saved = pd.read_json((path + data_file), orient='split')
    except:
        network_data_saved = None
    network_data_df = pd.concat([network_data_df, network_data_saved], ignore_index=True, sort=False)
    network_data_df = network_data_df.loc[network_data_df.astype(str).drop_duplicates(subset=['title', 'lang', 'pageid', 'ego'], keep = 'first').index].reset_index(drop=True)
    network_data_df.to_json((path + data_file), orient='split')
    print("Download of network and additional languages finished. Returning to main menu...") 
    time.sleep(5) 

In [51]:
def save_data(network_data_df): 
    try: 
        network_data_saved = pd.read_json((path + data_file), orient='split')
    except:
        network_data_saved = None
    network_data_df = pd.concat([network_data_df, network_data_saved], ignore_index=True, sort=False)
    network_data_df = network_data_df.loc[network_data_df.astype(str).drop_duplicates(subset=['title', 'lang', 'pageid', 'ego'], keep = 'first').index].reset_index(drop=True)
    network_data_df.to_json((path + data_file), orient='split')
    print("Download of network and additional languages finished. Returning to main menu...") 
    time.sleep(5) 

In [25]:
data_royston = downloadWikiNetwork('Royston')

Starting download of network Royston in language en.
plcontinue: 196110|0|Bibcode_(identifier)
plcontinue: 279710|0|Royston_Crow_(newspaper)
plcontinue: 2661324|0|Corkerhill
plcontinue: 7819452|0|Ankle
plcontinue: 8569916|0|Filipinos
plcontinue: 8569916|0|Phrasal_verb
plcontinue: 18816811|0|Gilead
llcontinue: 92679|cy
llcontinue: 92679|ga
llcontinue: 92679|la
llcontinue: 92679|pnb
llcontinue: 92679|sv
llcontinue: 92679|zh
llcontinue: 110106|es
llcontinue: 110106|sh
llcontinue: 196110|an
llcontinue: 196110|ca
llcontinue: 196110|eo
llcontinue: 196110|gl
llcontinue: 196110|ja
llcontinue: 196110|mk
llcontinue: 196110|pl
llcontinue: 196110|se
llcontinue: 196110|udm
llcontinue: 279710|azb
llcontinue: 279710|nl
llcontinue: 694254|es
llcontinue: 3595319|arz
llcontinue: 7819452|cs
llcontinue: 7819452|hy
llcontinue: 7819452|pl
llcontinue: 7819452|zh
llcontinue: 8569916|ang
llcontinue: 8569916|az
llcontinue: 8569916|bh
llcontinue: 8569916|ca
llcontinue: 8569916|cs
llcontinue: 8569916|dty
llcontin

In [52]:
temp_data = tempfunction(data_royston)

In [47]:
temp_data

Unnamed: 0,title,lang,pageid,uniqueid,lastrevid,links,langlinks,ego
63483274,Roy Goodacre,en,63483274,en63483274,1033217167,"[Aberystwyth University, Analyst (journal), Bi...",,False
18816811,Ivor Royston,en,18816811,en18816811,1035773812,"[Amylin, Asia, Atlanta, Georgia, Bethesda, Mar...","[{'lang': 'azb', '*': 'ایوور رویستون'}]",False
34198136,"Royston, Texas",en,34198136,en34198136,1029889303,"[City, County seat, Fisher County, Texas, Geog...",,False
279710,"Royston, Hertfordshire",en,279710,en279710,1030404019,"[A10 road (Great Britain), A1198 road, A1 road...","[{'lang': 'nl', '*': 'Royston (Hertfordshire)'...",False
2661324,"Royston, Glasgow",en,2661324,en2661324,1030864266,"[Ancient Order of Hibernians, Anderston, Annie...","[{'lang': 'eu', '*': 'Royston (Glasgow)'}, {'l...",False
33405973,Mac Evans,en,33405973,en33405973,994453478,"[Adelaide, Association football in Western Aus...","[{'lang': 'ar', '*': 'ماك إيفانز'}, {'lang': '...",False
7819452,Royston Drenthe,en,7819452,en7819452,1032137870,"[2005–06 Eredivisie, 2005–06 Feyenoord season,...","[{'lang': 'zh', '*': '罗伊斯顿·德伦特'}, {'lang': 'zh...",False
92679,South Yorkshire,en,92679,en92679,1033205746,"[1931 Dogger Bank earthquake, 1970 United King...","[{'lang': 'zh', '*': '南约克郡'}, {'lang': 'zh-min...",False
8569916,English language,en,8569916,en8569916,1038381785,"[11th century, 15th century, 17th century, 5th...","[{'lang': 'zh-yue', '*': '英文'}, {'lang': 'zu',...",False
2488140,Craigroyston F.C.,en,2488140,en2488140,988076292,"[2008–09 East of Scotland Football League, 200...",,False


In [53]:
temp_data.loc[temp_data['title'] == 'Royston']

Unnamed: 0,title,lang,pageid,uniqueid,lastrevid,links,langlinks,ego
694263,Royston,en,694263,en694263,1002398725,"[Craigroyston Community High School, Craigroys...","[{'lang': 'ceb', '*': 'Royston'}, {'lang': 'de...",False


In [5]:
# Kubernetes, Vaccine, Oxford, [Verkhovna_Rada this one seems to have a bug - fix later; see also 'Consumer_protection'] 
# wiki_data_kubs = downloadWikiNetwork('Kubernetes')
# wiki_data_Verkhovna_Rada = downloadWikiNetwork('Verkhovna_Rada')

In [32]:
len(network_data_df.loc[network_data_df['title'] == 'Vaccine']['links'].values.tolist()[0])

609

In [25]:
# all_nodes

In [26]:
for item in wiki_data['query']['pages']: 
    if '2646623' in item['query']['pages'].keys():
       if 'links' in item['query']['pages']['2646623'].keys(): 
           print(item)

TypeError: list indices must be integers or slices, not str