In [1]:
# Setup packages and file paths 
import requests
import pandas as pd

path = "/home/teijehidde/Documents/Git Blog and Coding/data_dump/"
data_file = "data_new2.json"

In [2]:
def downloadWikiNetwork(node_title, lang = "en"): # , additional_langs = ["ar" "de", "fr", "nl"]): 

    api_endpoint = "https://" + lang + ".wikipedia.org/w/api.php" # fr.wikipedia.org; https://en.wikipedia.org
    wiki_data = []
    print("Starting download of network " + node_title + " in language " + lang + ".")
    
    # 1: download data on the node_title wikipage.
    S = requests.Session()
    params_node_title = {
        "action": "query",
        "titles": node_title,
        "prop": "info|links|langlinks", 
        "plnamespace": 0, 
        "pllimit": 500,
        "lllimit": 500, 
        "format": "json"
    }
    response = S.get(url=api_endpoint, params=params_node_title)
    wiki_data.append(response.json())
    
    while 'continue' in wiki_data[-1].keys():
        params_cont = params_node_title
        if 'plcontinue' in wiki_data[-1]['continue']:
            params_cont["plcontinue"] = wiki_data[-1]['continue']['plcontinue'] 
            print('plcontinue: ' + params_cont["plcontinue"])

        elif 'llcontinue' in wiki_data[-1]['continue']:
            params_cont["llcontinue"] = wiki_data[-1]['continue']['llcontinue'] 
            print('llcontinue: ' + params_cont["llcontinue"])

        response = S.get(url=api_endpoint, params=params_cont)
        wiki_data.append(response.json())

    # 2: download data on the links of node_title wikipage.
    S = requests.Session()
    params_network_title = {
        "action": "query",
        "generator": "links",
        "titles": node_title,
        "gplnamespace": 0, 
        "gpllimit": 500, 
        "plnamespace": 0,
        "pllimit": 500, 
        "prop": "info|links", 
        "format": "json"
    }
    response = S.get(url=api_endpoint, params=params_network_title)
    wiki_data.append(response.json())

    while 'continue' in wiki_data[-1].keys():
        params_cont = params_network_title
        if 'plcontinue' in wiki_data[-1]['continue']:
            params_cont["plcontinue"] = wiki_data[-1]['continue']['plcontinue'] 
            print('plcontinue: ' + params_cont["plcontinue"])

        elif 'gplcontinue' in wiki_data[-1]['continue']: 
            params_cont["gplcontinue"] = wiki_data[-1]['continue']['gplcontinue']
            print('gplcontinue: ' + params_cont["gplcontinue"])

        response = S.get(url=api_endpoint, params = params_cont)
        wiki_data.append(response.json())

    # 3: creating list of available nodes in wiki_data. 
    all_nodes = []

    for item in wiki_data:
        all_nodes = all_nodes + list(item['query']['pages'].keys())
    all_nodes = list(set(all_nodes))
    all_nodes = [int(i) for i in all_nodes if int(i) > 0]
    all_nodes = [str(i) for i in all_nodes]
    
    network_data_df = pd.DataFrame(
        columns = ['title', 'lang', 'pageid', 'uniqueid', 'lastrevid', 'links', 'langlinks'], # complete list: ['ns', 'title', 'missing', 'contentmodel', 'pagelanguage', 'pagelanguagehtmlcode', 'pageid', 'lastrevid', 'length', 'links', 'langlinks']
        index = all_nodes)

    # 4: Using all_nodes to go through list of raw data from API. 
    for node in all_nodes:
                
        for item in wiki_data:
            if node in item['query']['pages'].keys(): # possibility:  df_new_wiki_data.update(item) #, errors = 'raise') 
                network_data_df.at[node, 'title'] = item['query']['pages'][node]['title']
                network_data_df.at[node,'lang'] = item['query']['pages'][node]['pagelanguage']
                network_data_df.at[node,'pageid'] = item['query']['pages'][node]['pageid']
                network_data_df.at[node,'uniqueid'] = network_data_df.at[node,'lang'] + str(network_data_df.at[node,'pageid'])
                network_data_df.at[node,'lastrevid'] = item['query']['pages'][node]['lastrevid']

                if 'links' in item['query']['pages'][node].keys():
                    links_temp = []
                    for link in item['query']['pages'][node]['links']:
                        links_temp.append(link['title'])
                    network_data_df.at[node,'links'] = links_temp

                if 'langlinks' in item['query']['pages'][node].keys():
                    network_data_df.at[node,'langlinks'] = item['query']['pages'][node]['langlinks']

    # returns panda with all data from network. 
    return network_data_df


In [7]:
# Function: complete download of single title in multiple languages. - saves data in json file. 
def downloadMultiLangWikiNetwork(node_title, original_lang = 'en', additional_langs = ["ar", "de", "fr", "nl"]): # or: 'available_langs'
    network_data_df = downloadWikiNetwork(node_title=node_title, lang=original_lang)
    try: 
        available_langs = network_data_df.loc[network_data_df['langlinks'].notnull()]['langlinks'].values.tolist()[0]
    except: 
        print(network_data_df.loc[network_data_df['title'] == node_title].loc[network_data_df['lang'] == original_lang]['langlinks'].values.tolist())
        pass

    if additional_langs == []:
        print('The wikipedia page is available in the following languages:')         
        print(available_langs)
    
    else:
        for item in available_langs: 
            if item['lang'] in additional_langs:
                network_data_df_additional = downloadWikiNetwork(node_title = item['*'], lang = item['lang'])
                network_data_df = pd.concat([network_data_df, network_data_df_additional], ignore_index=True, sort=False)
                
    network_data_saved = pd.read_json((path + data_file), orient='split')
    network_data_df = pd.concat([network_data_df, network_data_saved], ignore_index=True, sort=False)
    network_data_df = network_data_df.loc[network_data_df.astype(str).drop_duplicates(keep = 'last').index].reset_index(drop=True)
    network_data_df.to_json((path + data_file), orient='split')
    print("Download of network and additional languages finished.") 


In [4]:
def overviewWikiNetworks(): 
    network_data_df = pd.read_json((path + data_file), orient='split')

    available_wiki_networks = network_data_df.loc[network_data_df['langlinks'].notnull()]
    available_wiki_networks['number_of_links'] = available_wiki_networks['links'].apply(lambda x: len(x))

    available_topics = network_data_df.loc[network_data_df['langlinks'].notnull()].loc[network_data_df['lang'] == 'en'][['title','langlinks']]
    
    # available_langs  = set(network_data_df['lang'])
    # overview_df = pd.DataFrame(
    # columns = available_langs, 
    # index = available_topics)

    # available_wiki_networks[['title', 'lang', 'number_of_links']],.pivot(index = 'title', columns= 'lang', values = 'number_of_links')

    return available_topics
    

In [5]:
# Kubernetes, Vaccine, Oxford, [Verkhovna_Rada this one seems to have a bug - fix later; see also 'Consumer_protection'] 
# wiki_data_kubs = downloadWikiNetwork('Kubernetes')
# wiki_data_Verkhovna_Rada = downloadWikiNetwork('Verkhovna_Rada')

In [8]:
downloadMultiLangWikiNetwork('Verkhovna_Rada')

Starting download of network Verkhovna_Rada in language en.
plcontinue: 844993|0|Slovenian_Parliament
plcontinue: 1216|0|Cradle_of_civilization
plcontinue: 1216|0|Jerusalem
plcontinue: 1216|0|Peace_and_Friendship_Stadium
plcontinue: 1216|0|Ünye
plcontinue: 3708|0|Flandersnews.be
plcontinue: 3708|0|Podgorica
plcontinue: 3768|0|Cortes_Generales
plcontinue: 5166|0|Australian_rules_football
plcontinue: 5166|0|Glostrup_Municipality
plcontinue: 5166|0|Radisson_Blu_Royal_Hotel
plcontinue: 9581|0|Committee_on_Legal_Affairs
plcontinue: 9581|0|Pan-African_Parliament
plcontinue: 12521|0|Firmenich
plcontinue: 12521|0|Secondary_sector_of_the_economy
plcontinue: 13964|0|Impeachment
plcontinue: 13964|0|Separation_of_powers
plcontinue: 19004|0|Humid_continental_climate
plcontinue: 19004|0|Reykjavík
plcontinue: 23298|0|Elections_in_Moldova
plcontinue: 24406|0|Casimir_IV_the_Jagiellonian
plcontinue: 24406|0|Telangana_Legislative_Assembly
plcontinue: 25391|0|European_Russia
plcontinue: 25391|0|Little_Big

UnboundLocalError: local variable 'available_langs' referenced before assignment

In [27]:
test = 'Here a test string'


In [28]:
test.lower()

'here a test string'

In [494]:
network_data_df = pd.read_json((path + data_file), orient='split')

available_topics = network_data_df.loc[network_data_df['langlinks'].notnull()].loc[network_data_df['lang'] == 'en']['title'].values.tolist()
available_langs  = list(set(network_data_df['lang']))


overview_df = pd.DataFrame(
columns = available_langs, 
index = available_topics)

for topic in available_topics: 
    for lang in available_langs: 
        try: 
            overview_df.at[topic,lang] = available_wiki_networks.loc[available_wiki_networks['title'] == topic].loc[available_wiki_networks['lang'] == lang]['number_of_links'].astype(int)
        except: 
            overview_df.at[topic,lang] = 'NA'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  available_wiki_networks['number_of_links'] = available_wiki_networks['links'].apply(lambda x: len(x))


In [496]:
available_wiki_networks

Unnamed: 0,title,lang,pageid,uniqueid,lastrevid,links,langlinks,number_of_links
278,Vaccine,en,32653,en32653,1037068962,"[Serum Institute of India, Sex steroid, Sheep ...","[{'lang': 'af', '*': 'Entstof'}, {'lang': 'ak'...",109
634,لقاح,ar,597715,ar597715,54578135,"[Ad26.COV2.S, Ad5-nCOV, BBV152, BRW (magazine)...","[{'lang': 'af', '*': 'Entstof'}, {'lang': 'ak'...",335
1021,Impfstoff,de,31096,de31096,214543850,"[AIDS, AZD1222, Ad26.COV2.S, Ad26.Mos.HIV, Ade...","[{'lang': 'af', '*': 'Entstof'}, {'lang': 'ak'...",383
1363,Vaccin,fr,58748,fr58748,185191650,"[Sécurisexe, Sécurité alimentaire, Sécurité de...","[{'lang': 'af', '*': 'Entstof'}, {'lang': 'ak'...",133
1947,Vaccin,nl,8836,nl8836,59682468,"[1768, 18e eeuw, 430 v.Chr., Actieve immunisat...","[{'lang': 'af', '*': 'Entstof'}, {'lang': 'ak'...",113
2224,Secularism,en,27113,en27113,1035172106,"[969 Movement, A Letter Concerning Toleration,...","[{'lang': 'af', '*': 'Sekularisme'}, {'lang': ...",333
2667,علمانية,ar,3013,ar3013,54577832,"[1632, 1704, 2007, Brazilian Association of At...","[{'lang': 'af', '*': 'Sekularisme'}, {'lang': ...",471
2730,Säkularismus,de,10318,de10318,207002765,"[Antiklerikalismus, Christentum, Edward Royle,...","[{'lang': 'af', '*': 'Sekularisme'}, {'lang': ...",35
3328,Laïcité,fr,8534,fr8534,185010629,"[Ordre de préséance, Orthodoxes vieux-croyants...","[{'lang': 'af', '*': 'Sekularisme'}, {'lang': ...",214
3460,Secularisme,nl,102067,nl102067,59677268,"[Antisacerdotalisme, Christendom, De Verlichti...","[{'lang': 'af', '*': 'Sekularisme'}, {'lang': ...",30


In [482]:
overview_df.at['Vaccine','de'] = 'test'

In [483]:
overview_df

Unnamed: 0,fr,en,de,nl,ar,"(Vaccine, de)"
Vaccine,,,test,,,test
Secularism,,,,,,test
Kubernetes,,,,,,test


In [464]:
available_wiki_networks

Unnamed: 0,title,lang,pageid,uniqueid,lastrevid,links,langlinks,number_of_links
278,Vaccine,en,32653,en32653,1037068962,"[Serum Institute of India, Sex steroid, Sheep ...","[{'lang': 'af', '*': 'Entstof'}, {'lang': 'ak'...",109
634,لقاح,ar,597715,ar597715,54578135,"[Ad26.COV2.S, Ad5-nCOV, BBV152, BRW (magazine)...","[{'lang': 'af', '*': 'Entstof'}, {'lang': 'ak'...",335
1021,Impfstoff,de,31096,de31096,214543850,"[AIDS, AZD1222, Ad26.COV2.S, Ad26.Mos.HIV, Ade...","[{'lang': 'af', '*': 'Entstof'}, {'lang': 'ak'...",383
1363,Vaccin,fr,58748,fr58748,185191650,"[Sécurisexe, Sécurité alimentaire, Sécurité de...","[{'lang': 'af', '*': 'Entstof'}, {'lang': 'ak'...",133
1947,Vaccin,nl,8836,nl8836,59682468,"[1768, 18e eeuw, 430 v.Chr., Actieve immunisat...","[{'lang': 'af', '*': 'Entstof'}, {'lang': 'ak'...",113
2224,Secularism,en,27113,en27113,1035172106,"[969 Movement, A Letter Concerning Toleration,...","[{'lang': 'af', '*': 'Sekularisme'}, {'lang': ...",333
2667,علمانية,ar,3013,ar3013,54577832,"[1632, 1704, 2007, Brazilian Association of At...","[{'lang': 'af', '*': 'Sekularisme'}, {'lang': ...",471
2730,Säkularismus,de,10318,de10318,207002765,"[Antiklerikalismus, Christentum, Edward Royle,...","[{'lang': 'af', '*': 'Sekularisme'}, {'lang': ...",35
3328,Laïcité,fr,8534,fr8534,185010629,"[Ordre de préséance, Orthodoxes vieux-croyants...","[{'lang': 'af', '*': 'Sekularisme'}, {'lang': ...",214
3460,Secularisme,nl,102067,nl102067,59677268,"[Antisacerdotalisme, Christendom, De Verlichti...","[{'lang': 'af', '*': 'Sekularisme'}, {'lang': ...",30


In [446]:
for topic in available_topics.values.tolist():
    langs_topic_pd = network_data_df.loc[network_data_df['title'] == topic].loc[network_data_df['lang'] == 'en']['langlinks'].values.tolist()[0]

    try: 
        for lang in available_langs:
            localized_title = [i['*'] for i in langs_topic_pd if i['lang'] == lang][0]
            overview_df[topic,lang] = len(network_data_df.loc[network_data_df['title'] == localized_title].loc[network_data_df['lang'] == lang]['links'].values.tolist()[0]) 
    except:
        overview_df[topic,lang] = 'nan'

    # langs_topic_pd

    #  try: 
    #        overview_df.at[topic,lang] = len(network_data_df.loc[network_data_df['title'] == langs_pd.loc[langs_pd['lang'] == lang]['*']].loc[network_data_df['lang'] == lang]['links'].values.tolist()[0])
    #    except:
    #        overview_df.at[topic,lang] = 'None'

In [442]:
available_topics.values.tolist()


['Vaccine', 'Secularism', 'Kubernetes']

In [325]:
langs_pd.loc[langs_pd['lang'] == 'fr']['*']


27    Laïcité
Name: *, dtype: object

In [15]:
# test.iloc[10:80,]