#### 1. Importing Libraries

In [1]:
import json
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api
import json
import time

#### 2. Reading Person List

In [None]:
# Occupation whose data files are processed; it is embedded in every data path below.
occupation = "politician"

# Wikidata JSON dump parts 1..24 each hold a list of records; concatenate them in order.
wikidata_dump_prefix = './Data/Wikidata_JSON_'+occupation+'/en_'+occupation+'_dump_part'
person_list = []
for part_no in range(1, 25):
    with open(wikidata_dump_prefix+str(part_no)+'.json') as dump_file:
        person_list.extend(json.load(dump_file))

# Index the records by their 'id' field (a later duplicate id overwrites an earlier one).
person_dict = {record['id']: record for record in person_list}

# Article parts 1..22 each hold a dict; merge them into one.
# Later cells look these keys up in person_dict, so they are presumably the same ids.
wikipage_prefix = './Data/English_Wikipages_'+occupation+'/en_articles_part'
wikipage_person_dict = {}
for part_no in range(1, 23):
    with open(wikipage_prefix+str(part_no)+'.json') as articles_file:
        wikipage_person_dict.update(json.load(articles_file))

In [4]:
# Sanity check: number of entries merged into wikipage_person_dict from the article part files.
len(wikipage_person_dict)

3718

#### 3. Retrieving Labels for Property-Value Pairs for Each Person

In [3]:
# Build, for every person in wikipage_person_dict, a mapping
#     {property label: [value label or raw value, ...]}
# from the person's Wikidata claims, and dump the results to JSON in batches.
#
# Module-level state produced by this cell:
#   labelled_person_list - persons labelled since the last checkpoint
#   label_list           - cache of entity id (property or value) -> English label
#   non_labelled_props   - property ids whose label lookup failed (seeded with 'P727')
labelled_person_list = {}
label_list = {}
non_labelled_props = ['P727']

# Number of persons written to each checkpoint file.
PERSONS_PER_CHECKPOINT = 20


def _save_checkpoint(partition_num, persons):
    """Dump `persons` to the numbered part file for `occupation` and report elapsed time.

    The directory is derived from `occupation` so that it matches the directory the
    later cells read from and write to (it was previously hard-coded to
    'businessperson').
    """
    path = ('./Data/English_Labelled_Wikidata_' + occupation
            + '/en_labelled_part' + str(partition_num) + '.json')
    with open(path, 'w') as fout:
        json.dump(persons, fout)

    print("Checkpoint %d reached, JSON dumps saved |" % partition_num, end = ' ')
    print("Time Elapsed:", end = ' ')
    print(time.time()-t0)


t0 = time.time()

index = 0

count_persons = 0

for person_id in list(wikipage_person_dict.keys()):

    person = person_dict[person_id]

    person_labelled_prop_val = {}
    list_prop_value = person['claims']

    # Skip properties already known not to have a usable label.
    candidate_properties = [prop for prop in list_prop_value.keys()
                            if prop not in non_labelled_props]

    # Resolve a label for every remaining property. The surviving properties are
    # collected into a NEW list: removing items from the list being iterated
    # (as the previous aliasing "copy" did) silently skips the following property.
    list_properties = []
    for prop in candidate_properties:
        if prop not in label_list:
            try:
                prop_details = get_entity_dict_from_api(prop)
                label_list[prop] = prop_details['labels']['en']['value']
            except Exception:
                # Best effort: remember the property as unlabelled and move on.
                # `Exception` (not a bare except) so Ctrl-C still interrupts the run.
                non_labelled_props.append(prop)
                continue
        list_properties.append(prop)

    # For all values per property, a label is extracted for each value ID [Q##### format]
    for prop in list_properties:

        labelled_values = []

        for value in list_prop_value[prop]:

            mainsnak = value['mainsnak']
            # Only snaks of type 'value' are read; any other snaktype is skipped.
            if mainsnak['snaktype'] != 'value':
                continue

            raw_value = mainsnak['datavalue']['value']

            # Plain string values are stored as they are.
            if isinstance(raw_value, str):
                labelled_values.append(raw_value)

            elif isinstance(raw_value, dict):

                if 'id' in raw_value:
                    # The value refers to another entity: store its English label.
                    value_id = raw_value['id']

                    # Labels are cached in label_list to avoid repeated API calls.
                    if value_id not in label_list:
                        value_details = get_entity_dict_from_api(value_id)
                        if 'labels' in value_details and 'en' in value_details['labels']:
                            label_list[value_id] = value_details['labels']['en']['value']

                    # Entities without an English label are dropped.
                    if value_id in label_list:
                        labelled_values.append(label_list[value_id])

                else:
                    # Dict values without an 'id' key are stored unchanged.
                    labelled_values.append(raw_value)

        if labelled_values:
            person_labelled_prop_val[label_list[prop]] = labelled_values

    labelled_person_list[person['id']] = person_labelled_prop_val

    count_persons = count_persons + 1

    if count_persons % PERSONS_PER_CHECKPOINT == 0:
        _save_checkpoint(count_persons // PERSONS_PER_CHECKPOINT, labelled_person_list)
        labelled_person_list = {}

# Persons processed after the last full batch would otherwise never be written.
# They go to the next part number; labelled_person_list is left populated.
if labelled_person_list:
    _save_checkpoint(count_persons // PERSONS_PER_CHECKPOINT + 1, labelled_person_list)

t1 = time.time()
total = t1-t0

Wikidata redirect detected.  Input entity id=Q13498051. Returned entity id=Q12794688.


Checkpoint 1 reached, JSON dumps saved | Time Elapsed: 307.7796678543091
Checkpoint 2 reached, JSON dumps saved | Time Elapsed: 466.7566177845001
Checkpoint 3 reached, JSON dumps saved | Time Elapsed: 582.306314945221
Checkpoint 4 reached, JSON dumps saved | Time Elapsed: 679.1318182945251
Checkpoint 5 reached, JSON dumps saved | Time Elapsed: 795.7746908664703
Checkpoint 6 reached, JSON dumps saved | Time Elapsed: 888.7621989250183
Checkpoint 7 reached, JSON dumps saved | Time Elapsed: 952.2535276412964
Checkpoint 8 reached, JSON dumps saved | Time Elapsed: 1060.1986389160156
Checkpoint 9 reached, JSON dumps saved | Time Elapsed: 1128.6738965511322
Checkpoint 10 reached, JSON dumps saved | Time Elapsed: 1192.9545223712921


Wikidata redirect detected.  Input entity id=Q16881374. Returned entity id=Q10352355.


Checkpoint 11 reached, JSON dumps saved | Time Elapsed: 1245.737945318222
Checkpoint 12 reached, JSON dumps saved | Time Elapsed: 1295.4024240970612
Checkpoint 13 reached, JSON dumps saved | Time Elapsed: 1327.6330726146698
Checkpoint 14 reached, JSON dumps saved | Time Elapsed: 1374.6082060337067
Checkpoint 15 reached, JSON dumps saved | Time Elapsed: 1407.494785785675
Checkpoint 16 reached, JSON dumps saved | Time Elapsed: 1428.2747676372528
Checkpoint 17 reached, JSON dumps saved | Time Elapsed: 1461.8852169513702
Checkpoint 18 reached, JSON dumps saved | Time Elapsed: 1495.0765476226807
Checkpoint 19 reached, JSON dumps saved | Time Elapsed: 1522.8190710544586
Checkpoint 20 reached, JSON dumps saved | Time Elapsed: 1552.9860968589783
Checkpoint 21 reached, JSON dumps saved | Time Elapsed: 1589.3462347984314
Checkpoint 22 reached, JSON dumps saved | Time Elapsed: 1643.4500603675842
Checkpoint 23 reached, JSON dumps saved | Time Elapsed: 1682.1215651035309
Checkpoint 24 reached, JSON

Wikidata redirect detected.  Input entity id=Q21430730. Returned entity id=Q12669358.


Checkpoint 41 reached, JSON dumps saved | Time Elapsed: 2540.020952939987
Checkpoint 42 reached, JSON dumps saved | Time Elapsed: 2565.4275114536285
Checkpoint 43 reached, JSON dumps saved | Time Elapsed: 2607.0603444576263
Checkpoint 44 reached, JSON dumps saved | Time Elapsed: 2645.8500475883484
Checkpoint 45 reached, JSON dumps saved | Time Elapsed: 2684.4378321170807
Checkpoint 46 reached, JSON dumps saved | Time Elapsed: 2744.773622274399
Checkpoint 47 reached, JSON dumps saved | Time Elapsed: 2803.4618237018585
Checkpoint 48 reached, JSON dumps saved | Time Elapsed: 2846.2913761138916
Checkpoint 49 reached, JSON dumps saved | Time Elapsed: 2873.521389245987
Checkpoint 50 reached, JSON dumps saved | Time Elapsed: 2899.4368510246277
Checkpoint 51 reached, JSON dumps saved | Time Elapsed: 2926.8228220939636
Checkpoint 52 reached, JSON dumps saved | Time Elapsed: 2971.485548734665
Checkpoint 53 reached, JSON dumps saved | Time Elapsed: 3006.089823484421
Checkpoint 54 reached, JSON du

Wikidata redirect detected.  Input entity id=Q23039002. Returned entity id=Q12795967.


Checkpoint 56 reached, JSON dumps saved | Time Elapsed: 3117.559594154358
Checkpoint 57 reached, JSON dumps saved | Time Elapsed: 3143.804900407791
Checkpoint 58 reached, JSON dumps saved | Time Elapsed: 3184.8111715316772
Checkpoint 59 reached, JSON dumps saved | Time Elapsed: 3219.332836151123
Checkpoint 60 reached, JSON dumps saved | Time Elapsed: 3258.0100610256195
Checkpoint 61 reached, JSON dumps saved | Time Elapsed: 3283.8272020816803
Checkpoint 62 reached, JSON dumps saved | Time Elapsed: 3310.3819451332092
Checkpoint 63 reached, JSON dumps saved | Time Elapsed: 3333.5070214271545
Checkpoint 64 reached, JSON dumps saved | Time Elapsed: 3358.6681773662567
Checkpoint 65 reached, JSON dumps saved | Time Elapsed: 3382.217349052429
Checkpoint 66 reached, JSON dumps saved | Time Elapsed: 3401.7007727622986
Checkpoint 67 reached, JSON dumps saved | Time Elapsed: 3428.516932487488
Checkpoint 68 reached, JSON dumps saved | Time Elapsed: 3461.605919122696
Checkpoint 69 reached, JSON dum

Wikidata redirect detected.  Input entity id=Q21493525. Returned entity id=Q10390522.


Checkpoint 84 reached, JSON dumps saved | Time Elapsed: 3915.2210364341736
Checkpoint 85 reached, JSON dumps saved | Time Elapsed: 3938.032824754715
Checkpoint 86 reached, JSON dumps saved | Time Elapsed: 3967.008377790451
Checkpoint 87 reached, JSON dumps saved | Time Elapsed: 3992.8059828281403
Checkpoint 88 reached, JSON dumps saved | Time Elapsed: 4022.008956670761
Checkpoint 89 reached, JSON dumps saved | Time Elapsed: 4053.029325246811
Checkpoint 90 reached, JSON dumps saved | Time Elapsed: 4077.3499042987823
Checkpoint 91 reached, JSON dumps saved | Time Elapsed: 4111.802200078964
Checkpoint 92 reached, JSON dumps saved | Time Elapsed: 4135.4665195941925
Checkpoint 93 reached, JSON dumps saved | Time Elapsed: 4161.0262978076935
Checkpoint 94 reached, JSON dumps saved | Time Elapsed: 4181.848559379578
Checkpoint 95 reached, JSON dumps saved | Time Elapsed: 4212.783807754517
Checkpoint 96 reached, JSON dumps saved | Time Elapsed: 4238.035322666168
Checkpoint 97 reached, JSON dumps

Wikidata redirect detected.  Input entity id=Q21453734. Returned entity id=Q14590094.


Checkpoint 116 reached, JSON dumps saved | Time Elapsed: 4733.23392367363
Checkpoint 117 reached, JSON dumps saved | Time Elapsed: 4755.147875070572
Checkpoint 118 reached, JSON dumps saved | Time Elapsed: 4778.959326267242
Checkpoint 119 reached, JSON dumps saved | Time Elapsed: 4795.771826505661
Checkpoint 120 reached, JSON dumps saved | Time Elapsed: 4817.611234903336
Checkpoint 121 reached, JSON dumps saved | Time Elapsed: 4839.167044401169
Checkpoint 122 reached, JSON dumps saved | Time Elapsed: 4854.411942720413
Checkpoint 123 reached, JSON dumps saved | Time Elapsed: 4880.6273312568665
Checkpoint 124 reached, JSON dumps saved | Time Elapsed: 4899.690867185593
Checkpoint 125 reached, JSON dumps saved | Time Elapsed: 4923.692687034607
Checkpoint 126 reached, JSON dumps saved | Time Elapsed: 4941.568258047104
Checkpoint 127 reached, JSON dumps saved | Time Elapsed: 4957.810575008392
Checkpoint 128 reached, JSON dumps saved | Time Elapsed: 4980.089985132217
Checkpoint 129 reached, J

Wikidata redirect detected.  Input entity id=Q19540914. Returned entity id=Q16471351.


Checkpoint 150 reached, JSON dumps saved | Time Elapsed: 5419.919678926468
Checkpoint 151 reached, JSON dumps saved | Time Elapsed: 5440.2959225177765
Checkpoint 152 reached, JSON dumps saved | Time Elapsed: 5454.625851869583
Checkpoint 153 reached, JSON dumps saved | Time Elapsed: 5470.888478279114
Checkpoint 154 reached, JSON dumps saved | Time Elapsed: 5490.357317686081
Checkpoint 155 reached, JSON dumps saved | Time Elapsed: 5506.045521497726
Checkpoint 156 reached, JSON dumps saved | Time Elapsed: 5523.778427600861
Checkpoint 157 reached, JSON dumps saved | Time Elapsed: 5549.201649427414
Checkpoint 158 reached, JSON dumps saved | Time Elapsed: 5566.572125196457
Checkpoint 159 reached, JSON dumps saved | Time Elapsed: 5595.400729417801
Checkpoint 160 reached, JSON dumps saved | Time Elapsed: 5620.007109165192
Checkpoint 161 reached, JSON dumps saved | Time Elapsed: 5639.688066244125
Checkpoint 162 reached, JSON dumps saved | Time Elapsed: 5662.860732316971
Checkpoint 163 reached, 

#### 4. Extracting Labels for Left-Over IDs

In [None]:
# Merge the labelled checkpoint files (parts 1..184) for `occupation` into one dict.
# NOTE(review): the upper bound is hard-coded; confirm it matches the number of
# part files actually present on disk.
labelled_parts_prefix = './Data/English_Labelled_Wikidata_'+occupation+'/en_labelled_part'
labelled_wikidata_en = {}
for part_no in range(1, 185):
    with open(labelled_parts_prefix+str(part_no)+'.json') as part_file:
        part_contents = json.load(part_file)
    labelled_wikidata_en.update(part_contents)

In [6]:
# Entries of wikipage_person_dict whose key is absent from the labelled data loaded above.
left_over = {
    person_key: wikipage_entry
    for person_key, wikipage_entry in wikipage_person_dict.items()
    if person_key not in labelled_wikidata_en
}

In [7]:
# Label the persons in left_over exactly as in the main labelling cell: for each
# person build {property label: [value label or raw value, ...]} from its claims.
#
# label_list and non_labelled_props are re-initialised here, so labels are fetched
# again through the API rather than reused from an earlier cell.
labelled_person_list_left_over = {}
index = 0
label_list = {}
non_labelled_props = ['P727']

for person_id in left_over.keys():

    person = person_dict[person_id]

    person_labelled_prop_val = {}
    list_prop_value = person['claims']

    # Skip properties already known not to have a usable label.
    candidate_properties = [prop for prop in list_prop_value.keys()
                            if prop not in non_labelled_props]

    # Resolve a label for every remaining property. Survivors are collected into a
    # NEW list: the previous code removed items from the very list it was iterating
    # (the "copy" was only an alias), which skipped the property after each failure.
    list_properties = []
    for prop in candidate_properties:
        if prop not in label_list:
            try:
                prop_details = get_entity_dict_from_api(prop)
                label_list[prop] = prop_details['labels']['en']['value']
            except Exception:
                # Best effort: remember the property as unlabelled and move on.
                # `Exception` (not a bare except) so Ctrl-C still interrupts the run.
                non_labelled_props.append(prop)
                continue
        list_properties.append(prop)

    # For all values per property, a label is extracted for each value ID [Q##### format]
    for prop in list_properties:

        labelled_values = []

        for value in list_prop_value[prop]:

            mainsnak = value['mainsnak']
            # Only snaks of type 'value' are read; any other snaktype is skipped.
            if mainsnak['snaktype'] != 'value':
                continue

            raw_value = mainsnak['datavalue']['value']

            # Plain string values are stored as they are.
            if isinstance(raw_value, str):
                labelled_values.append(raw_value)

            elif isinstance(raw_value, dict):

                if 'id' in raw_value:
                    # The value refers to another entity: store its English label.
                    value_id = raw_value['id']

                    # Labels are cached in label_list to avoid repeated API calls.
                    if value_id not in label_list:
                        value_details = get_entity_dict_from_api(value_id)
                        if 'labels' in value_details and 'en' in value_details['labels']:
                            label_list[value_id] = value_details['labels']['en']['value']

                    # Entities without an English label are dropped.
                    if value_id in label_list:
                        labelled_values.append(label_list[value_id])

                else:
                    # Dict values without an 'id' key are stored unchanged.
                    labelled_values.append(raw_value)

        if labelled_values:
            person_labelled_prop_val[label_list[prop]] = labelled_values

    # Progress counter: one line per processed person.
    index+=1
    print(index)

    labelled_person_list_left_over[person['id']] = person_labelled_prop_val

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38


In [None]:
# Persist the left-over persons' labelled data as part 186 of the occupation's labelled set.
left_over_part_path = './Data/English_Labelled_Wikidata_'+occupation+'/en_labelled_part186.json'
with open(left_over_part_path, 'w') as left_over_file:
    json.dump(labelled_person_list_left_over, left_over_file)