# Extracting Occupation and Employer data from Wikidata
This notebook extracts data from the Wikidata dump. It takes a long time to run and isn't very instructive, but it *does* allow the data gathering portion of this experiment to be easily repeated.
To run this notebook:
* first, download the wikidata json.bz2 dump
* second, download the [Kensho Wikimedia derived dataset](https://www.kaggle.com/kenshoresearch/kensho-derived-wikimedia-data)

In [1]:
import bz2
import json
import os
import pickle
from tqdm import tqdm
import pandas as pd
# ! pip install Wikidata
from wikidata.client import Client
from random import sample
from collections import Counter

In [2]:
! ls -lh /home/todd/projects/data/wikidata/

total 48G
-rw-rw-r-- 1 todd todd 48G Mar 23 07:39 latest-all.json.bz2


In [4]:
def wikidata(filename):
    with bz2.open(filename, mode='rt') as f:
        f.read(2) # skip first two bytes: "{\n"
        for line in f:
            try:
                yield json.loads(line.rstrip(',\n'))
            except json.decoder.JSONDecodeError:
                continue

In [5]:
occ_cnt = Counter()
emp_cnt = Counter()
pos_cnt = Counter()

def check_data (obj):
    try:
        if 'P106' in obj.get('claims',[]):   
            for item in obj['claims']['P106']:
                key =  item.get('mainsnak',{}).get('datavalue', {}).get('value',{}).get('id')
                if key:
                    occ_cnt.update({ key : 1 })          
        if 'P101' in obj.get('claims',[]):   
            for item in obj['claims']['P101']:
                key =  item.get('mainsnak',{}).get('datavalue', {}).get('value',{}).get('id')
                if key:
                    pos_cnt.update({ key:1 })          
        if 'P108' in obj.get('claims',[]):   
            for item in obj['claims']['P108']:
                key =  item.get('mainsnak',{}).get('datavalue', {}).get('value',{}).get('id')
                if key:
                    pos_cnt.update({ key : 1 })          
    except Exception:
        pass

In [6]:
for record in tqdm(wikidata(os.path.expanduser('~/projects/data/wikidata/latest-all.json.bz2')):
    check_data(record)

78951018it [7:03:58, 3103.65it/s] 


In [7]:
len(occ_cnt), len(pos_cnt), len(emp_cnt)

(12754, 66885, 0)

In [8]:
occ_cnt.most_common(10)

[('Q82955', 606008),
 ('Q1650915', 488772),
 ('Q937857', 259012),
 ('Q36180', 245025),
 ('Q33999', 220615),
 ('Q1028181', 145927),
 ('Q1930187', 117705),
 ('Q1622272', 101818),
 ('Q177220', 92198),
 ('Q49757', 79887)]

In [9]:
with open('occupations.counts.wikidata.2020.03.22.pkl', 'wb') as fout:
    pickle.dump(occ_cnt, fout)

In [12]:
with open('positions.counts.wikidata.2020.03.22.pkl', 'wb') as fout:
    pickle.dump(pos_cnt, fout)

In [13]:
occ_qcodes_counts ={ key[1:]: value for key, value in occ_cnt.items()}

pos_qcode_counts = {key[1:]: value for key, value in pos_cnt.items()}

In [17]:
keys, cnts = zip(*occ_qcodes_counts.items())

occ_df = pd.DataFrame({ "occupation_items": keys, "occupation_counts": cnts  }  )
occ_df.head()

Unnamed: 0,occupation_items,occupation_counts
0,82955,606008
1,189290,33481
2,131512,8887
3,1734662,2012
4,294126,323


In [24]:
occ_df.dtypes

occupation_items     object
occupation_counts     int64
dtype: object

In [27]:
occ_df = occ_df.astype({'occupation_items': 'int32'})
occ_df.dtypes

occupation_items     int32
occupation_counts    int64
dtype: object

In [18]:
items_df =  pd.read_csv('/home/todd/projects/data/kensho_wikidata/tmp/item.csv')
items_df.head()

Unnamed: 0,item_id,en_label,en_description
0,1,Universe,totality of space and all contents
1,2,Earth,third planet from the Sun in the Solar System
2,3,life,matter capable of extracting energy from the e...
3,4,death,permanent cessation of vital functions
4,5,human,"common name of Homo sapiens, unique extant spe..."


In [25]:
items_df.dtypes

item_id            int64
en_label          object
en_description    object
dtype: object

In [28]:
new_occs = pd.merge(occ_df, items_df, left_on='occupation_items', right_on='item_id')
new_occs.head()

Unnamed: 0,occupation_items,occupation_counts,item_id,en_label,en_description
0,82955,606008,82955,politician,"person involved in politics, person who holds ..."
1,189290,33481,189290,military officer,member of an armed force or uniformed service ...
2,131512,8887,131512,farmer,person that works in agriculture
3,1734662,2012,1734662,cartographer,person preparing geographical maps
4,294126,323,294126,land surveyor,profession


In [50]:
new_occs  = new_occs.rename(columns={'occupation_items': 'occupation_item_id'})
new_occs.head()

Unnamed: 0,occupation_item_id,occupation_counts,item_id,en_label,en_description
0,82955,606008,82955,politician,"person involved in politics, person who holds ..."
1,189290,33481,189290,military officer,member of an armed force or uniformed service ...
2,131512,8887,131512,farmer,person that works in agriculture
3,1734662,2012,1734662,cartographer,person preparing geographical maps
4,294126,323,294126,land surveyor,profession


In [51]:
len(new_occs)

11430

In [30]:
keys, cnts = zip(*pos_qcode_counts.items())

pos_df = pd.DataFrame({ "position_item_id": keys, "position_counts": cnts  }  )
pos_df.head()

Unnamed: 0,position_item_id,position_counts
0,2944031,9
1,214126,53
2,37230,493
3,9531,1520
4,134995,57


In [31]:
pos_df = pos_df.astype({'position_item_id': 'int64'})
pos_df.dtypes

position_item_id    int64
position_counts     int64
dtype: object

In [32]:
new_pos = pd.merge(pos_df, items_df, left_on='position_item_id', right_on='item_id')
new_pos.head()

Unnamed: 0,position_item_id,position_counts,item_id,en_label,en_description
0,2944031,9,2944031,Counter Terrorist Unit,fictional branch of the CIA from the televisio...
1,214126,53,214126,Los Angeles Police Department,municipal police
2,37230,493,37230,Central Intelligence Agency,national intelligence agency of the United States
3,9531,1520,9531,BBC,British public service broadcaster
4,134995,57,134995,bibliography,


In [35]:
new_pos.to_csv('wikidata.employers.csv', index=False, sep='\t')

In [34]:
new_pos.iloc[1110]

position_item_id                                               645663
position_counts                                                   745
item_id                                                        645663
en_label                                           University of Pisa
en_description      Italian public research university located in ...
Name: 1110, dtype: object

In [52]:
new_occs.to_csv('wikidata.occupations.csv', index=False, sep='\t')

In [38]:
len(new_pos)

64733

In [53]:
new_occs.sample(n=5)

Unnamed: 0,occupation_item_id,occupation_counts,item_id,en_label,en_description
9947,1537376,1,1537376,grave robbery,act of uncovering a tomb or crypt to steal art...
4603,193036,1,193036,acrobatics,Performance of extraordinary human feats of ba...
10971,45176174,1,45176174,,
512,2732142,2390,2732142,statistician,person who works with theoretical or applied s...
8805,452961,2,452961,script,document describing the narrative and dialogue...


In [40]:
new_pos.sample(n=5)

Unnamed: 0,position_item_id,position_counts,item_id,en_label,en_description
53577,30279877,1,30279877,Ostschweizer Kinderspital,"healthcare organization in St. Gallen, Switzer..."
15104,3254948,2,3254948,Diario de Cádiz,"Spanish-language newspaper published in Cádiz,..."
34893,6971380,9,6971380,National Center for Toxicological Research,
38378,11622605,1,11622605,Ranshadō,private school in Yokohama city
36004,9014504,1,9014504,Polaris Media,


In [42]:
len(new_pos), len(new_occs)

(64733, 11430)

In [43]:
len(occ_qcodes_counts), len(pos_qcode_counts)

(12754, 66885)

## find the missing item_ids

In [65]:
all_occ_item_ids = {int(key) for key in occ_qcodes_counts}
len(all_occ_item_ids)

occ_item_id_from_df = new_occs['occupation_item_id'].tolist()

len(occ_item_id_from_df)

missing_occs  = set(all_occ_item_ids) - set(occ_item_id_from_df)
len(missing_occs)

1324

In [103]:
all_emp_item_ids = { int(key) for key in pos_qcode_counts}
emp_item_id_from_df = new_pos['position_item_id'].tolist()
missing_emps = set(all_emp_item_ids) - set(emp_item_id_from_df)
len(missing_emps)

2152

# Fetch missing occupations and employers from wikidata

In [102]:
client = Client()   

In [90]:
results =[]
for item_id in tqdm(missing_occs, total=len(missing_occs)):
    try:
        entity = client.get(f"Q{item_id}", load=True)
        cnt = occ_qcodes_counts.get(str(item_id ),0)
        results.append( (item_id, str(entity.label) , str(entity.description), cnt))
    except Exception as ex:
        print("fail ", str(ex))
    



  0%|          | 0/1324 [00:00<?, ?it/s][A[A

  0%|          | 1/1324 [00:03<1:14:58,  3.40s/it][A[A

  0%|          | 2/1324 [00:06<1:12:13,  3.28s/it][A[A

  0%|          | 3/1324 [00:09<1:12:09,  3.28s/it][A[A

  0%|          | 4/1324 [00:12<1:12:19,  3.29s/it][A[A

  0%|          | 5/1324 [00:16<1:11:08,  3.24s/it][A[A

  0%|          | 6/1324 [00:19<1:11:27,  3.25s/it][A[A

  1%|          | 7/1324 [00:23<1:14:02,  3.37s/it][A[A

  1%|          | 8/1324 [00:26<1:13:34,  3.35s/it][A[A

  1%|          | 9/1324 [00:29<1:11:42,  3.27s/it][A[A

  1%|          | 10/1324 [00:32<1:12:07,  3.29s/it][A[A

  1%|          | 11/1324 [00:36<1:13:16,  3.35s/it][A[A

  1%|          | 12/1324 [00:39<1:12:45,  3.33s/it][A[A

  1%|          | 13/1324 [00:42<1:12:33,  3.32s/it][A[A

  1%|          | 14/1324 [00:46<1:12:20,  3.31s/it][A[A

  1%|          | 15/1324 [00:49<1:12:13,  3.31s/it][A[A

  1%|          | 16/1324 [00:53<1:14:44,  3.43s/it][A[A

  1%|▏         

 21%|██        | 275/1324 [17:05<56:34,  3.24s/it][A[A

 21%|██        | 276/1324 [17:08<56:53,  3.26s/it][A[A

 21%|██        | 277/1324 [17:11<55:26,  3.18s/it][A[A

 21%|██        | 278/1324 [17:15<55:53,  3.21s/it][A[A

 21%|██        | 279/1324 [17:18<59:41,  3.43s/it][A[A

 21%|██        | 280/1324 [17:21<53:58,  3.10s/it][A[A

 21%|██        | 281/1324 [17:24<54:51,  3.16s/it][A[A

 21%|██▏       | 282/1324 [17:27<55:23,  3.19s/it][A[A

 21%|██▏       | 283/1324 [17:31<55:47,  3.22s/it][A[A

 21%|██▏       | 284/1324 [17:34<56:12,  3.24s/it][A[A

 22%|██▏       | 285/1324 [17:37<55:00,  3.18s/it][A[A

 22%|██▏       | 286/1324 [17:40<55:27,  3.21s/it][A[A

 22%|██▏       | 287/1324 [17:45<1:01:52,  3.58s/it][A[A

 22%|██▏       | 288/1324 [17:47<52:58,  3.07s/it]  [A[A

 22%|██▏       | 289/1324 [17:50<54:04,  3.13s/it][A[A

 22%|██▏       | 290/1324 [17:53<54:57,  3.19s/it][A[A

 22%|██▏       | 291/1324 [17:56<55:19,  3.21s/it][A[A

 22%|██▏  

 42%|████▏     | 556/1324 [33:16<44:42,  3.49s/it][A[A

 42%|████▏     | 557/1324 [33:20<43:58,  3.44s/it][A[A

 42%|████▏     | 558/1324 [33:24<49:43,  3.90s/it][A[A

 42%|████▏     | 559/1324 [33:28<47:17,  3.71s/it][A[A

 42%|████▏     | 560/1324 [33:31<45:42,  3.59s/it][A[A

 42%|████▏     | 561/1324 [33:35<47:48,  3.76s/it][A[A

 42%|████▏     | 562/1324 [33:37<42:01,  3.31s/it][A[A

 43%|████▎     | 563/1324 [33:41<41:53,  3.30s/it][A[A

 43%|████▎     | 564/1324 [33:45<47:02,  3.71s/it][A[A

 43%|████▎     | 565/1324 [33:47<39:14,  3.10s/it][A[A

 43%|████▎     | 566/1324 [33:50<39:58,  3.16s/it][A[A

 43%|████▎     | 567/1324 [33:54<40:25,  3.20s/it][A[A

 43%|████▎     | 568/1324 [33:57<40:39,  3.23s/it][A[A

 43%|████▎     | 569/1324 [34:02<46:44,  3.71s/it][A[A

 43%|████▎     | 570/1324 [34:07<52:56,  4.21s/it][A[A

 43%|████▎     | 571/1324 [34:11<49:24,  3.94s/it][A[A

 43%|████▎     | 572/1324 [34:15<52:23,  4.18s/it][A[A

 43%|████▎    

 53%|█████▎    | 697/1324 [41:55<34:25,  3.29s/it][A[A

 53%|█████▎    | 698/1324 [41:58<34:23,  3.30s/it][A[A

 53%|█████▎    | 699/1324 [42:02<36:40,  3.52s/it][A[A

 53%|█████▎    | 700/1324 [42:08<41:25,  3.98s/it][A[A

 53%|█████▎    | 701/1324 [42:09<32:54,  3.17s/it][A[A

 53%|█████▎    | 702/1324 [42:12<33:15,  3.21s/it][A[A

 53%|█████▎    | 703/1324 [42:15<33:25,  3.23s/it][A[A

 53%|█████▎    | 704/1324 [42:19<33:29,  3.24s/it][A[A

 53%|█████▎    | 705/1324 [42:28<53:50,  5.22s/it][A[A

 53%|█████▎    | 706/1324 [42:32<47:51,  4.65s/it][A[A

 53%|█████▎    | 707/1324 [42:35<43:03,  4.19s/it][A[A

 53%|█████▎    | 708/1324 [42:38<40:12,  3.92s/it][A[A

 54%|█████▎    | 709/1324 [42:41<38:11,  3.73s/it][A[A

 54%|█████▎    | 710/1324 [42:50<54:18,  5.31s/it][A[A

 54%|█████▎    | 711/1324 [42:54<47:58,  4.70s/it][A[A

 54%|█████▍    | 712/1324 [42:57<44:21,  4.35s/it][A[A

 54%|█████▍    | 713/1324 [43:01<41:04,  4.03s/it][A[A

 54%|█████▍   

 74%|███████▍  | 979/1324 [59:21<18:52,  3.28s/it][A[A

 74%|███████▍  | 980/1324 [59:25<18:53,  3.29s/it][A[A

 74%|███████▍  | 981/1324 [59:28<18:48,  3.29s/it][A[A

 74%|███████▍  | 982/1324 [59:31<19:13,  3.37s/it][A[A

 74%|███████▍  | 983/1324 [59:35<19:02,  3.35s/it][A[A

 74%|███████▍  | 984/1324 [59:38<18:58,  3.35s/it][A[A

 74%|███████▍  | 985/1324 [59:45<25:25,  4.50s/it][A[A

 74%|███████▍  | 986/1324 [59:49<23:18,  4.14s/it][A[A

 75%|███████▍  | 987/1324 [59:52<21:50,  3.89s/it][A[A

 75%|███████▍  | 988/1324 [59:55<21:15,  3.80s/it][A[A

 75%|███████▍  | 989/1324 [59:59<20:20,  3.64s/it][A[A

 75%|███████▍  | 990/1324 [1:00:02<19:49,  3.56s/it][A[A

 75%|███████▍  | 991/1324 [1:00:06<20:19,  3.66s/it][A[A

 75%|███████▍  | 992/1324 [1:00:09<19:46,  3.57s/it][A[A

 75%|███████▌  | 993/1324 [1:00:13<19:18,  3.50s/it][A[A

 75%|███████▌  | 994/1324 [1:00:16<18:35,  3.38s/it][A[A

 75%|███████▌  | 995/1324 [1:00:19<18:23,  3.35s/it][A[A

 7

 94%|█████████▍| 1248/1324 [1:15:25<04:19,  3.42s/it][A[A

 94%|█████████▍| 1249/1324 [1:15:29<04:13,  3.38s/it][A[A

 94%|█████████▍| 1250/1324 [1:15:32<04:16,  3.47s/it][A[A

 94%|█████████▍| 1251/1324 [1:15:36<04:23,  3.61s/it][A[A

 95%|█████████▍| 1252/1324 [1:15:40<04:13,  3.52s/it][A[A

 95%|█████████▍| 1253/1324 [1:15:43<04:05,  3.45s/it][A[A

 95%|█████████▍| 1254/1324 [1:15:46<03:58,  3.40s/it][A[A

 95%|█████████▍| 1255/1324 [1:15:50<04:13,  3.67s/it][A[A

 95%|█████████▍| 1256/1324 [1:15:54<04:03,  3.59s/it][A[A

 95%|█████████▍| 1257/1324 [1:15:57<03:54,  3.49s/it][A[A

 95%|█████████▌| 1258/1324 [1:16:01<03:49,  3.48s/it][A[A

 95%|█████████▌| 1259/1324 [1:16:04<03:36,  3.33s/it][A[A

 95%|█████████▌| 1260/1324 [1:16:07<03:32,  3.33s/it][A[A

 95%|█████████▌| 1261/1324 [1:16:11<03:37,  3.45s/it][A[A

 95%|█████████▌| 1262/1324 [1:16:14<03:41,  3.58s/it][A[A

 95%|█████████▌| 1263/1324 [1:16:17<03:16,  3.23s/it][A[A

 95%|█████████▌| 1264/13

In [94]:
ids, labels, descs, cnts= zip(*results)

newer_occs = pd.DataFrame({"occupation_item_id": ids, "occupation_counts": cnts,
                          "en_label": labels, "en_description": descs})

In [96]:
newer_occs.sample(5)

Unnamed: 0,occupation_item_id,occupation_counts,en_label,en_description
850,11293355,71,オートレース選手,
60,1171796,1,Interventionism,Wikipedia disambiguation page
1232,10550654,1,Marinejeger,norsk militär
333,5973933,7,Letrados,
274,11650597,2,録音技師,


In [98]:
all_occs = pd.concat([new_occs, newer_occs], join="inner")
all_occs.head()

Unnamed: 0,occupation_item_id,occupation_counts,en_label,en_description
0,82955,606008,politician,"person involved in politics, person who holds ..."
1,189290,33481,military officer,member of an armed force or uniformed service ...
2,131512,8887,farmer,person that works in agriculture
3,1734662,2012,cartographer,person preparing geographical maps
4,294126,323,land surveyor,profession


In [99]:
all_occs.to_csv('wikidata.occupations.csv', index=False, sep='\t')

In [100]:
tmp_occs = pd.read_csv('wikidata.occupations.csv', sep='\t')
tmp_occs.head()

Unnamed: 0,occupation_item_id,occupation_counts,en_label,en_description
0,82955,606008,politician,"person involved in politics, person who holds ..."
1,189290,33481,military officer,member of an armed force or uniformed service ...
2,131512,8887,farmer,person that works in agriculture
3,1734662,2012,cartographer,person preparing geographical maps
4,294126,323,land surveyor,profession


In [113]:
print(f"occupations in wikidata & kensho {len(new_occs):,} occupations only in wikidata {len(newer_occs):,}" )

occupations in wikidata & kensho 11,430 occupations only in wikidata 1,324


In [112]:
all_occs.to_csv('occupations.wikidata.csv', index=False, sep='\t')

In [104]:
results =[]
for item_id in tqdm(missing_emps, total=len(missing_emps)):
    try:
        entity = client.get(f"Q{item_id}", load=True)
        cnt = pos_qcode_counts.get(str(item_id ),0)
        results.append( (item_id, str(entity.label) , str(entity.description), cnt))
    except Exception as ex:
        print("fail ", str(ex))



  0%|          | 0/2152 [00:00<?, ?it/s][A[A

  0%|          | 1/2152 [00:03<2:01:19,  3.38s/it][A[A

  0%|          | 2/2152 [00:06<2:00:29,  3.36s/it][A[A

  0%|          | 3/2152 [00:10<2:09:08,  3.61s/it][A[A

  0%|          | 4/2152 [00:13<1:58:49,  3.32s/it][A[A

  0%|          | 5/2152 [00:20<2:39:48,  4.47s/it][A[A

  0%|          | 6/2152 [00:28<3:19:28,  5.58s/it][A[A

  0%|          | 7/2152 [00:32<2:54:46,  4.89s/it][A[A

  0%|          | 8/2152 [00:35<2:37:32,  4.41s/it][A[A

  0%|          | 9/2152 [00:38<2:25:36,  4.08s/it][A[A

  0%|          | 10/2152 [00:41<2:16:33,  3.83s/it][A[A

  1%|          | 11/2152 [00:46<2:26:55,  4.12s/it][A[A

  1%|          | 12/2152 [00:48<1:59:48,  3.36s/it][A[A

  1%|          | 13/2152 [00:51<2:00:05,  3.37s/it][A[A

  1%|          | 14/2152 [00:56<2:13:23,  3.74s/it][A[A

  1%|          | 15/2152 [00:58<1:59:57,  3.37s/it][A[A

  1%|          | 16/2152 [01:09<3:21:31,  5.66s/it][A[A

  1%|          

 13%|█▎        | 274/2152 [17:51<1:52:52,  3.61s/it][A[A

 13%|█▎        | 275/2152 [17:55<1:50:11,  3.52s/it][A[A

 13%|█▎        | 276/2152 [17:58<1:48:10,  3.46s/it][A[A

 13%|█▎        | 277/2152 [18:02<1:50:26,  3.53s/it][A[A

 13%|█▎        | 278/2152 [18:05<1:49:10,  3.50s/it][A[A

 13%|█▎        | 279/2152 [18:09<1:49:39,  3.51s/it][A[A

 13%|█▎        | 280/2152 [18:12<1:51:24,  3.57s/it][A[A

 13%|█▎        | 281/2152 [18:16<1:51:03,  3.56s/it][A[A

 13%|█▎        | 282/2152 [18:20<1:57:56,  3.78s/it][A[A

 13%|█▎        | 283/2152 [18:23<1:50:34,  3.55s/it][A[A

 13%|█▎        | 284/2152 [18:26<1:48:09,  3.47s/it][A[A

 13%|█▎        | 285/2152 [18:30<1:47:06,  3.44s/it][A[A

 13%|█▎        | 286/2152 [18:33<1:45:56,  3.41s/it][A[A

 13%|█▎        | 287/2152 [18:36<1:44:23,  3.36s/it][A[A

 13%|█▎        | 288/2152 [18:40<1:49:32,  3.53s/it][A[A

 13%|█▎        | 289/2152 [18:47<2:22:10,  4.58s/it][A[A

 13%|█▎        | 290/2152 [18:51<2:10:08

fail  HTTP Error 404: Not Found




 19%|█▉        | 415/2152 [27:57<1:54:09,  3.94s/it][A[A

 19%|█▉        | 416/2152 [28:00<1:48:57,  3.77s/it][A[A

 19%|█▉        | 417/2152 [28:03<1:44:45,  3.62s/it][A[A

 19%|█▉        | 418/2152 [28:10<2:06:42,  4.38s/it][A[A

 19%|█▉        | 419/2152 [28:13<1:57:17,  4.06s/it][A[A

 20%|█▉        | 420/2152 [28:16<1:50:33,  3.83s/it][A[A

 20%|█▉        | 421/2152 [28:19<1:45:49,  3.67s/it][A[A

 20%|█▉        | 422/2152 [28:22<1:39:58,  3.47s/it][A[A

 20%|█▉        | 423/2152 [28:26<1:38:38,  3.42s/it][A[A

 20%|█▉        | 424/2152 [28:31<1:50:20,  3.83s/it][A[A

 20%|█▉        | 425/2152 [28:34<1:46:14,  3.69s/it][A[A

 20%|█▉        | 426/2152 [28:37<1:42:33,  3.56s/it][A[A

 20%|█▉        | 427/2152 [28:41<1:40:21,  3.49s/it][A[A

 20%|█▉        | 428/2152 [28:44<1:38:37,  3.43s/it][A[A

 20%|█▉        | 429/2152 [28:47<1:37:27,  3.39s/it][A[A

 20%|█▉        | 430/2152 [28:50<1:36:26,  3.36s/it][A[A

 20%|██        | 431/2152 [28:54<1:35:

fail  HTTP Error 404: Not Found




 20%|██        | 433/2152 [29:00<1:35:30,  3.33s/it][A[A

 20%|██        | 434/2152 [29:04<1:35:17,  3.33s/it][A[A

 20%|██        | 435/2152 [29:12<2:16:28,  4.77s/it][A[A

 20%|██        | 436/2152 [29:13<1:46:27,  3.72s/it][A[A

 20%|██        | 437/2152 [29:16<1:42:09,  3.57s/it][A[A

 20%|██        | 438/2152 [29:20<1:40:02,  3.50s/it][A[A

 20%|██        | 439/2152 [29:23<1:38:32,  3.45s/it][A[A

 20%|██        | 440/2152 [29:26<1:37:46,  3.43s/it][A[A

 20%|██        | 441/2152 [29:35<2:25:41,  5.11s/it][A[A

 21%|██        | 442/2152 [29:39<2:09:53,  4.56s/it][A[A

 21%|██        | 443/2152 [29:42<1:59:08,  4.18s/it][A[A

 21%|██        | 444/2152 [29:45<1:51:35,  3.92s/it][A[A

 21%|██        | 445/2152 [29:49<1:46:14,  3.73s/it][A[A

 21%|██        | 446/2152 [29:52<1:42:42,  3.61s/it][A[A

 21%|██        | 447/2152 [29:55<1:39:58,  3.52s/it][A[A

 21%|██        | 448/2152 [30:00<1:47:36,  3.79s/it][A[A

 21%|██        | 449/2152 [30:03<1:43:

 33%|███▎      | 705/2152 [47:01<1:22:19,  3.41s/it][A[A

fail  HTTP Error 404: Not Found




 33%|███▎      | 706/2152 [47:17<2:47:19,  6.94s/it][A[A

 33%|███▎      | 707/2152 [47:20<2:20:57,  5.85s/it][A[A

 33%|███▎      | 708/2152 [47:24<2:05:12,  5.20s/it][A[A

 33%|███▎      | 709/2152 [47:30<2:10:02,  5.41s/it][A[A

 33%|███▎      | 710/2152 [47:31<1:44:08,  4.33s/it][A[A

 33%|███▎      | 711/2152 [47:35<1:36:32,  4.02s/it][A[A

 33%|███▎      | 712/2152 [47:38<1:34:09,  3.92s/it][A[A

 33%|███▎      | 713/2152 [47:42<1:29:40,  3.74s/it][A[A

 33%|███▎      | 714/2152 [47:46<1:36:32,  4.03s/it][A[A

 33%|███▎      | 715/2152 [47:49<1:23:43,  3.50s/it][A[A

 33%|███▎      | 716/2152 [47:52<1:22:02,  3.43s/it][A[A

 33%|███▎      | 717/2152 [47:56<1:25:59,  3.60s/it][A[A

 33%|███▎      | 718/2152 [47:59<1:25:23,  3.57s/it][A[A

 33%|███▎      | 719/2152 [48:03<1:23:28,  3.49s/it][A[A

 33%|███▎      | 720/2152 [48:06<1:22:12,  3.44s/it][A[A

 34%|███▎      | 721/2152 [48:15<1:58:44,  4.98s/it][A[A

 34%|███▎      | 722/2152 [48:19<1:52:

 45%|████▌     | 975/2152 [1:06:20<1:09:56,  3.56s/it][A[A

 45%|████▌     | 976/2152 [1:06:23<1:02:49,  3.21s/it][A[A

 45%|████▌     | 977/2152 [1:06:30<1:29:09,  4.55s/it][A[A

 45%|████▌     | 978/2152 [1:06:34<1:22:08,  4.20s/it][A[A

 45%|████▌     | 979/2152 [1:06:37<1:15:07,  3.84s/it][A[A

 46%|████▌     | 980/2152 [1:06:40<1:11:52,  3.68s/it][A[A

 46%|████▌     | 981/2152 [1:06:45<1:18:35,  4.03s/it][A[A

 46%|████▌     | 982/2152 [1:06:48<1:11:25,  3.66s/it][A[A

 46%|████▌     | 983/2152 [1:06:51<1:09:28,  3.57s/it][A[A

 46%|████▌     | 984/2152 [1:06:54<1:08:08,  3.50s/it][A[A

 46%|████▌     | 985/2152 [1:06:58<1:06:51,  3.44s/it][A[A

 46%|████▌     | 986/2152 [1:07:01<1:05:42,  3.38s/it][A[A

 46%|████▌     | 987/2152 [1:07:15<2:10:21,  6.71s/it][A[A

 46%|████▌     | 988/2152 [1:07:20<2:00:58,  6.24s/it][A[A

 46%|████▌     | 989/2152 [1:07:22<1:32:06,  4.75s/it][A[A

 46%|████▌     | 990/2152 [1:07:26<1:27:00,  4.49s/it][A[A

 46%|███

fail  HTTP Error 404: Not Found




 54%|█████▍    | 1165/2152 [1:19:35<1:01:33,  3.74s/it][A[A

 54%|█████▍    | 1166/2152 [1:19:38<1:01:18,  3.73s/it][A[A

 54%|█████▍    | 1167/2152 [1:19:42<59:27,  3.62s/it]  [A[A

 54%|█████▍    | 1168/2152 [1:19:47<1:07:15,  4.10s/it][A[A

 54%|█████▍    | 1169/2152 [1:19:49<58:56,  3.60s/it]  [A[A

 54%|█████▍    | 1170/2152 [1:19:53<57:24,  3.51s/it][A[A

 54%|█████▍    | 1171/2152 [1:19:56<56:17,  3.44s/it][A[A

 54%|█████▍    | 1172/2152 [1:19:59<55:35,  3.40s/it][A[A

 55%|█████▍    | 1173/2152 [1:20:05<1:08:45,  4.21s/it][A[A

 55%|█████▍    | 1174/2152 [1:20:10<1:13:07,  4.49s/it][A[A

 55%|█████▍    | 1175/2152 [1:20:16<1:20:01,  4.91s/it][A[A

 55%|█████▍    | 1176/2152 [1:20:20<1:12:08,  4.44s/it][A[A

 55%|█████▍    | 1177/2152 [1:20:23<1:06:42,  4.10s/it][A[A

 55%|█████▍    | 1178/2152 [1:20:32<1:30:10,  5.56s/it][A[A

 55%|█████▍    | 1179/2152 [1:20:35<1:19:16,  4.89s/it][A[A

 55%|█████▍    | 1180/2152 [1:20:39<1:11:57,  4.44s/it][A

 66%|██████▋   | 1428/2152 [1:38:22<46:54,  3.89s/it][A[A

 66%|██████▋   | 1429/2152 [1:38:25<44:43,  3.71s/it][A[A

 66%|██████▋   | 1430/2152 [1:38:29<42:25,  3.53s/it][A[A

 66%|██████▋   | 1431/2152 [1:38:32<41:41,  3.47s/it][A[A

 67%|██████▋   | 1432/2152 [1:38:35<40:33,  3.38s/it][A[A

 67%|██████▋   | 1433/2152 [1:38:39<42:43,  3.57s/it][A[A

 67%|██████▋   | 1434/2152 [1:38:42<41:43,  3.49s/it][A[A

 67%|██████▋   | 1435/2152 [1:38:47<46:49,  3.92s/it][A[A

 67%|██████▋   | 1436/2152 [1:38:49<37:58,  3.18s/it][A[A

 67%|██████▋   | 1437/2152 [1:38:56<51:45,  4.34s/it][A[A

 67%|██████▋   | 1438/2152 [1:38:59<47:57,  4.03s/it][A[A

 67%|██████▋   | 1439/2152 [1:39:02<45:11,  3.80s/it][A[A

 67%|██████▋   | 1440/2152 [1:39:06<43:18,  3.65s/it][A[A

 67%|██████▋   | 1441/2152 [1:39:09<42:08,  3.56s/it][A[A

 67%|██████▋   | 1442/2152 [1:39:12<41:07,  3.48s/it][A[A

 67%|██████▋   | 1443/2152 [1:39:29<1:29:00,  7.53s/it][A[A

 67%|██████▋   | 1444/

 79%|███████▊  | 1694/2152 [1:57:54<32:16,  4.23s/it][A[A

 79%|███████▉  | 1695/2152 [1:57:58<30:10,  3.96s/it][A[A

 79%|███████▉  | 1696/2152 [1:58:01<28:40,  3.77s/it][A[A

 79%|███████▉  | 1697/2152 [1:58:13<47:13,  6.23s/it][A[A

 79%|███████▉  | 1698/2152 [1:58:18<44:36,  5.90s/it][A[A

 79%|███████▉  | 1699/2152 [1:58:19<34:03,  4.51s/it][A[A

 79%|███████▉  | 1700/2152 [1:58:22<30:42,  4.08s/it][A[A

 79%|███████▉  | 1701/2152 [1:58:31<39:57,  5.32s/it][A[A

 79%|███████▉  | 1702/2152 [1:58:34<35:19,  4.71s/it][A[A

 79%|███████▉  | 1703/2152 [1:58:37<32:08,  4.29s/it][A[A

 79%|███████▉  | 1704/2152 [1:58:41<29:52,  4.00s/it][A[A

 79%|███████▉  | 1705/2152 [1:58:44<28:16,  3.79s/it][A[A

 79%|███████▉  | 1706/2152 [1:58:47<27:06,  3.65s/it][A[A

 79%|███████▉  | 1707/2152 [1:58:51<26:23,  3.56s/it][A[A

 79%|███████▉  | 1708/2152 [1:58:54<25:49,  3.49s/it][A[A

 79%|███████▉  | 1709/2152 [1:58:58<26:34,  3.60s/it][A[A

 79%|███████▉  | 1710/21

fail  HTTP Error 404: Not Found




 89%|████████▉ | 1916/2152 [2:13:11<14:57,  3.81s/it][A[A

 89%|████████▉ | 1917/2152 [2:13:20<21:18,  5.44s/it][A[A

 89%|████████▉ | 1918/2152 [2:13:23<18:40,  4.79s/it][A[A

 89%|████████▉ | 1919/2152 [2:13:26<16:52,  4.34s/it][A[A

 89%|████████▉ | 1920/2152 [2:13:30<16:11,  4.19s/it][A[A

 89%|████████▉ | 1921/2152 [2:13:34<15:07,  3.93s/it][A[A

 89%|████████▉ | 1922/2152 [2:13:38<15:34,  4.06s/it][A[A

 89%|████████▉ | 1923/2152 [2:13:40<13:27,  3.53s/it][A[A

 89%|████████▉ | 1924/2152 [2:13:47<17:02,  4.49s/it][A[A

 89%|████████▉ | 1925/2152 [2:13:50<15:36,  4.13s/it][A[A

 89%|████████▉ | 1926/2152 [2:13:53<14:16,  3.79s/it][A[A

 90%|████████▉ | 1927/2152 [2:13:56<13:40,  3.65s/it][A[A

 90%|████████▉ | 1928/2152 [2:14:00<13:13,  3.54s/it][A[A

 90%|████████▉ | 1929/2152 [2:14:03<12:46,  3.44s/it][A[A

 90%|████████▉ | 1930/2152 [2:14:06<12:33,  3.39s/it][A[A

 90%|████████▉ | 1931/2152 [2:14:10<12:21,  3.36s/it][A[A

 90%|████████▉ | 1932/

In [105]:
ids, labels, descs, cnts= zip(*results)

newer_emps = pd.DataFrame({"employer_item_id": ids, "employer_counts": cnts,
                          "en_label": labels, "en_description": descs})
newer_emps.head()

Unnamed: 0,employer_item_id,employer_counts,en_label,en_description
0,4268034,2,лугаводства,
1,58286088,1,iT law,
2,42303499,1,Краснабярэжскі дзяржаўны аграрны каледж,
3,69812236,1,Мінскпраектмэбля,
4,84303891,1,Beaver Watershed Alliance,watershed conservation alliance in Arkansas


In [106]:
new_pos = new_pos.drop(['item_id'], axis=1)
new_emps  = new_pos.rename(columns={'position_item_id': 'employer_item_id',
                                   'position_counts': 'employer_counts'            })
new_emps.head()

Unnamed: 0,employer_item_id,employer_counts,en_label,en_description
0,2944031,9,Counter Terrorist Unit,fictional branch of the CIA from the televisio...
1,214126,53,Los Angeles Police Department,municipal police
2,37230,493,Central Intelligence Agency,national intelligence agency of the United States
3,9531,1520,BBC,British public service broadcaster
4,134995,57,bibliography,


In [109]:
print(f"employers in wikidata & kensho {len(new_emps):,} employers only in wikidata {len(newer_emps):,}" )

employers in wikidata & kensho 64,733 employers only in wikidata 2,147


In [110]:
all_emps  = pd.concat([new_emps, newer_emps], join="inner")
all_emps.head()

Unnamed: 0,employer_item_id,employer_counts,en_label,en_description
0,2944031,9,Counter Terrorist Unit,fictional branch of the CIA from the televisio...
1,214126,53,Los Angeles Police Department,municipal police
2,37230,493,Central Intelligence Agency,national intelligence agency of the United States
3,9531,1520,BBC,British public service broadcaster
4,134995,57,bibliography,


In [111]:
all_emps.to_csv('employers.wikidata.csv', index=False, sep='\t')