In [0]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, Comment
import requests
import sqlite3
import json
import os
import urllib
import re

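Note: the `find-job-titles` package (https://pypi.org/project/find-job-titles/) didn't work here, so the cells below extract occupations from Wikipedia short descriptions using GloVe word vectors.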

In [0]:
# Fetch the Wikipedia short description of every page listed in names.json
# and store it in `occupations` as {page title: description or None}.
# JSON is UTF-8 by specification, so do not rely on the locale's default encoding.
with open('names.json', 'r', encoding='utf-8') as f:
    names = json.load(f)

# One session so the HTTP connection is reused across requests.
S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

occupations = {}

# Only the first 100 entries are queried.
names = names[:100]
for e in names:
    # The first element of each entry is used as the page title.
    title = e[0]

    PARAMS = {
      "action": "query",
      "format": "json",
      "titles": title,
      "prop": "description"
    }

    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status surfaces HTTP errors instead of a confusing JSON error.
    reply = S.get(url=URL, params=PARAMS, timeout=10)
    reply.raise_for_status()
    response = reply.json()

    # A single title was requested, so the first page entry is the one we want.
    page = next(iter(response['query']['pages'].values()))
    # Pages without a short description have no 'description' key; record
    # None for them rather than hiding every possible error behind a bare except.
    occupations[title] = page.get('description')

In [0]:
# Report how many titles were looked up, then how many distinct
# descriptions (None included) they map to.
unique_occupations = set(occupations.values())
total_titles = len(occupations)
print(total_titles)
print(len(unique_occupations))

100
62


In [0]:
# Load the pretrained GloVe word vectors into `w2v` as {word: float32 vector}.
# Download from https://nlp.stanford.edu/projects/glove/
# (glove.6B.zip, trained on Wikipedia).
# Approach adapted from
# https://stackoverflow.com/questions/47666699/using-word2vec-to-classify-words-in-categories
w2v = {}
# The file is opened explicitly as UTF-8 so loading does not depend on the
# platform's default text encoding.
with open('glove.6B/glove.6B.100d.txt', encoding='utf-8') as f: # dim = 100
    for line in f:
        # Each line is the word followed by its vector components.
        values = line.split()
        word = values[0]
        embed = np.array(values[1:], dtype=np.float32)
        w2v[word] = embed
print('Loaded %s word vectors.' % len(w2v))

Loaded 400000 word vectors.


In [0]:
def cosine_similarlity(v1, v2):
    """Return the cosine of the angle between vectors ``v1`` and ``v2``.

    Computed as the dot product divided by the product of the two
    Euclidean norms.
    """
    dot_product = v1.dot(v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / norm_product

def get_score(emb):
    """Score how occupation-like (versus nationality-like) an embedding is.

    Returns -1 when ``emb`` is None. Otherwise returns the best cosine
    similarity to the 'job' / 'occupation' / 'career' vectors minus the best
    cosine similarity to the 'nationality' / 'country' vectors in ``w2v``.
    """
    if emb is None:
        return -1

    def best_match(*anchor_words):
        # Highest similarity between emb and any of the anchor word vectors.
        return max(cosine_similarlity(emb, w2v[word]) for word in anchor_words)

    nationality_sim = best_match('nationality', 'country')
    occupation_sim = best_match('job', 'occupation', 'career')
    return occupation_sim - nationality_sim

def remove_irrelevant(tokens, threshold=0.5):
    """Keep the tokens that score as occupation-like within one description.

    Each token is scored with ``get_score`` on its ``w2v`` embedding (tokens
    missing from ``w2v`` are scored from ``None``). The scores are min-max
    rescaled to [0, 1] across the tokens of this call, and tokens whose
    rescaled score is strictly greater than ``threshold`` are kept.

    Args:
        tokens: list of word strings from a single description.
        threshold: cut-off applied to the rescaled scores (default 0.5).

    Returns:
        The kept tokens in their original order. The list is empty when
        ``tokens`` is empty or when every token receives the same score.
    """
    if len(tokens) == 0:
        # np.min / np.max raise ValueError on an empty array.
        return []

    # w2v.get yields None as a placeholder for unknown words.
    scores = np.array([get_score(w2v.get(token)) for token in tokens])

    lowest = np.min(scores)
    spread = np.max(scores) - lowest
    if spread == 0:
        # A single token, or tokens that all score the same, cannot be ranked
        # against each other. Dividing by the zero spread would only produce
        # NaN (and a RuntimeWarning) that never exceeds the threshold, so
        # return the empty result explicitly.
        # NOTE(review): this also drops a lone known word such as a one-word
        # description; revisit if those should be kept.
        return []

    # Rescale so the lowest-scoring token maps to 0 and the highest to 1.
    rescaled = (scores - lowest) / spread
    return [token for token, score in zip(tokens, rescaled) if score > threshold]
    
def get_occupations(des):
    """Extract the occupation-like words from a description string.

    The description is lower-cased, hyphens are treated as spaces, the text
    is split on whitespace, the conjunction "and" is dropped, and the
    remaining tokens are filtered by ``remove_irrelevant``.

    Args:
        des: description text, or a falsy value (None / empty string).

    Returns:
        None when ``des`` is falsy; otherwise a list of kept tokens, which is
        empty when no tokens remain after tokenising.
    """
    if not des:
        return None

    # Drop "and" as a whole token. Deleting the substring 'and ' would also
    # eat the tail of words that end in "and", e.g. "new zealand politician"
    # -> "new zealpolitician" or "band leader" -> "bleader".
    tokens = [token
              for token in des.lower().replace('-', ' ').split()
              if token != 'and']
    if not tokens:
        # Nothing left to score (e.g. the description was only "and" or "-").
        return []
    return remove_irrelevant(tokens)

# Print every distinct description next to the tokens kept for it.
# The result is bound to its own name so the loop does not overwrite the
# `occupations` dict (title -> description) that other cells read.
for des in unique_occupations:
    extracted = get_occupations(des)
    print(des, ' -->', extracted)

Tibetan Buddhist teacher and lama  --> ['teacher']
Bangladeshi physicist  --> ['physicist']
Tongan politician  --> ['politician']
Rinpoche  --> []
Olympic judoka  --> ['olympic']
American convicted drug trafficker  --> ['convicted', 'drug']
young adult fiction writer  --> ['fiction', 'writer']
American music producer  --> ['music', 'producer']
Italian contemporary artist  --> ['artist']
French disc jockey  --> ['disc', 'jockey']
Japanese adult video (AV) director  --> ['japanese', 'adult', 'video', 'director']
association football player  --> ['football', 'player']
Bhutanese spiritual leader  --> ['bhutanese', 'spiritual']
Baritone saxophone player based in New York City  --> ['baritone', 'saxophone', 'player', 'york']
Lesotho politician  --> ['politician']
Swiss journalist  --> ['journalist']
spiritual leader of Tibet  --> ['spiritual', 'tibet']
American rapper  --> ['rapper']
professional wrestler  --> ['professional']
United States rapper  --> ['rapper']
Lesotho distance runner  -->

