In [1]:
import numpy as np
import pickle
import json
import seaborn as sns
from nltk.corpus import stopwords
from nltk import word_tokenize, RegexpTokenizer
from pyfasttext import FastText
from collections import defaultdict, Counter
import string
from bidict import bidict
from statistics import mean

In [2]:
model = FastText("/data/rali7/Tmp/solimanz/data/wikipedia/wiki.en.bin")

In [3]:
with open("/data/rali7/Tmp/solimanz/data/datasets/top550/jobid/data.json", "rb") as f:
    data550 = json.load(f)
with open("/data/rali7/Tmp/solimanz/data/datasets/reduced7000/jobid/data.json", "rb") as f:
    data7k = json.load(f)

In [13]:
with open("/part/01/Tmp/solimanz/data/skill_lists.json", "r") as f:
    skills = f.read()
skills = skills.replace("}", "},")
skills = skills[:-2] + '\n'
skills = "[" + skills + "]"
skills = json.loads(skills)

In [50]:
tokenizer = RegexpTokenizer(r'\s+|\W', gaps=True)

In [40]:
id_skills = {doc['id']: [skill.lower() for skill in doc['skills']] for doc in skills}

In [43]:
pickle.dump(id_skills, open('/data/rali7/Tmp/solimanz/data/pickles/skills.pkl', 'wb'))

In [6]:
id_skills = pickle.load(open('/data/rali7/Tmp/solimanz/data/pickles/skills.pkl', 'rb'))

In [4]:
ids550 = [p[0] for p in data550['train_data'] + data550['test_data']]
ids7k = [p[0] for p in data7k['train_data'] + data7k['test_data']]

In [22]:
id_skills550 = dict()
for id_ in ids550:
    id_skills550[id_] = list(set(id_skills[id_]))

In [7]:
id_skills7k = dict()
for id_ in ids7k:
    id_skills7k[id_] = list(set(id_skills[id_]))

In [8]:
def get_token_ids(id_skills):
    """ Returns dict that associates every token to an id"""
    sw = set(stopwords.words('english') + ['...'])
    tokenizer = RegexpTokenizer(r'\s+|\W', gaps=True) 
    skills = []    
    for s in id_skills.values():
        skills += s
    
    tokens = list(set(tokenizer.tokenize(" ".join(skills))))
    tokens = [t for t in tokens if t not in string.punctuation and t not in sw]
    
    return {token: id_ for id_, token in enumerate(tokens)}

In [9]:
def get_skill_tokens(id_skills):
    
    tokenizer = RegexpTokenizer(r'\s+|\W', gaps=True)    
    for key, skills in id_skills.items():
        tokens = tokenizer.tokenize(" ".join(skills))
        id_skills[key] = tuple(set(tokens))
        
    return id_skills

In [10]:
token_id = get_token_ids(id_skills7k)

In [11]:
len(token_id)

52379

In [12]:
sorted(token_id.values())[-1]

52378

In [13]:
token_id['<NULL>'] = len(token_id)

In [14]:
id_skills7k = get_skill_tokens(id_skills7k)

In [15]:
skills_embs = np.zeros((len(token_id), 300), dtype=np.float32)

In [16]:
for skill in token_id.keys():
    if skill == '<NULL>':
        continue
    emb = model.get_numpy_vector(skill)
    skills_embs[token_id[skill]] = emb

In [63]:
skills_embs = np.vstack((skills_embs, np.zeros((300), dtype=np.float32)))

In [17]:
skills_embs.shape

(52380, 300)

In [18]:
np.save("/data/rali7/Tmp/solimanz/data/datasets/reduced7000/skill_embs/skill_tokens.npy", skills_embs)

In [19]:
id_sid = {id_: [token_id[tok] for tok in skills if tok in token_id] for id_, skills in id_skills7k.items()}

In [58]:
for k, v in id_sid.items():
    if not v:
        id_sid[k].append('<NULL>')

In [59]:
train_data = data550['train_data']
test_data = data550['test_data']

In [64]:
for dat in train_data:
    id_ = dat[0]
    dat.append(id_sid[id_])

In [66]:
for dat in test_data:
    id_ = dat[0]
    dat.append(id_sid[id_])

In [103]:
data550['train_data'] = train_data
data550['test_data'] = test_data
data550['skill_token_id'] = token_id

In [70]:
data = train_data + test_data
data550['max_skills_num'] = max([len(d[-1]) for d in data])

In [79]:
data550['skills_embs'] = "/data/rali7/Tmp/solimanz/data/datasets/top550/skill_embs/skill_tokens.npy"

In [104]:
with open('/data/rali7/Tmp/solimanz/data/datasets/top550/skill_embs/data.json', 'w') as f:
    json.dump(data550, f)

In [88]:
for dat in test_data:
    if dat[2][0] == '<NULL>':
        dat[2][0] = token_id['<NULL>']

In [93]:
data550.keys()

dict_keys(['title_to_id', 'train_data', 'test_data', 'maximum_seq_len', 'skill_token_id', 'max_skills_num', 'skills_embs'])

In [94]:
data550['max_skills_num']

207

In [100]:
for dat in train_data:
    if len(dat[2]) < data550['max_skills_num']:
        for i in range(data550['max_skills_num'] - len(dat[2])):
            dat[2].append(token_id['<NULL>'])

In [102]:
np.unique([len(dat[2]) for dat in train_data])

array([207])

# Only Focus on the Most Common Skills

In [3]:
with open('/data/rali7/Tmp/solimanz/data/datasets/top550/skill_embs/data.json', 'r') as f:
    data550 = json.load(f)

In [10]:
skill_tok = bidict(data550['skill_token_id'])

In [14]:
skill_tok['<NULL>']

20209

In [31]:
all_data = data550['train_data'] + data550['test_data']
train = data550['train_data']
test = data550['test_data']

In [15]:
id_sid = dict()

for data in all_data:
    id_ = data[0]
    skills = data[2]
    id_sid[id_] = []
    for skill in skills:
        if skill != skill_tok['<NULL>']:
            id_sid[id_].append(skill)

In [20]:
all_skills = []

for skill_set in id_sid.values():
    all_skills += skill_set

In [21]:
skill_tok = bidict(token_id)

In [22]:
all_skills = [skill_tok.inv[i] for i in all_skills]

In [23]:
skill_counts = Counter(all_skills)

In [24]:
for id_, skills in id_sid.items():
    sorted_skills = sorted(skills, key=lambda s: skill_counts[skill_tok.inv[s]], reverse=True)
    id_sid[id_] = sorted_skills

In [25]:
def pad(arr, max_size):
    pad_length = max_size - len(arr)
    for i in range(pad_length):
        arr.append(skill_tok['<NULL>'])
    return arr

In [26]:
train_data = data7k['train_data']
test_data = data7k['test_data']

In [28]:
for p in train_data:
    id_ = p[0] 
    if id_sid[id_]:
        top_skills = id_sid[id_][:10]
        if len(top_skills) < 10:
            top_skills = pad(top_skills, 10)
        p.append(top_skills)
    else:
        p.append([skill_tok['<NULL>']] * 10)

In [29]:
for p in test_data:
    id_ = p[0] 
    if id_sid[id_]:
        top_skills = id_sid[id_][:10]
        if len(top_skills) < 10:
            top_skills = pad(top_skills, 10)
        p.append(top_skills)
    else:
        p.append([skill_tok['<NULL>']] * 10)

In [108]:
skill_tok.inv[49378]
194815

'analysis'

In [109]:
skill_counts['analysis']

131733

In [30]:
data7k['train_data'] = train_data
data7k['test_data'] = test_data
data7k['max_skills_num'] = 10
data7k['skill_token_id'] = token_id
data7k['skills_embs'] = "/data/rali7/Tmp/solimanz/data/datasets/reduced7000/skill_embs/skill_tokens.npy"

In [31]:
with open('/data/rali7/Tmp/solimanz/data/datasets/reduced7000/skill_embs/data.json', 'w') as f:
    json.dump(data7k, f)

# Add Skills to Multilabel Datasets

In [32]:
with open('/data/rali7/Tmp/solimanz/data/datasets/multilabel/top550/data.json', 'r') as f:
    ml_550 = json.load(f)
with open('/data/rali7/Tmp/solimanz/data/datasets/multilabel/reduced7k/data.json', 'r') as f:
    ml_7k = json.load(f)

In [33]:
with open('/data/rali7/Tmp/solimanz/data/datasets/top550/skill_embs/data.json', 'r') as f:
    data550 = json.load(f)
with open('/data/rali7/Tmp/solimanz/data/datasets/reduced7000/skill_embs/data.json', 'r') as f:
    data7k = json.load(f)    

In [34]:
def add_skills(ml_data, data):
    
    id_skills = {d[0]: d[2] for d in data}
    for dat in ml_data:
        dat.append(id_skills[dat[0]])

In [35]:
add_skills(ml_550['train_data'], data550['train_data'])
add_skills(ml_550['test_data'], data550['test_data'])

[['55ef65cc0b0451c87b93a1af',
  [8, 8],
  [6339, 12450, 10165, 13054, 6009, 11067, 17036, 16450, 14478, 13409]],
 ['551d0e6f0b04514d5d8b4cae',
  [16, 16, 16, 498, 16],
  [6339, 12450, 10165, 14556, 9465, 8643, 12139, 5997, 519, 5010]],
 ['55e846350b04517f1d8f0bbe',
  [527, 527, 527],
  [6339, 4720, 17520, 18920, 11767, 739, 2078, 831, 5010, 5974]],
 ['55df1ab40b0451dc5c8b806f',
  [61, 89],
  [6339, 1672, 4720, 14556, 13054, 14066, 8406, 16450, 15411, 1862]],
 ['53a93f690b0451b10c8b4bbc',
  [18, 18],
  [20209, 20209, 20209, 20209, 20209, 20209, 20209, 20209, 20209, 20209]],
 ['55e6ac3a0b04514a0b8b6cc4',
  [5, 12],
  [17036, 3349, 5529, 7654, 13661, 4382, 15060, 2682, 597, 11045]],
 ['55ef49690b0451c87b927eff',
  [5, 26],
  [7906, 3258, 17036, 8418, 14613, 17593, 11056, 5664, 7841, 10575]],
 ['55e9c1170b0451ab2d8d1e51',
  [60, 508],
  [6339, 12450, 10165, 11067, 13661, 15208, 12300, 3440, 13953, 20209]],
 ['5331462d0b045163418b464f',
  [0, 545],
  [20209, 20209, 20209, 20209, 20209, 2020

In [36]:
add_skills(ml_7k['train_data'], data7k['train_data'])
add_skills(ml_7k['test_data'], data7k['test_data'])

[['55e893d80b04517f1d93aac1',
  [40, 181, 181],
  [33536, 31043, 40739, 52095, 41909, 28467, 46984, 8709, 38698, 25663]],
 ['55ea14990b0451ab2d923099',
  [1460, 1460],
  [50626, 4003, 3886, 25338, 22897, 17378, 22348, 46921, 21936, 51332]],
 ['579618469d15a96a358c3dd7',
  [77, 329],
  [45131, 37780, 35967, 19537, 4070, 51283, 47404, 40251, 27106, 8518]],
 ['55f045150b0451c87b9b8716',
  [4558, 185],
  [33536, 7313, 15313, 19331, 52379, 52379, 52379, 52379, 52379, 52379]],
 ['55ebc8de0b0451ab2da9654b',
  [4234, 3092, 258],
  [33536, 9056, 23392, 49589, 4395, 34095, 38698, 5206, 25663, 13249]],
 ['55e737eb0b04514a0b93c7a8',
  [42, 10, 10, 10, 1435, 1435],
  [31043, 40739, 13171, 25079, 11728, 4519, 32474, 45636, 4471, 16626]],
 ['55eb3b520b0451ab2da1c493',
  [1, 1],
  [33536, 46191, 42184, 651, 14414, 24253, 45220, 5184, 29588, 19306]],
 ['55eeeecc0b0451c87b8e8efa',
  [13, 314, 314, 2361],
  [21156, 52095, 45131, 5674, 36756, 50700, 20145, 46072, 32122, 43955]],
 ['55f158120b0451c87ba502d

In [157]:
all_dat7k = data7k['train_data'] + data7k['test_data']
all_ml7k = ml_7k['train_data'] + ml_7k['test_data']

In [47]:
data550.keys()

dict_keys(['title_to_id', 'train_data', 'test_data', 'maximum_seq_len', 'skill_token_id', 'max_skills_num', 'skills_embs'])

In [49]:
ml_550['skill_token_id'] = data550['skill_token_id']
ml_550['max_skills_num'] = data550['max_skills_num']
ml_550['skills_embs'] = data550['skills_embs']

In [50]:
ml_7k['skill_token_id'] = data7k['skill_token_id']
ml_7k['max_skills_num'] = data7k['max_skills_num']
ml_7k['skills_embs'] = data7k['skills_embs']

In [51]:
with open('/data/rali7/Tmp/solimanz/data/datasets/multilabel/top550/data2.json', 'w') as f:
    json.dump(ml_550, f)
with open('/data/rali7/Tmp/solimanz/data/datasets/multilabel/reduced7k/data2.json', 'w') as f:
    json.dump(ml_7k, f)

In [40]:
len(ml_550['train_data'][0][2])

10

In [41]:
with open('/data/rali7/Tmp/solimanz/data/datasets/multilabel/top550/data2.json', 'r') as f:
    a = json.load(f)
with open('/data/rali7/Tmp/solimanz/data/datasets/multilabel/reduced7k/data2.json', 'r') as f:
    b = json.load(f)

In [42]:
a['train_data'][0]

['55ebe1fa0b0451ab2da9f137',
 [420, 420],
 [20209, 20209, 20209, 20209, 20209, 20209, 20209, 20209, 20209, 20209]]

In [46]:
len(a['label_id'])

326

In [44]:
b.keys()

dict_keys(['title_to_id', 'label_id', 'train_data', 'train_targets', 'test_data', 'test_targets', 'maximum_seq_len', 'n_labels'])