In [47]:
import ast
from collections import defaultdict
from itertools import chain

import numpy as np
import pandas as pd

In [49]:
# Load the tokenized job-title table (semicolon-separated) and report its shape.
df_title = pd.read_csv(
    '../data/cb12/processed/df_title_MinorGroup200_tokenized.csv',
    sep=';',
)
print(df_title.shape)

# Distinct-value counts for the three columns the rest of the notebook keys on.
for label, column in (
    ("Number of unique JobTitles (raw): ", "JobTitle"),
    ("Number of unique tokenized JobTitles: ", "JobTitle_token"),
    ("Number of unique Users: ", "UserId"),
):
    print(label, len(df_title[column].unique()))

(18088, 8)
Number of unique JobTitles (raw):  9219
Number of unique tokenized JobTitles:  9216
Number of unique Users:  4907


In [14]:
# Each JobTitle_token_Id cell is text that is parsed back into an iterable of
# token ids; the vocabulary is the set of all ids seen across every title.
# ast.literal_eval accepts Python literals only, so - unlike eval() - a
# malicious or corrupted CSV cell cannot execute arbitrary code.
# NOTE(review): assumes every cell is a plain literal such as "[3, 17, 42]" -
# confirm against the preprocessing step that wrote this column.
vocab_lemma_filtered = set(
    chain.from_iterable(
        ast.literal_eval(token_ids) for token_ids in df_title['JobTitle_token_Id']
    )
)
print('Vocab size: {}'.format(len(vocab_lemma_filtered)))

Vocab size: 1682


# Step 1: Create files

### titles.csv

In [50]:
# Keep one row per distinct tokenized title: the first row in which the title
# appears. drop_duplicates replaces the previous per-row
# `token not in JobTitle_tokens` list scan, which was quadratic in the number
# of distinct titles.
titles_first_seen = df_title.drop_duplicates(subset="JobTitle_token", keep="first")

# Parallel lists consumed by the following cells. Entry k of every list
# describes the title whose id is k (ids follow order of first appearance).
JobTitle_tokens = titles_first_seen["JobTitle_token"].tolist()
JobTitle_tokens_idx = titles_first_seen["JobTitle_token_Id"].tolist()
MajorGroups = titles_first_seen["MajorGroup"].tolist()
MinorGroups = titles_first_seen["MinorGroup"].tolist()
DetailedOccupations = titles_first_seen["DetailedOccupation"].tolist()
Ids = list(range(len(JobTitle_tokens)))

# Next unused id, i.e. the value the original running counter ended on.
Id = len(Ids)

In [52]:
# Assemble the per-title table from the parallel lists built above.
title_columns = {
    "Id": Ids,
    "MajorGroup": MajorGroups,
    "MinorGroup": MinorGroups,
    "DetailedOccupation": DetailedOccupations,
    "JobTitle_tokens": JobTitle_tokens,
    "JobTitle_tokens_idx": JobTitle_tokens_idx,
}
titles = pd.DataFrame(title_columns)
print(titles.shape)

# Tab-separated export; index=False is not passed, so the frame's index is
# written as the leading column.
titles.to_csv('../data/cb12/graph/titles.csv', sep='\t')

(9216, 6)


In [53]:
# Lookup from tokenized title text to its integer title id.
dict_JobTitle_tokens2Id = {
    token_text: title_id for token_text, title_id in zip(JobTitle_tokens, Ids)
}

### title_title_transition.csv
* transition
* add self-loop

In [54]:
# Edge weights keyed by (src title id, dst title id).
Weight = defaultdict(int)

# For every user, each pair of consecutive titles (in row order) contributes
# one count to the edge from the earlier title to the later one.
for user_id, user_rows in df_title.groupby('UserId'):
    user_titles = user_rows['JobTitle_token'].tolist()
    for current_title, next_title in zip(user_titles, user_titles[1:]):
        src = dict_JobTitle_tokens2Id[current_title]
        dst = dict_JobTitle_tokens2Id[next_title]
        Weight[(src, dst)] += 1

# Summary before self-loops: edge count, total weight, share of edges that
# already are self-loops.
observed_self_loops = sum(1 for (a, b) in Weight if a == b)
print(len(Weight))
print(np.sum(list(Weight.values())))
print(observed_self_loops / len(Weight))

# Give every title a self-loop of weight 1 unless it already has one.
for job in titles["Id"]:
    Weight.setdefault((job, job), 1)

print(len(Weight))
print(np.sum(list(Weight.values())))

# Split the edge keys into parallel source / destination lists.
Src = [source for source, _ in Weight]
Dst = [target for _, target in Weight]

11996
13181
0.04768256085361787
20640
21825


In [55]:
# Edge list, one row per (Src, Dst) pair. Weight.values() iterates in the same
# order as the keys that produced Src and Dst.
edge_columns = {"Src": Src, "Dst": Dst, "Weight": list(Weight.values())}
title_title_transition = pd.DataFrame(edge_columns)
title_title_transition.to_csv(
    '../data/cb12/graph/title_title_transition_MinorGroup200.csv',
    sep='\t',
    index=False,
)

### title_title_transition_enhanced.csv
* if two titles have the same tag, there is a bi-directional edge between them
* add self-loop

In [56]:
# Read the tag list, one tag per line, dropping the trailing newline.
# The with-block guarantees the handle is closed even if reading raises,
# which the previous manual open()/close() pair did not.
with open('../data/cb12/processed/Tags_MinorGroup200_200.txt', 'r') as f_in:
    tags = [line.strip('\n') for line in f_in.readlines()]

In [57]:
# Edge weights keyed by (src title id, dst title id).
Weight = defaultdict(int)

# 1) Observed transitions: for every user, each pair of consecutive titles
#    (in row order) adds one count to the edge earlier -> later.
for user_id, user_rows in df_title.groupby('UserId'):
    user_titles = user_rows['JobTitle_token'].tolist()
    for current_title, next_title in zip(user_titles, user_titles[1:]):
        src = dict_JobTitle_tokens2Id[current_title]
        dst = dict_JobTitle_tokens2Id[next_title]
        Weight[(src, dst)] += 1

print(len(Weight))
print(np.sum(list(Weight.values())))
print(len([v for k, v in Weight.items() if k[0] == k[1]]) / len(Weight))

# 2) Tag edges: for every tag, every ordered pair of titles containing that tag
#    as a whitespace-separated token gets +1. The pair (t, t) is included, so
#    a tagged title also gains a self-loop count per tag.
list_JobTitle_tokens = dict_JobTitle_tokens2Id.keys()

# Tokenize each title once up front instead of once per (tag, title) pair;
# the split does not depend on the tag, so hoisting it changes no result.
title_id_and_tokens = [
    (dict_JobTitle_tokens2Id[title], set(title.split()))
    for title in list_JobTitle_tokens
]
for tag in tags:
    ids_with_tag = [title_id for title_id, tokens in title_id_and_tokens if tag in tokens]
    for src in ids_with_tag:
        for dst in ids_with_tag:
            Weight[(src, dst)] += 1

print(len(Weight))
print(np.sum(list(Weight.values())))

# 3) Self-loops: any title still lacking a (job, job) edge gets one of weight 1.
for job in titles["Id"]:
    if (job, job) not in Weight:
        Weight[(job, job)] += 1

print(len(Weight))
print(np.sum(list(Weight.values())))

# Split the edge keys into parallel source / destination lists.
Src = [edge[0] for edge in Weight.keys()]
Dst = [edge[1] for edge in Weight.keys()]

11996
13181
0.04768256085361787
6477442
7133880
6477819
7134257


In [58]:
# Edge list for the tag-enhanced graph, one row per (Src, Dst) pair.
# Weight.values() iterates in the same order as the keys behind Src and Dst.
enhanced_edge_columns = {"Src": Src, "Dst": Dst, "Weight": list(Weight.values())}
title_title_transition = pd.DataFrame(enhanced_edge_columns)
title_title_transition.to_csv(
    '../data/cb12/graph/title_title_transition_MinorGroup200_enhanced.csv',
    sep='\t',
    index=False,
)

### id_Tag

In [59]:
# Assign each tag its position in `tags` as id, and write one
# "<id>\tTag_<tag>" line per tag. If a tag occurs more than once in `tags`,
# the dictionary keeps the id of its last occurrence.
dict_Tag2id = {}

# The with-block closes the file on exit, so no explicit close() is needed.
with open('../data/cb12/graph/id_tag.txt', 'w') as f:
    for idx, tag in enumerate(tags):
        f.write(str(idx) + '\t' + 'Tag_' + tag + '\n')
        dict_Tag2id[tag] = idx

### id_title

In [60]:
# Assign ids to the distinct tokenized titles in order of first appearance and
# write one "<id>\tT_<title>" line each, with the title's tokens joined by "_".
dict_title2id = {}

# The with-block closes the file on exit, so no explicit close() is needed.
with open('../data/cb12/graph/id_title.txt', 'w') as f:
    # The enumerate index is the id; the separate counter that mirrored it
    # one-for-one has been dropped.
    for idx, title in enumerate(df_title.JobTitle_token.unique()):
        title = '_'.join(title.split())
        f.write(str(idx) + '\t' + 'T_' + title + '\n')
        dict_title2id[title] = idx
print('Number of title: {}'.format(len(dict_title2id)))

Number of title: 9216


### title_feature

In [61]:
# Write one "<title id>\t<JobTitle_token_Id text>" line per distinct title,
# taking the value from the first row in which the title occurs.
dict_title2feature = {}

# The with-block closes the file on exit, so no explicit close() is needed.
with open('../data/cb12/graph/title_feature.txt', 'w') as f:
    # Iterate the two needed columns directly rather than via iterrows(),
    # which builds a full Series for every row.
    for raw_title, feature in zip(df_title['JobTitle_token'], df_title['JobTitle_token_Id']):
        title = '_'.join(raw_title.split())
        if title in dict_title2feature:
            continue  # already written from an earlier row
        f.write(str(dict_title2id[title]) + '\t' + feature + '\n')
        dict_title2feature[title] = feature

### title_label

In [62]:
# Write one "<title id>\t<MinorGroup>\t<MajorGroup>" line per distinct title,
# taking the labels from the first row in which the title occurs.
# dict_title2label records the MinorGroup only.
dict_title2label = {}

# The with-block closes the file on exit, so no explicit close() is needed.
with open('../data/cb12/graph/title_label.txt', 'w') as f:
    # Iterate the three needed columns directly rather than via iterrows(),
    # which builds a full Series for every row.
    label_columns = zip(df_title['JobTitle_token'], df_title['MinorGroup'], df_title['MajorGroup'])
    for raw_title, MinorGroup, MajorGroup in label_columns:
        title = '_'.join(raw_title.split())
        if title in dict_title2label:
            continue  # already written from an earlier row
        f.write(str(dict_title2id[title]) + '\t' + str(MinorGroup) + '\t' + str(MajorGroup) + '\n')
        dict_title2label[title] = MinorGroup


### title_Tag

In [30]:
def find_tag(title, tags):
    """Return "<title id>\\t<tag id>" strings for every tag contained in `title`.

    A tag matches when it equals one of the whitespace-separated tokens of
    `title`. Ids are read from the notebook-level dictionaries
    `dict_title2id` (keyed by the title's tokens joined with "_") and
    `dict_Tag2id` (keyed by tag).

    Parameters
    ----------
    title : str
        Tokenized job title, tokens separated by whitespace.
    tags : iterable of str
        Candidate tags, checked in the given order.

    Returns
    -------
    list of str
        One entry per matching tag, in the order of `tags`; empty when no
        tag matches.

    Raises
    ------
    KeyError
        If a tag matches but the title key or the tag is missing from the
        corresponding id dictionary.
    """
    # Tokenize and build the lookup key once; neither depends on the tag.
    tokens = title.split()
    title_key = '_'.join(tokens)

    result = []
    for tag in tags:
        if tag in tokens:
            # Look the ids up only on a match, so a title without any
            # matching tag never needs an entry in dict_title2id.
            result.append(str(dict_title2id[title_key]) + '\t' + str(dict_Tag2id[tag]))
    return result

In [31]:
# One list of "<title id>\t<tag id>" strings per distinct tokenized title,
# in order of first appearance (the list is empty for titles without tags).
title_tag = [find_tag(title, tags) for title in df_title.JobTitle_token.unique()]

In [32]:
# Write every (title id, tag id) pair on its own tab-separated line.
# The with-block closes the file on exit, so no explicit close() is needed.
with open('../data/cb12/graph/title_tag.txt', 'w') as f:
    # title_tag is a list of per-title lists; flatten it while iterating.
    for item in chain.from_iterable(title_tag):
        # Each item must split into exactly two whitespace-separated fields;
        # anything else raises ValueError here rather than writing a bad line.
        title_id, tag_id = item.split()
        f.write(title_id + '\t' + tag_id + '\n')

In [28]:
#title_tag = list(chain(chain.from_iterable(title_tag)))