In [9]:
# load and read JSON-file
import os
dataset_path = "Exercise 2 - Named-Entity-Recognition in Resumes - Data Set.json"

# Read the whole file into memory as a list of raw lines (one string per line,
# newline included); parsing happens in a later step.
with open(dataset_path, encoding="utf8") as dataset_file:
    lines = list(dataset_file)

In [10]:
# converting JSON encoded data into Python objects using json.loads()
import json
# Each line of the input is decoded as its own JSON document.
all_resumes = [json.loads(line) for line in lines]

In [11]:
# data conversion method that creates a list of tuples(resumes) with the original 
# resume text as String and the entities as Python dictionary
def convert_data(data):
    """Turn one parsed resume record into a ``(text, {"entities": [...]})`` tuple.

    ``data['content']`` is used as the text. For every item in
    ``data['annotation']`` (skipped entirely when it is ``None``) only the
    first entry of ``points`` is read, and one ``(start, end + 1, label)``
    tuple is produced per label. A label that is not a list is treated as a
    single label.
    """
    text = data['content']
    spans = []
    annotations = data['annotation']
    for annotation in (annotations if annotations is not None else []):
        first_point = annotation['points'][0]
        raw_labels = annotation['label']
        label_list = raw_labels if isinstance(raw_labels, list) else [raw_labels]
        # The stored 'end' is shifted by one to form the end offset of the tuple.
        spans.extend(
            (first_point['start'], first_point['end'] + 1, label)
            for label in label_list
        )
    return (text, {"entities": spans})
   
# Convert every parsed record into the (text, {"entities": [...]}) tuple form
# and report how many records were converted.
converted_resumes = [convert_data(res) for res in all_resumes]
print("Len of coverted resumes: ", len(converted_resumes))

Len of coverted resumes:  701


In [12]:
# filter out resumes without annotations
# Keep only resumes whose entity list is present and non-empty
# (a truthiness check covers both the None case and the empty-list case).
new_resumes = [
    resume for resume in converted_resumes
    if resume[1]["entities"]
]

converted_resumes = new_resumes
print(len(converted_resumes))

690


In [13]:
# removing all duplicates:
# In case of an unequal number of entities within the duplicates, the resume with 
# the higher number of entities is selected
def split_for_name(resume_text):
    """Return the part of ``resume_text`` that precedes its first newline.

    If the text contains no newline, the whole text is returned; if it starts
    with a newline, the result is the empty string.
    """
    first_line, _newline, _remainder = resume_text.partition('\n')
    return first_line

# Keep exactly one resume per name (the first line of the resume text).
# Among resumes sharing a name, the one with the most entities is kept;
# on a tie the earliest one in converted_resumes is kept.
best_index_by_name = {}
for index, resume in enumerate(converted_resumes):
    resume_name = split_for_name(resume[0])
    kept_index = best_index_by_name.get(resume_name)
    if kept_index is None:
        best_index_by_name[resume_name] = index
        continue
    # Compare this candidate's entity count with that of the resume currently
    # kept for the same name; only a strictly larger count replaces it.
    candidate_count = len(resume[1]['entities'])
    kept_count = len(converted_resumes[kept_index][1]['entities'])
    if candidate_count > kept_count:
        best_index_by_name[resume_name] = index

# Rebuild the list in its original order. Selecting by index (rather than
# list.remove) avoids equality-based removal and cannot fail on an element
# that is no longer present.
updated_converted_resumes = [
    converted_resumes[index] for index in sorted(best_index_by_name.values())
]

print(len(updated_converted_resumes))

521


In [14]:
# Continue with the de-duplicated resumes (same list object, new name).
resumes = updated_converted_resumes

# selection of three labels for our NER-task
chosen_entity_labels = ['Degree', 'Designation', 'Companies worked at']

def gather_candidates(dataset,entity_labels):
    """Return the resumes that contain every label in ``entity_labels``.

    Parameters:
        dataset: iterable of ``(text, {"entities": [(start, end, label), ...]})``
            tuples.
        entity_labels: labels that must all occur at least once in a resume.

    Returns:
        A list of the matching resumes, in their original order. A resume
        with an empty entity list is skipped unless ``entity_labels`` is
        itself empty.
    """
    required_labels = set(entity_labels)
    candidates = []
    for resume in dataset:
        # Collect labels directly from the third tuple field; this also works
        # when the entity list is empty (no labels present).
        present_labels = {entity[2] for entity in resume[1]["entities"]}
        if required_labels.issubset(present_labels):
            candidates.append(resume)
    return candidates

# Keep only resumes that contain all of the chosen labels and report the count.
training_data = gather_candidates(resumes, chosen_entity_labels)
print("Gathered {} training examples".format(len(training_data)))

Gathered 410 training examples


In [15]:
# removing all except the chosen entities from the resumes
def filter_ents(ents, filter):
    """Return the entities from ``ents`` whose label (index 2) is in ``filter``.

    The relative order of the kept entities is unchanged and ``ents`` itself
    is not modified.
    """
    kept = []
    for entity in ents:
        if entity[2] in filter:
            kept.append(entity)
    return kept
 
# Restrict every training example to the chosen labels. The annotation dict of
# each example is updated in place, so training_data sees the same change.
X = []
for example in training_data:
    annotations = example[1]
    annotations['entities'] = filter_ents(annotations['entities'], chosen_entity_labels)
    X.append(example)

In [16]:
# importing spacy for further data cleaning methods
import spacy
from spacy_train_resume_ner import train_spacy_ner

# removing bad data
def remove_bad_data(training_data):
    """Drop training examples whose text is reported in ``baddocs``.

    Runs ``train_spacy_ner`` once in debug mode (``n_iter=1``) and keeps every
    example whose text (index 0) is not contained in the second return value.
    Prints the sizes before and after filtering.

    NOTE(review): presumably ``baddocs`` holds the texts that raised during
    processing -- confirm against ``spacy_train_resume_ner``.
    """
    # The first return value is not needed here.
    _model, baddocs = train_spacy_ner(training_data, debug=True, n_iter=1)

    filtered = []
    for example in training_data:
        if example[0] not in baddocs:
            filtered.append(example)

    print("Unfiltered training data size: ",len(training_data))
    print("Filtered training data size: ", len(filtered))
    print("Bad data size: ", len(baddocs))
    return filtered

# NOTE: X is rebound to the filtered list, so re-running this cell filters the
# already-filtered data (and repeats the one-iteration training pass).
X = remove_bad_data(X)

Created blank 'en' model
Exception thrown when processing doc:
('Nida Khan\nTech Support Executive - Teleperformance for Microsoft\n\nJaipur, Rajasthan - Email me on Indeed: indeed.com/r/Nida-Khan/6c9160696f57efd8\n\n• To be an integral part of the organization and enhance my knowledge to utilize it in a productive\nmanner for the growth of the company and the global.\n\nINDUSTRIAL TRAINING\n\n• BHEL, (HEEP) HARIDWAR\nOn CNC System&amp; PLC Programming.\n\nWORK EXPERIENCE\n\nTech Support Executive\n\nTeleperformance for Microsoft -\n\nSeptember 2017 to Present\n\nprocess.\n• 21 months of experience in ADFC as Phone Banker.\n\nEDUCATION\n\nBachelor of Technology in Electronics & communication Engg\n\nGNIT institute of Technology -  Lucknow, Uttar Pradesh\n\n2008 to 2012\n\nClass XII\n\nU.P. Board -  Bareilly, Uttar Pradesh\n\n2007\n\nClass X\n\nU.P. Board -  Bareilly, Uttar Pradesh\n\n2005\n\nSKILLS\n\nMicrosoft office, excel, cisco, c language, cbs. (4 years)\n\nhttps://www.indeed.com/

In [18]:
# train test split (75% train / 25% test)
from random import randrange
def train_test_split(X,train_percent,seed=None):
    """Randomly split ``X`` into a training list and a test list.

    Parameters:
        X: sequence of examples; it is copied, not modified.
        train_percent: fraction of ``X`` (between 0 and 1 inclusive) that goes
            into the training list. Examples are drawn until the training
            list holds at least ``train_percent * len(X)`` items.
        seed: optional seed. When given, a private random generator seeded
            with it is used, so the same inputs always give the same split.
            When ``None`` (default) the module-level ``randrange`` is used.

    Returns:
        ``(train, test)`` -- two lists that together contain every element of
        ``X`` exactly once.

    Raises:
        ValueError: if ``train_percent`` is outside ``[0, 1]``.
    """
    # Function-scope import so the optional seeding needs no extra cell import.
    from random import Random, randrange

    if not 0 <= train_percent <= 1:
        raise ValueError(
            "train_percent must be between 0 and 1, got {!r}".format(train_percent)
        )

    pick_index = randrange if seed is None else Random(seed).randrange
    train_size = train_percent * len(X)
    train = []
    test = list(X)
    # Move randomly chosen examples from the test pool into the training list.
    while len(train) < train_size:
        train.append(test.pop(pick_index(len(test))))
    return train,test

# NOTE(review): no random seed is set in this cell, so each run produces a
# different train/test partition (only the sizes are stable).
train,test = train_test_split(X, 0.75)
# Sanity check: the two parts together account for every example.
assert (len(train) + len(test)) == len(X)  
print(len(train), len(test))

307 102


In [19]:
# training of customized spacy model
# Only the first return value (the model) is kept; the second one is discarded.
custom_nlp,_= train_spacy_ner(train,n_iter=20)

Created blank 'en' model
Losses {'ner': 26532.46284542042}
Losses {'ner': 17160.815301885246}
Losses {'ner': 39413.8851031012}
Losses {'ner': 30389.75278498046}
Losses {'ner': 47006.91409990564}
Losses {'ner': 26306.692932166625}
Losses {'ner': 31602.452643590048}
Losses {'ner': 21775.57916584192}
Losses {'ner': 16674.040124418563}
Losses {'ner': 13480.656986573726}
Losses {'ner': 12846.334227504045}
Losses {'ner': 8492.267840066816}
Losses {'ner': 8122.058526799621}
Losses {'ner': 6007.523939203747}
Losses {'ner': 6813.648012131569}
Losses {'ner': 5038.8486335094085}
Losses {'ner': 4316.808869171175}
Losses {'ner': 4421.678212810908}
Losses {'ner': 4288.890919443016}
Losses {'ner': 3699.6698804531425}


In [20]:
from spacy.gold import biluo_tags_from_offsets
import pandas as pd
from IPython.display import display, HTML

# returns a pandas dataframe with tokens, prediction, and true 
#(Gold Standard) annotations of tokens
def make_bilou_df(nlp,resume):
    """Build a per-token table of predicted and gold tags for one resume.

    Parameters:
        nlp: a trained spacy model; called on the resume text.
        resume: a ``(text, {"entities": [(start, end, label), ...]})`` tuple
            from the train or test set.

    Returns:
        A pandas DataFrame with one row per token and three columns:
        ``Tokens`` (token text with all whitespace removed), ``Predicted``
        (tags derived from ``doc.ents``) and ``True`` (tags derived from the
        resume's annotated entities).
    """
    doc = nlp(resume[0])
    predicted_offsets = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    bilou_ents_predicted = biluo_tags_from_offsets(doc, predicted_offsets)
    gold_offsets = [(ent[0], ent[1], ent[2]) for ent in resume[1]["entities"]]
    bilou_ents_true = biluo_tags_from_offsets(doc, gold_offsets)

    bilou_df = pd.DataFrame()
    bilou_df["Tokens"] = [tok.text for tok in doc]
    # regex=True is passed explicitly: the pattern is a regular expression, and
    # the default of this flag differs between pandas versions.
    bilou_df["Tokens"] = bilou_df["Tokens"].str.replace(r"\s+", "", regex=True)
    bilou_df["Predicted"] = bilou_ents_predicted
    bilou_df["True"] = bilou_ents_true
    return bilou_df

In [21]:
import numpy as np

def _gold_token_frame(bilou_dfs):
    """Stack the ``Tokens``/``True`` columns of several frames as ``text``/``ner``."""
    parts = [
        pd.DataFrame({"text": df["Tokens"], "ner": df["True"]})
        for df in bilou_dfs
    ]
    if not parts:
        # pd.concat rejects an empty list; return an empty frame with the same columns.
        return pd.DataFrame(columns=["text", "ner"])
    # One concat call instead of growing a frame row-block by row-block;
    # the per-resume row index is kept as-is.
    return pd.concat(parts)


def bilou_for_flair(nlp, train, test):
    """Build token/gold-tag tables for the training and the test resumes.

    Parameters:
        nlp: trained spacy model, forwarded to ``make_bilou_df``.
        train: iterable of training resumes.
        test: iterable of test resumes.

    Returns:
        ``(training_file, test_file)`` -- two DataFrames with the columns
        ``text`` (token) and ``ner`` (gold tag), each holding the rows of all
        resumes of the respective set one after another.
    """
    print("Make bilou dfs")
    training_data_as_bilou = [make_bilou_df(nlp,res) for res in train]
    test_data_as_bilou = [make_bilou_df(nlp,res) for res in test]
    print("Done!")

    training_file = _gold_token_frame(training_data_as_bilou)
    test_file = _gold_token_frame(test_data_as_bilou)
    return training_file,test_file

# NOTE(review): `test` is rebound here from the list of test resumes to a
# DataFrame, so re-running this cell without re-running the split cell passes
# a DataFrame into bilou_for_flair.
training,test = bilou_for_flair(custom_nlp,train,test)
print(training.shape)
print(test.shape)

# Write both tables as space-separated UTF-8 text files without the index column.
with open("train_res_bilou.txt",'w+',encoding="utf-8") as f:
    training.to_csv(f,sep=" ",encoding="utf-8",index=False)
with open("test_res_bilou.txt",'w+',encoding="utf-8") as f:
    test.to_csv(f,sep=" ",encoding="utf-8",index=False)

Make bilou dfs
Done!
(236782, 2)
(71685, 2)


In [22]:
# splitting data into semantically meaningful sentences

def split_bilou_into_sentences(file_name):
    """Rewrite a token-per-line file with blank lines between sentences.

    Reads ``file_name + '.txt'`` and writes ``file_name + '_complete.txt'``,
    both as UTF-8. Every line that starts with a space or a ``.`` is treated
    as a potential sentence boundary: it is replaced by an empty line when the
    following line is upper-case (``str.isupper``) and dropped otherwise. All
    other lines, including the last line of the file, are copied unchanged.

    Parameters:
        file_name: path of the input file without the ``.txt`` extension.
    """
    # Explicit UTF-8 so the result does not depend on the platform default encoding.
    with open(file_name + '.txt', "r", encoding="utf-8") as f:
        line_list = f.readlines()

    final_list = []
    last_index = len(line_list) - 1
    for index, line in enumerate(line_list):
        if line[0] == ' ' or line[0] == '.':
            # check if it is just a line break or really a new sentence!
            # (a boundary candidate on the very last line has no successor
            # to inspect and is simply dropped)
            if index < last_index and line_list[index + 1].isupper():
                final_list.append('')
        else:
            final_list.append(line)

    # writes txt-file in BILOU-Format partitioned in sentences
    with open(file_name + '_complete.txt', 'w', encoding="utf-8") as g:
        for item in final_list:
            # An empty marker becomes a blank line; token lines already end in a newline.
            g.write(item if item else "\n")

# perform splitting for training and test data
# (reads <name>.txt and writes <name>_complete.txt for each of the two files)
split_bilou_into_sentences('train_res_bilou')
split_bilou_into_sentences('test_res_bilou')