In [1]:
import spacy
from spacy.training import Example      #imported in scenarios where the NER model is to be fed preprocessed data
# nlp = spacy.load("en_core_web_sm")
import random

In [6]:
test = """European authorities fined Google a record $5.1 billion on Wednesday for
 abusing its power in the mobile phone market and ordered the company to alter its practices."""

In [16]:
doc = nlp(test)

print([[X.text , X.pos_] for X in doc])

[['European', 'ADJ'], ['authorities', 'NOUN'], ['fined', 'VERB'], ['Google', 'PROPN'], ['a', 'DET'], ['record', 'NOUN'], ['$', 'SYM'], ['5.1', 'NUM'], ['billion', 'NUM'], ['on', 'ADP'], ['Wednesday', 'PROPN'], ['for', 'ADP'], ['\n ', 'SPACE'], ['abusing', 'VERB'], ['its', 'PRON'], ['power', 'NOUN'], ['in', 'ADP'], ['the', 'DET'], ['mobile', 'ADJ'], ['phone', 'NOUN'], ['market', 'NOUN'], ['and', 'CCONJ'], ['ordered', 'VERB'], ['the', 'DET'], ['company', 'NOUN'], ['to', 'PART'], ['alter', 'VERB'], ['its', 'PRON'], ['practices', 'NOUN'], ['.', 'PUNCT']]


In [2]:
import json     #json file utilized
resume_ds = []
with open('datasets/resume.json', encoding = 'utf8') as data:
    # resume_ds = pd.read_json(data, lines = True) #Works for typical json files, however this file type comprises of json lines without separators
    for line in data:
        resume_ds.append(dict(json.loads(line)))        #Read each line on its own & load to memory. Number of lines can also be curated.

In [30]:
resume_ds[0]            #verify on data

{'content': 'Govardhana K\nSenior Software Engineer\n\nBengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/\nb2de315d95905b68\n\nTotal IT experience 5 Years 6 Months\nCloud Lending Solutions INC 4 Month • Salesforce Developer\nOracle 5 Years 2 Month • Core Java Developer\nLanguages Core Java, Go Lang\nOracle PL-SQL programming,\nSales Force Developer with APEX.\n\nDesignations & Promotions\n\nWilling to relocate: Anywhere\n\nWORK EXPERIENCE\n\nSenior Software Engineer\n\nCloud Lending Solutions -  Bangalore, Karnataka -\n\nJanuary 2018 to Present\n\nPresent\n\nSenior Consultant\n\nOracle -  Bangalore, Karnataka -\n\nNovember 2016 to December 2017\n\nStaff Consultant\n\nOracle -  Bangalore, Karnataka -\n\nJanuary 2014 to October 2016\n\nAssociate Consultant\n\nOracle -  Bangalore, Karnataka -\n\nNovember 2012 to December 2013\n\nEDUCATION\n\nB.E in Computer Science Engineering\n\nAdithya Institute of Technology -  Tamil Nadu\n\nSeptember 2008 to June 2012\n\n

In [217]:
nlp = spacy.blank('en') #create a blank entity model


# @Language.component('ner')            #decorator in cases a new component is to be created
def train_model(train_ds):
    if 'ner' not in nlp.pipe_names:     
        ner = nlp.add_pipe('ner')       #adding the ner component to the pipeline

    for item in train_ds[0:2]:                  
        for entities in item['annotation']:     
            for entity in entities['label']:    #loop through the dataset to select the entity labels to be used by the component
                ner.add_label(entity[0])
                

        optimizer = nlp.begin_training()            #optimizer declaration
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
        with nlp.disable_pipes(*other_pipes):       #disable all other components in the pipeline
            for itn in range(2):                    #Number of iterations for training the model
                print('Starting iteration')
                random.shuffle(train_ds)          #Randomize the data in the set?
                losses = {}
                index = 0
                for item in train_ds[0:1]:
                    start_end_points = [[x['start'], x['end']] for x in (y['points'][0] for y in item['annotation'])] #start & end points for the entities
                    point_labels =  [y['label'][0] for y in item['annotation']]     #Entity labels
                    annotation =  []                                                #Variable to hold entity start, end & labels
                text = item['content']              
                    doc = nlp(item['content'])      #Resume content converted to nlp doc against which the model is to train
                    i = 0
                    for x in start_end_points:      #Combine the entity start, end & labels lists
                        x.append(point_labels[i])
                        i+=1
                        annotation.append(tuple(x))
                    annotation = check_annotation(annotation)       #Verify on overlapping entities & return annotation list
                    example = Example.from_dict( doc, {'entities' : annotation} )
                    try:
                        nlp.update(
                            [example],
                            drop = 0.2,
                            sgd = optimizer,
                            losses = losses
                        )
                    except Exception as e:
                        pass
                    print(losses)
                    # print(content)
                    # print(annotation)
            

In [218]:
def check_annotation(annotation):
    rmv_idx = []                    list to hold annotations to be removed
    for item in annotation:
        for comp_item in annotation:    #iterative loops to compare values against each other
            if comp_item != item:       #skip similar tuples
                if item[0] <= comp_item[0] <= item[1]:      #check if item has start value between start & end values of other item
                    print('Comparative Item: \t{} \n Item: \t\t\t{}' .format(comp_item, item))
                        rmv_idx.append(annotation.index(comp_item))
                if item[0] <= comp_item[1] <= item[1]:      #check if item has end value between start & end values of other item
                    print('Comparative Item: \t{} \n Item: \t\t\t{}' .format(comp_item, item) )
                    rmv_idx.append(annotation.index(comp_item))
                    
    for x in sorted(list(set(rmv_idx)),  reverse = True):              #iterate through the list of reverse ordered & unique values of rmv_idx  
        annotation.pop(x)                                               # pop
    return annotation
        
            

In [219]:
train_model(resume_ds)
# print(nlp.pipe_names)

Starting iteration
{'ner': 59.76470053754747}
Starting iteration
{'ner': 194.71724307350814}


Kolkata, West Bengal - Email me on In..." with entities "[(319, 337, 'Skills'), (289, 295, 'Location'), (28...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Microsoft Office Suite - Exper..." with entities "[(785, 955, 'Skills'), (643, 673, 'Designation'), ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


Starting iteration


Principal Consultant at Oracle

Bengalu..." with entities "[(3345, 3895, 'Skills'), (3276, 3312, 'College Nam...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


{'ner': 565.7895410060883}
Starting iteration


"Store Executive" - Orange City Ho..." with entities "[(6994, 7349, 'Skills'), (6936, 6972, 'College Nam...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


{'ner': 1423.0436115264893}


In [71]:
check_annotation(resume_ds)

Comparative Item: 	(1749, 1754, 'Companies worked at') 
 Item: 			(1356, 1792, 'Skills')
Comparative Item: 	(1417, 1422, 'Companies worked at') 
 Item: 			(1356, 1792, 'Skills')
[(1696, 1701, 'Companies worked at'), (1356, 1792, 'Skills'), (1209, 1214, 'Companies worked at'), (1136, 1247, 'Skills'), (928, 931, 'Graduation Year'), (858, 888, 'College Name'), (821, 855, 'Degree'), (787, 790, 'Graduation Year'), (744, 749, 'Companies worked at'), (722, 741, 'Designation'), (658, 663, 'Companies worked at'), (640, 655, 'Designation'), (574, 579, 'Companies worked at'), (555, 572, 'Designation'), (470, 492, 'Companies worked at'), (444, 468, 'Designation'), (308, 313, 'Companies worked at'), (234, 239, 'Companies worked at'), (175, 197, 'Companies worked at'), (93, 136, 'Email Address'), (39, 47, 'Location'), (13, 37, 'Designation'), (0, 11, 'Name')]


In [322]:
[x['annotation'] for x in resume_ds]

[[{'label': ['Skills'],
   'points': [{'start': 2348,
     'end': 3130,
     'text': 'CRM (3 years), DATABASE (3 years), ORACLE (3 years), Tosca (3 years), Automation Testing (3\nyears), Selenium (1 year), Core Java (1 year)\n\nADDITIONAL INFORMATION\n\nKey Skills:\n❖ Software tools: IBM Rational Collaborative Lifecycle Management\n❖ Testing Tool: IBM Rational Quality Management on Jazz Server\n❖ Test Automation Tools: TOSCA, Selenium\n❖ Programming Language: Core Java\n❖ IDE: Eclipse\n❖ Database: Oracle, EDB, Sqlserver\n❖ Database Tools: SQL Developer, Toad, Tora\n❖ Software tools: Filezilla, MobaXterm, Putty, Office tools\n❖ Platforms: Windows, UNIX\n❖ Domain Software Knowledge: Finacle Core Banking Solution, Finacle CRM Solution.\n\nSkills: Fast learner, leadership quality, team player, presentation skills, work devotee, punctual,\ngood communication and listening skills.'}]},
  {'label': ['Graduation Year'],
   'points': [{'start': 2324, 'end': 2327, 'text': '2005'}]},
  {'label': 

# Learnt Concepts

In [None]:
#print([x['points'] for x in item['annotation']])
# print([(start, end) for [start,end] in x['start','end'] for x in y['points'] for y in item['annotation']])
# print([((x['start'], x['end'] ) for x in (y['points'][0] for y in item['annotation']), [y['label'][0] for y in item['annotation']])])
# print([y['label'][0] for y in item['annotation']])