In [4]:
import spacy
from spacy.training import Example      #imported in scenarios where the NER model is to be fed preprocessed data
# nlp = spacy.load("en_core_web_sm")
import random

In [6]:
test = """European authorities fined Google a record $5.1 billion on Wednesday for
 abusing its power in the mobile phone market and ordered the company to alter its practices."""

In [16]:
doc = nlp(test)

print([[X.text , X.pos_] for X in doc])

[['European', 'ADJ'], ['authorities', 'NOUN'], ['fined', 'VERB'], ['Google', 'PROPN'], ['a', 'DET'], ['record', 'NOUN'], ['$', 'SYM'], ['5.1', 'NUM'], ['billion', 'NUM'], ['on', 'ADP'], ['Wednesday', 'PROPN'], ['for', 'ADP'], ['\n ', 'SPACE'], ['abusing', 'VERB'], ['its', 'PRON'], ['power', 'NOUN'], ['in', 'ADP'], ['the', 'DET'], ['mobile', 'ADJ'], ['phone', 'NOUN'], ['market', 'NOUN'], ['and', 'CCONJ'], ['ordered', 'VERB'], ['the', 'DET'], ['company', 'NOUN'], ['to', 'PART'], ['alter', 'VERB'], ['its', 'PRON'], ['practices', 'NOUN'], ['.', 'PUNCT']]


In [36]:
import json     #json file utilized
resume_ds = []
with open('datasets/resume.json', encoding = 'utf8') as data:
    # resume_ds = pd.read_json(data, lines = True) #Works for typical json files, however this file type comprises of json lines without separators
    for line in data:
        resume_ds.append(dict(json.loads(line)))        #Read each line on its own & load to memory. Number of lines can also be curated.

In [157]:
resume_ds[10]            #verify on data

{'content': "Shrishti Chauhan\nHave total work experience of 2.5 years on Oracle Fusion Middleware -\nSOA, WebLogic and MFT Module.\n\nBilaspur, Chhattisgarh - Email me on Indeed: indeed.com/r/Shrishti-\nChauhan/89d7feb4b3957524\n\nSeeking to hone and enhance my technical skills in Oracle Fusion Middleware while working as\na professional in challenging and goal oriented environment.\n\nWilling to relocate to: Bengaluru, Karnataka\n\nWORK EXPERIENCE\n\nTechnical Consultant\n\nOracle -  Bengaluru, Karnataka -\n\nOctober 2015 to Present\n\n• Have total work experience of 2.5 years on Oracle Fusion Middleware - SOA, WebLogic and\nMFT Module.\n\n• Have extensively worked on Support, Testing, Cloning, Monitoring and Maintenance support\nand Enhancement for the E-Commerce Project with multi system module.\n\n• Have good understanding on End-to-End Business Process.\n\n• Experience in developing and deploying BPEL Processes using technology adapters (DB\nAdapter, File Adapter, FTP Adapter and

In [227]:
nlp = spacy.blank('en') #create a blank entity model


# @Language.component('ner')            #decorator in cases a new component is to be created
def train_model(train_ds):
    if 'ner' not in nlp.pipe_names:     
        ner = nlp.add_pipe('ner')       #adding the ner component to the pipeline

    for item in train_ds:                  
        for entities in item['annotation']:     
            for entity in entities['label']:    #loop through the dataset to select the entity labels to be used by the component
                ner.add_label(entity[0])
                

    optimizer = nlp.begin_training()            #optimizer declaration
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):       #disable all other components in the pipeline
        for itn in range(2):                    #Number of iterations for training the model
            print('Starting iteration')
            # random.shuffle(train_ds)          #Randomize the data in the set?
            losses = {}
            index = 0
            for item in train_ds[10:20]:
                # start_end_points = [[x['start'], x['end']] for x in (y['points'][0] for y in item['annotation'])] #start & end points for the entities
                # point_labels =  [y['label'] if y['label'] != [] else ['NaN'] for y in item['annotation']]     #Entity labels
                # annotation =  []                                                #Variable to hold entity start, end & labels
                annotation = list(zip([x['start'] for x in (y['points'][0] for y in item['annotation'])],   #start values
                                        [x['end'] for x in (y['points'][0] for y in item['annotation'])],  #end values
                                        [y['label'][0] if y['label'] != [] else 'NaN' for y in item['annotation']  ]))  #labels filtering for blanks
                print(annotation)
                                    #Iterative loop to join start, end & label values
                                    # i = 0
                                    # for x in start_end_points:      #Combine the entity start, end & labels lists
                                    #     x.append(point_labels[i])
                                    #     i+=1
                                    #     annotation.append(tuple(x))
                annotation = check_annotation(annotation)       #Verify on overlapping entities & return annotation list
                print(annotation)
                text = item['content']              
                doc = nlp(item['content'])      #Resume content converted to nlp doc against which the model is to train
                example = Example.from_dict( doc, {'entities' : annotation} )
                try:
                    nlp.update(
                        [example],
                        drop = 0.2,
                        sgd = optimizer,
                        losses = losses
                        )
                except Exception as e:
                    pass
                print(losses)
                # print(content)
                # print(annotation)
        

In [228]:
def check_annotation(annotation):
    rmv_idx = []                        #list to hold annotations to be removed
    annotation = list(set(annotation))  #remove duplicate entities
    # rmv_idx = [annotation.index(x) for x in annotation for y in annotation if y[0] <= x[1] <= y[1] and x != y ]
    # for i in rmv_idx:
    #     print(annotation[i])
    for item in annotation:
        for comp_item in annotation:    #iterative loops to compare values against each other
            if comp_item != item:       #skip similar tuples
                if item[0] <= comp_item[0] <= item[1]:      #check if item has start value between start & end values of other item
                    # print('Comparative Item: \t{} \n Item: \t\t\t{}' .format(comp_item, item))
                    rmv_idx.append(annotation.index(comp_item))
                if item[0] <= comp_item[1] <= item[1]:      #check if item has end value between start & end values of other item
                    # print('Comparative Item: \t{} \n Item: \t\t\t{}' .format(comp_item, item) )
                    rmv_idx.append(annotation.index(comp_item))
                    
    for x in sorted(list(set(rmv_idx)),  reverse = True):               #iterate through the list of reverse ordered & unique values of rmv_idx  
        annotation.pop(x)                                               # pop
    return annotation
            

In [229]:
train_model(resume_ds)
# print(nlp.pipe_names)

Starting iteration
[(5365, 5370, 'Companies worked at'), (5318, 5323, 'Companies worked at'), (4676, 4681, 'Companies worked at'), (4613, 4632, 'Designation'), (4407, 4412, 'Companies worked at'), (4374, 4397, 'Companies worked at'), (4374, 4379, 'Companies worked at'), (4328, 4347, 'Designation'), (4210, 4215, 'Companies worked at'), (4059, 4064, 'Companies worked at'), (3449, 3454, 'Companies worked at'), (3386, 3405, 'Designation'), (3165, 3170, 'Companies worked at'), (3132, 3155, 'Companies worked at'), (3132, 3137, 'Companies worked at'), (3087, 3106, 'Designation'), (2756, 2761, 'Companies worked at'), (2694, 2699, 'Companies worked at'), (2665, 2688, 'Companies worked at'), (2665, 2670, 'Companies worked at'), (2585, 2590, 'NaN'), (2562, 2567, 'Skills'), (2557, 2605, 'Skills'), (2543, 2546, 'Graduation Year'), (2482, 2532, 'College Name'), (2476, 2479, 'Degree'), (2407, 2430, 'Companies worked at'), (2407, 2412, 'Companies worked at'), (2394, 2402, 'Years of Experience'), (2356

Cuddapah, Andhra Pradesh - Emai..." with entities "[(528, 553, 'Degree'), (245, 253, 'Skills'), (66, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Bengaluru, Karnataka - Email me on Ind..." with entities "[(185, 188, 'Graduation Year'), (275, 278, 'Gradua...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Hyderabad, Telangana - Email me on Indeed..." with entities "[(201, 209, 'Companies worked at'), (3567, 3586, '...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Kottayam, Kerala - Email me on Indeed..." with entities "[(13, 20, 'Location'), (52, 93, 'Email Address'), ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check

{'ner': 1918.4313832856715}
[(2165, 2214, 'Skills'), (2145, 2154, 'Degree'), (1789, 1799, 'Companies worked at'), (1305, 1346, 'Email Address'), (176, 206, 'Designation'), (52, 93, 'Email Address'), (13, 20, 'Location'), (0, 11, 'Name')]
[(13, 20, 'Location'), (52, 93, 'Email Address'), (2145, 2154, 'Degree'), (1789, 1799, 'Companies worked at'), (176, 206, 'Designation'), (2165, 2214, 'Skills'), (0, 11, 'Name'), (1305, 1346, 'Email Address')]
{'ner': 2213.470371250063}
[(2833, 4399, 'Skills'), (2819, 2822, 'Graduation Year'), (2781, 2817, 'College Name'), (2766, 2778, 'Degree'), (1645, 1673, 'Designation'), (1596, 1600, 'Location'), (133, 141, 'Years of Experience'), (85, 129, 'Email Address'), (45, 49, 'Location'), (15, 43, 'Designation'), (0, 13, 'Name')]
[(2781, 2817, 'College Name'), (2819, 2822, 'Graduation Year'), (2766, 2778, 'Degree'), (133, 141, 'Years of Experience'), (1596, 1600, 'Location'), (85, 129, 'Email Address'), (0, 13, 'Name'), (45, 49, 'Location'), (15, 43, 'Desig

Software Automation Engineer

Pune,..." with entities "[(2781, 2817, 'College Name'), (2819, 2822, 'Gradu...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


{'ner': 3342.4449939765036}
[(3086, 3254, 'Skills'), (3046, 3057, 'Skills'), (2981, 3009, 'College Name'), (2975, 2978, 'Degree'), (2969, 2972, 'Graduation Year'), (2938, 2966, 'College Name'), (2932, 2935, 'Degree'), (2926, 2929, 'Graduation Year'), (2869, 2923, 'College Name'), (2846, 2867, 'Degree'), (2749, 2752, 'Graduation Year'), (2614, 2692, 'Skills'), (2425, 2429, 'Location'), (2418, 2422, 'Location'), (2385, 2400, 'Designation'), (2220, 2224, 'Location'), (837, 848, 'Skills'), (332, 343, 'Companies worked at'), (311, 326, 'Designation'), (221, 232, 'Skills'), (138, 169, 'Degree'), (81, 124, 'Email Address'), (53, 57, 'Location'), (46, 50, 'Location'), (32, 43, 'Companies worked at'), (14, 29, 'Designation'), (0, 12, 'Name')]
[(2975, 2978, 'Degree'), (2425, 2429, 'Location'), (46, 50, 'Location'), (32, 43, 'Companies worked at'), (2869, 2923, 'College Name'), (2614, 2692, 'Skills'), (14, 29, 'Designation'), (311, 326, 'Designation'), (221, 232, 'Skills'), (2938, 2966, 'College 

Systems Engineer - Infosys Ltd

Delh..." with entities "[(2975, 2978, 'Degree'), (2425, 2429, 'Location'),...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


{'ner': 3886.2472329176962}
[(1963, 2093, 'Skills'), (1818, 1925, 'Skills'), (1741, 1758, 'College Name'), (1732, 1738, 'Degree'), (1681, 1699, 'College Name'), (1605, 1648, 'College Name'), (1586, 1602, 'Degree'), (1577, 1581, 'Degree'), (1520, 1546, 'College Name'), (1503, 1517, 'Degree'), (1478, 1494, 'Degree'), (1465, 1475, 'Degree'), (35, 79, 'Email Address'), (0, 12, 'Name')]
[(1503, 1517, 'Degree'), (1520, 1546, 'College Name'), (1586, 1602, 'Degree'), (1577, 1581, 'Degree'), (1478, 1494, 'Degree'), (1963, 2093, 'Skills'), (1818, 1925, 'Skills'), (0, 12, 'Name'), (1465, 1475, 'Degree'), (35, 79, 'Email Address'), (1605, 1648, 'College Name'), (1741, 1758, 'College Name'), (1681, 1699, 'College Name'), (1732, 1738, 'Degree')]


- Email me on Indeed: indeed.com/r/M..." with entities "[(1503, 1517, 'Degree'), (1520, 1546, 'College Nam...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


{'ner': 4199.974818233401}
[(2246, 2572, 'Skills'), (1435, 1479, 'Email Address'), (875, 963, 'Skills'), (861, 864, 'Graduation Year'), (837, 855, 'College Name'), (773, 829, 'Degree'), (767, 770, 'Graduation Year'), (743, 761, 'College Name'), (714, 740, 'Degree'), (271, 279, 'Location'), (233, 251, 'Designation'), (96, 140, 'Email Address'), (53, 61, 'Location'), (38, 51, 'Companies worked at'), (16, 34, 'Designation'), (0, 14, 'Name')]
[(38, 51, 'Companies worked at'), (16, 34, 'Designation'), (2246, 2572, 'Skills'), (773, 829, 'Degree'), (861, 864, 'Graduation Year'), (233, 251, 'Designation'), (1435, 1479, 'Email Address'), (743, 761, 'College Name'), (875, 963, 'Skills'), (714, 740, 'Degree'), (53, 61, 'Location'), (0, 14, 'Name'), (767, 770, 'Graduation Year'), (96, 140, 'Email Address'), (271, 279, 'Location'), (837, 855, 'College Name')]


Data Analyst Intern - Oracle Retai..." with entities "[(38, 51, 'Companies worked at'), (16, 34, 'Design...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


{'ner': 4523.294761124998}
[(2349, 2471, 'Skills'), (2331, 2339, 'Degree'), (1811, 1847, 'Email Address'), (921, 940, 'Companies worked at'), (783, 791, 'Location'), (733, 756, 'Designation'), (96, 132, 'Email Address'), (64, 72, 'Location'), (53, 61, 'Location'), (8, 50, 'Designation'), (0, 6, 'Name')]
[(783, 791, 'Location'), (733, 756, 'Designation'), (2331, 2339, 'Degree'), (53, 61, 'Location'), (64, 72, 'Location'), (96, 132, 'Email Address'), (8, 50, 'Designation'), (921, 940, 'Companies worked at'), (1811, 1847, 'Email Address'), (2349, 2471, 'Skills'), (0, 6, 'Name')]


Arabic Language supporter (Content Analyst..." with entities "[(783, 791, 'Location'), (733, 756, 'Designation')...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


{'ner': 5040.632188264281}
Starting iteration
[(5365, 5370, 'Companies worked at'), (5318, 5323, 'Companies worked at'), (4676, 4681, 'Companies worked at'), (4613, 4632, 'Designation'), (4407, 4412, 'Companies worked at'), (4374, 4397, 'Companies worked at'), (4374, 4379, 'Companies worked at'), (4328, 4347, 'Designation'), (4210, 4215, 'Companies worked at'), (4059, 4064, 'Companies worked at'), (3449, 3454, 'Companies worked at'), (3386, 3405, 'Designation'), (3165, 3170, 'Companies worked at'), (3132, 3155, 'Companies worked at'), (3132, 3137, 'Companies worked at'), (3087, 3106, 'Designation'), (2756, 2761, 'Companies worked at'), (2694, 2699, 'Companies worked at'), (2665, 2688, 'Companies worked at'), (2665, 2670, 'Companies worked at'), (2585, 2590, 'NaN'), (2562, 2567, 'Skills'), (2557, 2605, 'Skills'), (2543, 2546, 'Graduation Year'), (2482, 2532, 'College Name'), (2476, 2479, 'Degree'), (2407, 2430, 'Companies worked at'), (2407, 2412, 'Companies worked at'), (2394, 2402, 'Y

# Learnt Concepts

In [None]:
#print([x['points'] for x in item['annotation']])
# print([(start, end) for [start,end] in x['start','end'] for x in y['points'] for y in item['annotation']])
# print([((x['start'], x['end'] ) for x in (y['points'][0] for y in item['annotation']), [y['label'][0] for y in item['annotation']])])
# print([y['label'][0] for y in item['annotation']])

In [26]:

#Utilizing list comprehension foe the check_annotation method
def check_annotation_(annotation):
    rmv_idx = []                    #list to hold annotations to be removed

    rmv_idx = [annotation.index(x) for x in annotation for y in annotation if y[0] <= x[1] <= y[1] and x != y ]
    for i in rmv_idx:
        print(annotation[i])
    # for item in annotation:
    #     for comp_item in annotation:    #iterative loops to compare values against each other
    #         if comp_item != item:       #skip similar tuples
    #             if item[0] <= comp_item[0] <= item[1]:      #check if item has start value between start & end values of other item
    #                 print('Comparative Item: \t{} \n Item: \t\t\t{}' .format(comp_item, item))
    #                 rmv_idx.append(annotation.index(comp_item))
    #             if item[0] <= comp_item[1] <= item[1]:      #check if item has end value between start & end values of other item
    #                 print('Comparative Item: \t{} \n Item: \t\t\t{}' .format(comp_item, item) )
    #                 rmv_idx.append(annotation.index(comp_item))
                    
    for x in sorted(list(set(rmv_idx)),  reverse = True):              #iterate through the list of reverse ordered & unique values of rmv_idx  
        annotation.pop(x)                                               # pop
    return annotation

for item in resume_ds[0:1]:
    start_end_points = [[x['start'], x['end']] for x in (y['points'][0] for y in item['annotation'])] #start & end points for the entities
    point_labels =  [y['label'][0] for y in item['annotation']]     #Entity labels
    annotation =  []                                                #Variable to hold entity start, end & labels
    text = item['content']              
    i = 0
    for x in start_end_points:      #Combine the entity start, end & labels lists
        x.append(point_labels[i])
        i+=1
        annotation.append(tuple(x))
    annotation = check_annotation_(annotation)
    print(annotation)
        
            

(1749, 1754, 'Companies worked at')
(1696, 1701, 'Companies worked at')
(1417, 1422, 'Companies worked at')
(1209, 1214, 'Companies worked at')
[(1356, 1792, 'Skills'), (1136, 1247, 'Skills'), (928, 931, 'Graduation Year'), (858, 888, 'College Name'), (821, 855, 'Degree'), (787, 790, 'Graduation Year'), (744, 749, 'Companies worked at'), (722, 741, 'Designation'), (658, 663, 'Companies worked at'), (640, 655, 'Designation'), (574, 579, 'Companies worked at'), (555, 572, 'Designation'), (470, 492, 'Companies worked at'), (444, 468, 'Designation'), (308, 313, 'Companies worked at'), (234, 239, 'Companies worked at'), (175, 197, 'Companies worked at'), (93, 136, 'Email Address'), (39, 47, 'Location'), (13, 37, 'Designation'), (0, 11, 'Name')]
