### <span style="color:#800000">NER using spaCy</span>

### <span style="color:#FF00FF">Import libraries</span>

In [1]:
import pandas as pd
import ast
import pickle

import spacy
from spacy import displacy
from spacy.training import Example
from spacy.util import minibatch, compounding

from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

### <span style="color:#FF00FF">Load the dataset</span>

In [2]:
# Reading data and creating the dataframe 
df = pd.read_csv("../data/jobdescriptions_train.csv")

# Checking the first 5 rows of the data
df.head()

Unnamed: 0,domain,link,title,company,experience,salary,location,description,role,industry_type,qualification,functional_area,employment_type,role_category,skills,clean_description,tags
0,information-technology-jobs,https://www.naukri.com/job-listings-head-infor...,Head - Information Technology,Bright Consultant,15 - 25 years,Not Disclosed,Delhi / NCR,: - manage all it verticals of the company li...,"Head/VP/GM-Technology(IT)/CTO,","Industrial Products, Heavy Machinery,","['UG :Any Graduate in Any Specialization', 'PG...","IT Software - Network Administration, Security,","Full Time, Permanent",Senior Management,"['Network Security', 'Information Security', '...",manage verticals company like applications inf...,"{'entities': [(323, 331, 'DEPT'), (30, 42, 'SK..."
1,information-technology-jobs,https://www.naukri.com/job-listings-urgent-ope...,Urgent Opening - Sr. Executive- IT (informatio...,Ecom Express Private Limited,4 - 9 years,"₹ 2,50,000 - 4,50,000 P.A.",Ahmedabad,"hi, greetings from ecom express pvt ltd!!! we ...","IT/Networking-Manager,","Courier, Transportation, Freight , Warehousing,",['UG :Any Graduate in Any Specialization'],"IT Software - Application Programming, Mainten...","Full Time, Permanent",Admin/Maintenance/Security/Datawarehousing,"['LAN Troubleshooting', 'Hardware Networking',...",hi greetings ecom express pvt ltd urgent openi...,"{'entities': [(394, 411, 'ROLE'), (602, 610, '..."
2,information-technology-jobs,https://www.naukri.com/job-listings-informatio...,Information Technology (IT) & Business Solutio...,Procter & Gamble,8 - 10 years,Not Disclosed,Mumbai,do you want to join our team of women and m...,"Project Manager-IT/Software,","FMCG, Foods, Beverage,","['UG :BCA in Computers, B.Tech/B.E. in Compute...","IT Software - Application Programming, Mainten...","Full Time, Permanent",Project Management,"['Application Development', 'Agile Project Man...",want join team women men professionals apply s...,"{'entities': [(2005, 2021, 'DEPT'), (1954, 197..."
3,information-technology-jobs,https://www.naukri.com/job-listings-senior-sol...,Senior Solution Manager - Information Technology,Procter & Gamble,8 - 12 years,Not Disclosed,Mumbai,"solutions manager leads product roadmap, ...","IT/Networking-Manager,","FMCG, Foods, Beverage,","['UG :Any Graduate in Any Specialization', 'PG...","IT Software - Application Programming, Mainten...","Full Time, Permanent",Admin/Maintenance/Security/Datawarehousing,"['Agile Methodology', 'Fund Management', 'Prod...",solutions manager leads product roadmap create...,"{'entities': [(893, 913, 'SKILL'), (1061, 1075..."
4,information-technology-jobs,https://www.naukri.com/job-listings-informatio...,Information Technology,Stefanini,12 - 22 years,Not Disclosed,Noida,we are looking for someone who is passionat...,"System Administrator,","Recruitment, Staffing,","['UG :Any Graduate in Any Specialization', 'PG...","IT Software - Application Programming, Mainten...","Full Time, Permanent",Admin/Maintenance/Security/Datawarehousing,"['SAN', 'Automation', 'Debugging', 'NetBackup'...",looking someone passionate emerging technologi...,"{'entities': [(3277, 3297, 'SKILL'), (2428, 24..."


### <span style="color:#FF00FF">Training spaCy NER model with Custom Entities</span>

In [3]:
TRAIN_DATA = df[['clean_description','tags']].to_numpy()

In [4]:
# Import and load the spacy model
nlp=spacy.load("en_core_web_sm") 

# Getting the ner component
ner=nlp.get_pipe('ner')

# Adding labels to the `ner`
ner.add_label('SKILL')
ner.add_label('ROLE')
ner.add_label('DEPT')

# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [5]:
def train_spacy(data,iterations):
    TRAIN_DATA = data
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        nlp.add_pipe('ner', last=True)
       

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            #random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                examples = []
                for text, annots in batch:
                    annots = ast.literal_eval(annots)
                    examples.append(Example.from_dict(nlp.make_doc(text), annots))
                    nlp.update(examples,losses=losses)
                print("Losses", losses) 
    return nlp

In [6]:
jdnlp = train_spacy(TRAIN_DATA, iterations = 20)

Statring iteration 0
Losses {'ner': 1342.587233364582}
Losses {'ner': 2743.3319696187973}
Losses {'ner': 3096.3558551976603}
Losses {'ner': 3248.51164596506}
Losses {'ner': 3485.070929487367}
Losses {'ner': 3577.0816759230947}
Losses {'ner': 3672.19098782129}
Losses {'ner': 3848.342194174955}
Losses {'ner': 4037.342335229405}
Losses {'ner': 4217.378909881765}
Losses {'ner': 4223.253225199043}
Losses {'ner': 4586.481605743429}
Losses {'ner': 4806.066182743514}
Losses {'ner': 4999.260286738764}
Losses {'ner': 5206.701077337718}
Losses {'ner': 5378.515542241307}
Losses {'ner': 5519.624816685284}
Losses {'ner': 5678.549432656981}
Losses {'ner': 5964.945157788162}
Losses {'ner': 6156.00424803289}
Losses {'ner': 6259.221911209092}
Losses {'ner': 6345.93847689692}
Losses {'ner': 6425.6806435405815}
Losses {'ner': 6530.002227845231}
Losses {'ner': 6581.830454928604}
Losses {'ner': 6651.495613972318}
Losses {'ner': 6699.395460488064}
Losses {'ner': 6733.735351793318}
Losses {'ner': 7042.6800613

Losses {'ner': 22206.325484215537}
Losses {'ner': 22206.38918000078}
Losses {'ner': 22243.784997221555}
Losses {'ner': 22371.97039530121}
Losses {'ner': 22723.369006428547}
Losses {'ner': 22845.282118808358}
Losses {'ner': 22908.702955355882}
Losses {'ner': 22983.025613905676}
Losses {'ner': 23037.91380912386}
Losses {'ner': 23327.31329493604}
Losses {'ner': 23427.833751988717}
Losses {'ner': 23509.217903332705}
Losses {'ner': 23741.39615825842}
Losses {'ner': 23804.741285641558}
Losses {'ner': 23872.600346320407}
Losses {'ner': 23979.758151142294}
Losses {'ner': 24062.726005040236}
Losses {'ner': 24129.631004898525}
Losses {'ner': 24240.236282358655}
Losses {'ner': 24263.37152079195}
Losses {'ner': 24330.549556522114}
Losses {'ner': 24621.31388380903}
Losses {'ner': 24644.097635435373}
Losses {'ner': 24778.3135109937}
Losses {'ner': 24799.853065368374}
Losses {'ner': 24984.85482697691}
Losses {'ner': 25070.749004105925}
Losses {'ner': 25279.459913534964}
Losses {'ner': 25407.846304736

Losses {'ner': 12175.294177621501}
Losses {'ner': 12183.642427898209}
Losses {'ner': 12194.38608746287}
Losses {'ner': 12261.455792745626}
Losses {'ner': 12289.762946933957}
Losses {'ner': 12317.181416179084}
Losses {'ner': 12337.111749583899}
Losses {'ner': 12360.880948768283}
Losses {'ner': 12406.524340296224}
Losses {'ner': 12480.129149877695}
Losses {'ner': 12528.046310062342}
Losses {'ner': 12603.010383316798}
Losses {'ner': 12645.457340349443}
Losses {'ner': 12718.202847287072}
Losses {'ner': 12745.728090210223}
Losses {'ner': 12823.92232329899}
Losses {'ner': 12877.24862485752}
Losses {'ner': 12927.994478640776}
Losses {'ner': 12963.38930690668}
Losses {'ner': 12982.365455294186}
Losses {'ner': 13015.88488397827}
Losses {'ner': 13099.555506476714}
Losses {'ner': 13121.161000962382}
Losses {'ner': 13176.373758159563}
Losses {'ner': 13205.84397271147}
Losses {'ner': 13263.36367336471}
Losses {'ner': 13292.102254936835}
Losses {'ner': 13314.800516248531}
Losses {'ner': 13407.105159

Losses {'ner': 8743.80310321634}
Losses {'ner': 8753.147487495547}
Losses {'ner': 8765.792435110863}
Losses {'ner': 8773.700318381443}
Losses {'ner': 8801.490113932487}
Losses {'ner': 8802.56667469233}
Losses {'ner': 8840.921455961397}
Losses {'ner': 8841.980709713012}
Losses {'ner': 8867.978470466756}
Losses {'ner': 8919.996658311611}
Losses {'ner': 8959.095049989015}
Losses {'ner': 8994.890498955205}
Losses {'ner': 8998.801388518916}
Losses {'ner': 9003.416356025653}
Losses {'ner': 9033.19398326519}
Losses {'ner': 9039.885609884068}
Losses {'ner': 9066.899025422143}
Losses {'ner': 9076.461709817879}
Losses {'ner': 9111.001058995244}
Losses {'ner': 9138.943796857626}
Losses {'ner': 9141.609777315825}
Losses {'ner': 9150.019551588799}
Losses {'ner': 9185.070625400542}
Losses {'ner': 9204.737722647007}
Losses {'ner': 9223.486365634944}
Losses {'ner': 9233.802143576359}
Losses {'ner': 9239.39855253283}
Losses {'ner': 9267.268247232998}
Losses {'ner': 9292.066938390391}
Losses {'ner': 929

Losses {'ner': 5111.669490962319}
Losses {'ner': 5118.702278620499}
Losses {'ner': 5146.126444557542}
Losses {'ner': 5183.3529844936465}
Losses {'ner': 5203.384178665903}
Losses {'ner': 5223.596673980292}
Losses {'ner': 5231.606337122792}
Losses {'ner': 5242.041051285384}
Losses {'ner': 5272.023289344936}
Losses {'ner': 5338.860314912484}
Losses {'ner': 5392.99624638411}
Losses {'ner': 5412.297228194542}
Losses {'ner': 5432.147513685288}
Losses {'ner': 5452.1277284193775}
Losses {'ner': 5455.519896026227}
Losses {'ner': 5463.314610181439}
Losses {'ner': 5480.5494453399915}
Losses {'ner': 5521.3106069881105}
Losses {'ner': 5546.781824975957}
Losses {'ner': 5588.81141646703}
Losses {'ner': 5630.414348741553}
Losses {'ner': 5676.899202238712}
Losses {'ner': 5723.743781557922}
Losses {'ner': 5834.793567405397}
Losses {'ner': 5876.818308591127}
Losses {'ner': 6005.03555111987}
Losses {'ner': 6137.588147491262}
Losses {'ner': 6171.0473223511935}
Losses {'ner': 6266.360767801994}
Losses {'ner

Losses {'ner': 2849.3659760999844}
Losses {'ner': 2891.9593609939284}
Losses {'ner': 3107.6881119695063}
Losses {'ner': 3327.253890132122}
Losses {'ner': 3389.6878188604037}
Losses {'ner': 3409.190629565735}
Losses {'ner': 3446.662266745227}
Losses {'ner': 3512.9574009296953}
Losses {'ner': 3569.421637091916}
Losses {'ner': 3605.488417218432}
Losses {'ner': 3616.3251104339297}
Losses {'ner': 3720.852369961212}
Losses {'ner': 3748.3469210373255}
Losses {'ner': 3797.6878704073183}
Losses {'ner': 3803.455160537523}
Losses {'ner': 3804.4492321503894}
Losses {'ner': 3804.4680434395686}
Losses {'ner': 3859.377248306953}
Losses {'ner': 3863.31803795101}
Losses {'ner': 3874.7826883708744}
Losses {'ner': 3886.202060479516}
Losses {'ner': 3906.26869435227}
Losses {'ner': 3931.8544037617885}
Losses {'ner': 3967.865814031047}
Losses {'ner': 3976.6254269771434}
Losses {'ner': 4028.0979871040117}
Losses {'ner': 4069.4442597050165}
Losses {'ner': 4093.8721730128104}
Losses {'ner': 4099.570855720352}


Losses {'ner': 1020.7109911361237}
Losses {'ner': 1026.545619582179}
Losses {'ner': 1026.845780467046}
Losses {'ner': 1046.2996805336666}
Losses {'ner': 1046.3045595802037}
Losses {'ner': 1046.304559581343}
Losses {'ner': 1047.696885209067}
Losses {'ner': 1072.879198513271}
Losses {'ner': 1098.9375134066388}
Losses {'ner': 1108.7782492518793}
Losses {'ner': 1187.8627488322147}
Losses {'ner': 1208.5788434358303}
Losses {'ner': 1285.529864097589}
Losses {'ner': 1427.1453041204804}
Losses {'ner': 1433.295525093328}
Losses {'ner': 1449.7026109996573}
Losses {'ner': 1453.5778653755099}
Losses {'ner': 1509.1770197063695}
Losses {'ner': 1593.3183768024535}
Losses {'ner': 1644.3974016162047}
Losses {'ner': 1647.3075609316838}
Losses {'ner': 1683.0706890976685}
Losses {'ner': 1699.718939049083}
Losses {'ner': 1700.6535575197472}
Losses {'ner': 1772.9676305974672}
Losses {'ner': 1834.6492502697315}
Losses {'ner': 1901.0409244912755}
Losses {'ner': 1974.177107975821}
Losses {'ner': 2021.400746894

Losses {'ner': 6435.695123636567}
Losses {'ner': 6456.99670188986}
Losses {'ner': 6463.506961899503}
Statring iteration 6
Losses {'ner': 47.48258463885989}
Losses {'ner': 108.94965768121689}
Losses {'ner': 152.94353516477082}
Losses {'ner': 169.97100091501437}
Losses {'ner': 183.3883814824589}
Losses {'ner': 198.70131044321477}
Losses {'ner': 205.7174472862967}
Losses {'ner': 220.6037108808361}
Losses {'ner': 253.3983211457337}
Losses {'ner': 273.1949953012851}
Losses {'ner': 273.195005061483}
Losses {'ner': 330.9409228547301}
Losses {'ner': 377.9205858462852}
Losses {'ner': 382.8213226907213}
Losses {'ner': 427.3488187806056}
Losses {'ner': 437.9225387408838}
Losses {'ner': 439.86070768261646}
Losses {'ner': 469.7886729914499}
Losses {'ner': 492.84587104761124}
Losses {'ner': 501.2044451342256}
Losses {'ner': 509.5827235523815}
Losses {'ner': 540.8686783727132}
Losses {'ner': 569.0567387117613}
Losses {'ner': 577.0441424775017}
Losses {'ner': 605.9085086795669}
Losses {'ner': 624.7655

Losses {'ner': 4662.000386249365}
Losses {'ner': 4680.771596708646}
Losses {'ner': 4680.77159851563}
Losses {'ner': 4686.993910193202}
Losses {'ner': 4759.013291606767}
Losses {'ner': 4841.211358539658}
Losses {'ner': 4853.398753830188}
Losses {'ner': 4857.286294259293}
Losses {'ner': 4883.204334911679}
Losses {'ner': 4883.316798812759}
Losses {'ner': 4937.334891603442}
Losses {'ner': 4941.362732540479}
Losses {'ner': 4953.445986638177}
Losses {'ner': 4975.690192796277}
Losses {'ner': 4978.983423710389}
Losses {'ner': 4990.055971040329}
Losses {'ner': 4991.9258671926}
Losses {'ner': 4991.979626756633}
Losses {'ner': 5008.826971517254}
Losses {'ner': 5032.260254193635}
Losses {'ner': 5035.896317973349}
Losses {'ner': 5038.268841001283}
Losses {'ner': 5058.269064296895}
Losses {'ner': 5061.817255385139}
Losses {'ner': 5074.259874632324}
Losses {'ner': 5077.247018646962}
Losses {'ner': 5102.568812378033}
Losses {'ner': 5105.680424067787}
Losses {'ner': 5144.1653103962235}
Losses {'ner': 5

Losses {'ner': 3474.474848133116}
Losses {'ner': 3474.477936973538}
Losses {'ner': 3476.3917636045003}
Losses {'ner': 3478.8367881194436}
Losses {'ner': 3492.8754071772278}
Losses {'ner': 3497.007776762626}
Losses {'ner': 3512.5252336138055}
Losses {'ner': 3522.621375855549}
Losses {'ner': 3546.215451647321}
Losses {'ner': 3573.280703812585}
Losses {'ner': 3585.221406667305}
Losses {'ner': 3585.494618237442}
Losses {'ner': 3617.2769063028936}
Losses {'ner': 3621.917224462115}
Losses {'ner': 3646.0905981673873}
Losses {'ner': 3651.3237390242857}
Losses {'ner': 3683.5411575093}
Losses {'ner': 3709.518821877063}
Losses {'ner': 3732.690957376958}
Losses {'ner': 3740.142463631432}
Losses {'ner': 3754.8439266991445}
Losses {'ner': 3757.2314000999486}
Losses {'ner': 3785.017563380981}
Losses {'ner': 3788.5153479069872}
Losses {'ner': 3819.5248351040673}
Losses {'ner': 3838.2353838278796}
Losses {'ner': 3857.0008159640006}
Losses {'ner': 3858.300474332807}
Losses {'ner': 3861.265447004026}
Los

Losses {'ner': 2952.7088002597675}
Losses {'ner': 2953.394030069296}
Losses {'ner': 2953.559288595939}
Losses {'ner': 2954.989623997138}
Losses {'ner': 2961.1108246407584}
Losses {'ner': 2962.587886247007}
Losses {'ner': 2968.6043852846065}
Losses {'ner': 2970.45154882124}
Losses {'ner': 2974.7915921962526}
Losses {'ner': 2992.0264995001744}
Losses {'ner': 2999.261929649991}
Losses {'ner': 3000.559374944698}
Losses {'ner': 3000.6849598836866}
Losses {'ner': 3002.277265827564}
Losses {'ner': 3017.7624662092676}
Losses {'ner': 3017.765520158538}
Losses {'ner': 3017.779469639471}
Losses {'ner': 3018.9228717989663}
Losses {'ner': 3023.427314987786}
Losses {'ner': 3025.0177057666133}
Losses {'ner': 3025.017927395812}
Losses {'ner': 3025.021234351357}
Losses {'ner': 3062.398866956119}
Losses {'ner': 3070.7138941623016}
Losses {'ner': 3083.479256623214}
Losses {'ner': 3083.4855695497013}
Losses {'ner': 3083.6785627843033}
Losses {'ner': 3091.6022748275846}
Losses {'ner': 3103.657826209738}
Lo

Losses {'ner': 2066.1333298999166}
Losses {'ner': 2068.5383137602657}
Losses {'ner': 2071.3310964546163}
Losses {'ner': 2081.341890110592}
Losses {'ner': 2097.091852534633}
Losses {'ner': 2110.3421471172405}
Losses {'ner': 2112.314241540347}
Losses {'ner': 2112.761402108143}
Losses {'ner': 2143.9451449916523}
Losses {'ner': 2198.721304998155}
Losses {'ner': 2232.6517253712736}
Losses {'ner': 2232.777062710625}
Losses {'ner': 2237.454094202693}
Losses {'ner': 2240.0422647792993}
Losses {'ner': 2240.042437524901}
Losses {'ner': 2241.3151579893356}
Losses {'ner': 2252.032852059397}
Losses {'ner': 2272.0831665927703}
Losses {'ner': 2284.346020284797}
Losses {'ner': 2300.8018931386887}
Losses {'ner': 2305.91831177956}
Losses {'ner': 2309.8750890447404}
Losses {'ner': 2328.9050974786674}
Losses {'ner': 2365.637078964564}
Losses {'ner': 2384.1232601825495}
Losses {'ner': 2422.357646219693}
Losses {'ner': 2475.9634909916804}
Losses {'ner': 2498.667056942811}
Losses {'ner': 2550.9445924857173}


Losses {'ner': 1303.885090248195}
Losses {'ner': 1313.009302492718}
Losses {'ner': 1338.6426710467101}
Losses {'ner': 1348.1682890088775}
Losses {'ner': 1506.8725869912203}
Losses {'ner': 1613.4384855070189}
Losses {'ner': 1638.8282799506217}
Losses {'ner': 1638.8901489873476}
Losses {'ner': 1661.0589686274259}
Losses {'ner': 1696.5370419847366}
Losses {'ner': 1704.2162731349558}
Losses {'ner': 1712.5530605203423}
Losses {'ner': 1721.0525259735298}
Losses {'ner': 1763.3862147024217}
Losses {'ner': 1781.3930752071858}
Losses {'ner': 1793.8398133218411}
Losses {'ner': 1795.1684530684374}
Losses {'ner': 1795.1749908589738}
Losses {'ner': 1795.175001837361}
Losses {'ner': 1841.2918260859003}
Losses {'ner': 1841.3084613763401}
Losses {'ner': 1848.1318820665103}
Losses {'ner': 1861.4452431127065}
Losses {'ner': 1864.6033910723763}
Losses {'ner': 1881.4721955202922}
Losses {'ner': 1903.5670494699348}
Losses {'ner': 1903.8260645328398}
Losses {'ner': 1954.3797056695025}
Losses {'ner': 1984.126

Losses {'ner': 430.5327539221805}
Losses {'ner': 434.94646722503046}
Losses {'ner': 434.94653472709797}
Losses {'ner': 445.04348982250696}
Losses {'ner': 456.88976883746597}
Losses {'ner': 490.1757523262991}
Losses {'ner': 490.32494891645007}
Losses {'ner': 490.3249495043424}
Losses {'ner': 494.5634051023167}
Losses {'ner': 494.56344411631875}
Losses {'ner': 494.563444116325}
Losses {'ner': 495.3876013763951}
Losses {'ner': 511.8127504358002}
Losses {'ner': 519.3501770754257}
Losses {'ner': 519.66724800639}
Losses {'ner': 538.7020169517137}
Losses {'ner': 561.3888594365683}
Losses {'ner': 608.6439440231334}
Losses {'ner': 720.7934348932225}
Losses {'ner': 722.5498172737638}
Losses {'ner': 730.6851909876489}
Losses {'ner': 733.8570386603787}
Losses {'ner': 764.4557751969348}
Losses {'ner': 821.6578686116151}
Losses {'ner': 852.9529797426977}
Losses {'ner': 855.0868347879428}
Losses {'ner': 879.4447941121243}
Losses {'ner': 895.2060183458235}
Losses {'ner': 897.8596845746305}
Losses {'ne

Losses {'ner': 3362.1497510617196}
Losses {'ner': 3368.6134430155917}
Losses {'ner': 3379.139772064665}
Losses {'ner': 3382.2289541069254}
Losses {'ner': 3387.5251110592544}
Losses {'ner': 3390.913445196242}
Losses {'ner': 3395.7326686377537}
Losses {'ner': 3418.3413588547137}
Losses {'ner': 3419.76329240969}
Losses {'ner': 3422.554347521118}
Statring iteration 12
Losses {'ner': 12.85289122564352}
Losses {'ner': 39.93655115092045}
Losses {'ner': 58.89323195951413}
Losses {'ner': 67.29758719142717}
Losses {'ner': 86.94038930606887}
Losses {'ner': 91.22792761315874}
Losses {'ner': 104.98380229031197}
Losses {'ner': 117.98608744983312}
Losses {'ner': 134.52681811943762}
Losses {'ner': 151.90651501528205}
Losses {'ner': 151.90651574917487}
Losses {'ner': 199.7336020754044}
Losses {'ner': 235.64527325708008}
Losses {'ner': 236.25659842754658}
Losses {'ner': 259.6046987770282}
Losses {'ner': 270.2497031853214}
Losses {'ner': 270.3210422591087}
Losses {'ner': 281.0460830409577}
Losses {'ner':

Losses {'ner': 2887.8188651516766}
Losses {'ner': 2889.295949981628}
Losses {'ner': 2890.3491958041272}
Losses {'ner': 2890.349249857512}
Losses {'ner': 2896.670216906722}
Losses {'ner': 2902.922765335962}
Losses {'ner': 2902.988806399759}
Losses {'ner': 2904.0760483655777}
Losses {'ner': 2912.2126613858177}
Losses {'ner': 2912.6490045534856}
Losses {'ner': 2919.227194916477}
Losses {'ner': 2919.2271949916512}
Losses {'ner': 2928.914933412096}
Losses {'ner': 2932.3111652358048}
Losses {'ner': 2944.9979585823553}
Losses {'ner': 2952.6157312635373}
Losses {'ner': 2954.6074728286767}
Losses {'ner': 2973.196174650731}
Losses {'ner': 2973.2236962797697}
Losses {'ner': 2997.0240316387317}
Losses {'ner': 2999.014832031369}
Losses {'ner': 3002.739380339747}
Losses {'ner': 3011.593087797548}
Losses {'ner': 3020.5872675509577}
Losses {'ner': 3024.9462132499357}
Losses {'ner': 3027.883449872157}
Losses {'ner': 3027.8844506261535}
Losses {'ner': 3038.6851743308453}
Losses {'ner': 3040.260378637058

Losses {'ner': 2418.185180159904}
Losses {'ner': 2418.185180160546}
Losses {'ner': 2425.103851536315}
Losses {'ner': 2429.5406984378924}
Losses {'ner': 2429.54160369265}
Losses {'ner': 2444.227754671287}
Losses {'ner': 2444.251526980144}
Losses {'ner': 2447.686657792171}
Losses {'ner': 2447.68673394795}
Losses {'ner': 2447.687106423556}
Losses {'ner': 2447.687137794115}
Losses {'ner': 2447.6873129992273}
Losses {'ner': 2447.6904776657834}
Losses {'ner': 2447.73473060655}
Losses {'ner': 2474.6565846985577}
Losses {'ner': 2482.058651031106}
Losses {'ner': 2496.632982116314}
Losses {'ner': 2502.4006625198945}
Losses {'ner': 2520.13870768186}
Losses {'ner': 2544.3948817381465}
Losses {'ner': 2551.7700208338656}
Losses {'ner': 2553.519248669356}
Losses {'ner': 2557.16122993765}
Losses {'ner': 2557.1790407160274}
Losses {'ner': 2562.4158623547455}
Losses {'ner': 2568.587363727608}
Losses {'ner': 2606.5702396708684}
Losses {'ner': 2620.538997113339}
Losses {'ner': 2627.8567317985344}
Losses {

Losses {'ner': 2030.372820856746}
Losses {'ner': 2033.501057002637}
Losses {'ner': 2042.8757709006752}
Losses {'ner': 2068.366119664682}
Losses {'ner': 2086.2400201203122}
Losses {'ner': 2096.4317981183067}
Losses {'ner': 2097.137908848101}
Losses {'ner': 2098.2105430135734}
Losses {'ner': 2098.210550922466}
Losses {'ner': 2098.214060940548}
Losses {'ner': 2098.2593223892172}
Losses {'ner': 2098.6456602203325}
Losses {'ner': 2098.6547267563415}
Losses {'ner': 2098.659815754432}
Losses {'ner': 2100.8439702041296}
Losses {'ner': 2109.912186152888}
Losses {'ner': 2110.152791594463}
Losses {'ner': 2110.2048596130285}
Losses {'ner': 2110.204927424158}
Losses {'ner': 2110.2329282465817}
Losses {'ner': 2110.888500813686}
Losses {'ner': 2119.241020087159}
Losses {'ner': 2119.315795159196}
Losses {'ner': 2119.315815945927}
Losses {'ner': 2119.3158163398816}
Losses {'ner': 2130.641902530868}
Losses {'ner': 2130.642123454939}
Losses {'ner': 2131.5639459201516}
Losses {'ner': 2131.6943730145485}
L

Losses {'ner': 1510.7737526335882}
Losses {'ner': 1515.278682694426}
Losses {'ner': 1523.2304579140277}
Losses {'ner': 1523.2556349662461}
Losses {'ner': 1523.8433580508176}
Losses {'ner': 1528.2503732887155}
Losses {'ner': 1528.502995483149}
Losses {'ner': 1528.5196459971812}
Losses {'ner': 1536.2358621671929}
Losses {'ner': 1557.337276540348}
Losses {'ner': 1561.269935859439}
Losses {'ner': 1571.9870810110997}
Losses {'ner': 1571.9873076128213}
Losses {'ner': 1572.0156602203522}
Losses {'ner': 1578.9134952282789}
Losses {'ner': 1592.8968473075963}
Losses {'ner': 1607.2805854355038}
Losses {'ner': 1607.2843884759088}
Losses {'ner': 1607.303868822064}
Losses {'ner': 1644.5748984976365}
Losses {'ner': 1697.7682946814007}
Losses {'ner': 1717.7242815525208}
Losses {'ner': 1720.5883075951006}
Losses {'ner': 1723.1417001564316}
Losses {'ner': 1727.116668497454}
Losses {'ner': 1728.4303384643745}
Losses {'ner': 1729.9587956897767}
Losses {'ner': 1742.4602701228594}
Losses {'ner': 1745.513654

Losses {'ner': 730.8278757960256}
Losses {'ner': 770.0820568547192}
Losses {'ner': 782.0060428161266}
Losses {'ner': 816.8040745611466}
Losses {'ner': 840.3502483987386}
Losses {'ner': 850.1911149619377}
Losses {'ner': 851.3206229246247}
Losses {'ner': 859.4742238916069}
Losses {'ner': 860.2099797306213}
Losses {'ner': 860.8850670303918}
Losses {'ner': 864.5644718944496}
Losses {'ner': 864.7663637862319}
Losses {'ner': 864.7902622958123}
Losses {'ner': 868.4720995248352}
Losses {'ner': 879.8851412213818}
Losses {'ner': 885.2375081786696}
Losses {'ner': 1035.5786325343106}
Losses {'ner': 1117.7574691515188}
Losses {'ner': 1141.4788737282977}
Losses {'ner': 1143.4853148184238}
Losses {'ner': 1154.219336958066}
Losses {'ner': 1185.7174670360596}
Losses {'ner': 1197.0200744901995}
Losses {'ner': 1209.3428668129625}
Losses {'ner': 1215.3103885605253}
Losses {'ner': 1236.3891235111728}
Losses {'ner': 1248.3440333254543}
Losses {'ner': 1253.657257232063}
Losses {'ner': 1255.3381763252332}
Los

Losses {'ner': 222.87232776720165}
Losses {'ner': 224.7407863632905}
Losses {'ner': 236.75013900741143}
Losses {'ner': 258.4246452044416}
Losses {'ner': 260.69885012944}
Losses {'ner': 277.8944266036202}
Losses {'ner': 282.0147343084248}
Losses {'ner': 283.6707063753476}
Losses {'ner': 286.4042039454871}
Losses {'ner': 289.3593776102396}
Losses {'ner': 289.359506935211}
Losses {'ner': 289.359506955467}
Losses {'ner': 289.35950720090227}
Losses {'ner': 296.3192602512522}
Losses {'ner': 296.3192602701674}
Losses {'ner': 304.28476526900755}
Losses {'ner': 312.3282641328212}
Losses {'ner': 335.51409854205406}
Losses {'ner': 335.6353342091093}
Losses {'ner': 335.63533420911625}
Losses {'ner': 338.3736932702859}
Losses {'ner': 338.3736945583802}
Losses {'ner': 338.3736945583813}
Losses {'ner': 338.7459441904103}
Losses {'ner': 346.6682503675022}
Losses {'ner': 352.44104396173276}
Losses {'ner': 354.41310994227075}
Losses {'ner': 363.57674521588024}
Losses {'ner': 372.8377845015011}
Losses {'

Losses {'ner': 2393.9276041712387}
Losses {'ner': 2411.569816356419}
Losses {'ner': 2411.5843739270276}
Losses {'ner': 2416.0858274134116}
Losses {'ner': 2416.0858295732305}
Losses {'ner': 2425.9732739448064}
Losses {'ner': 2427.340267995439}
Losses {'ner': 2444.640386096258}
Losses {'ner': 2451.915814404004}
Losses {'ner': 2453.9313624727283}
Losses {'ner': 2459.519170938931}
Losses {'ner': 2460.011023486438}
Losses {'ner': 2461.1336256719865}
Losses {'ner': 2465.3876685591504}
Losses {'ner': 2478.4112267129153}
Losses {'ner': 2482.434230728316}
Losses {'ner': 2482.4417379116812}
Losses {'ner': 2482.4432302038713}
Losses {'ner': 2485.582874204151}
Losses {'ner': 2505.6241199541355}
Losses {'ner': 2508.933784500595}
Losses {'ner': 2509.1357825725568}
Statring iteration 18
Losses {'ner': 12.845374893588925}
Losses {'ner': 21.124155909500082}
Losses {'ner': 32.25590091826171}
Losses {'ner': 39.084600631910654}
Losses {'ner': 66.34199366599384}
Losses {'ner': 70.29767017216278}
Losses {'n

Losses {'ner': 2080.9082230188433}
Losses {'ner': 2082.7758937390245}
Losses {'ner': 2094.747489923205}
Losses {'ner': 2095.0241132193014}
Losses {'ner': 2110.9352001038806}
Losses {'ner': 2121.063201566356}
Losses {'ner': 2123.242412814212}
Losses {'ner': 2127.4926253805265}
Losses {'ner': 2130.218840193404}
Losses {'ner': 2131.793313171446}
Losses {'ner': 2135.712316455544}
Losses {'ner': 2146.8887558105303}
Losses {'ner': 2147.5868899377883}
Losses {'ner': 2150.8382017543395}
Losses {'ner': 2150.935600835346}
Losses {'ner': 2151.27416858787}
Losses {'ner': 2153.3336115395196}
Losses {'ner': 2156.5221710297324}
Losses {'ner': 2158.6333270732184}
Losses {'ner': 2161.441228430973}
Losses {'ner': 2178.5955822162873}
Losses {'ner': 2178.697403789216}
Losses {'ner': 2182.8677655208053}
Losses {'ner': 2182.8677655464376}
Losses {'ner': 2185.9010330526403}
Losses {'ner': 2186.9944871351713}
Losses {'ner': 2204.7466724443307}
Losses {'ner': 2210.5744107306696}
Losses {'ner': 2212.66221732316

Losses {'ner': 1741.7896025587968}
Losses {'ner': 1741.7969495836912}
Losses {'ner': 1741.8840840523953}
Losses {'ner': 1741.904301255108}
Losses {'ner': 1763.8543262836238}
Losses {'ner': 1772.2635944861026}
Losses {'ner': 1784.1990045214482}
Losses {'ner': 1784.2109672483361}
Losses {'ner': 1784.4665228923345}
Losses {'ner': 1787.070072002353}
Losses {'ner': 1797.7219854281134}
Losses {'ner': 1797.7553292960413}
Losses {'ner': 1797.7716204396374}
Losses {'ner': 1797.7716451671188}
Losses {'ner': 1801.770104419793}
Losses {'ner': 1811.0520650205158}
Losses {'ner': 1811.0631261008718}
Losses {'ner': 1825.542574839994}
Losses {'ner': 1826.0735175350055}
Losses {'ner': 1826.0740119632383}
Losses {'ner': 1826.0784654231986}
Losses {'ner': 1826.0784717164888}
Losses {'ner': 1826.0784834972517}
Losses {'ner': 1826.0784843985443}
Losses {'ner': 1826.0784855275106}
Losses {'ner': 1826.087461349909}
Losses {'ner': 1842.0708160002632}
Losses {'ner': 1848.3532463222593}
Losses {'ner': 1857.44985

### <span style="color:#FF00FF">Model Evaluation</span>

In [7]:
TEST_DATA = df[['clean_description','tags']].sample(frac=1).reset_index(drop=True)

In [8]:
def get_predvalues(pred,text):
    res = "O"
    for i in pred:
        if i[1] == text:
            res = "B-"+i[2]
    return res

test = []
for i, j in TEST_DATA.iterrows():
    annots = ast.literal_eval(j[1])
    text   = j[0]
    pred = []    
    doc = jdnlp(text)
    for ent in doc.ents:
        pred.append((i,ent.text, ent.label_))
        
        
    for k in annots.get('entities'):
        st, ed = k[0],k[1]
        txt = text[st:ed]
        tag = get_predvalues(pred,txt)
        test.append((i,txt,"B-"+k[2],tag))

In [9]:
cols = ["sentence_id","text","actual","pred"]
tdf = pd.DataFrame(test,columns=cols)

y_test = tdf["actual"].tolist()
y_pred = tdf["pred"].tolist()

y_test = [[ind] for ind in y_test]
y_pred = [[ind] for ind in y_pred]

In [10]:
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred) 
recall =  recall_score(y_test,y_pred)
f1score = f1_score(y_test,y_pred)

SPACY_metrics = {
    "F1-score" : f1score,
    "Accuracy" : accuracy,
    "Precision" : precision,
    "Recall" : recall
}
print(SPACY_metrics)

{'F1-score': 0.9809441731303264, 'Accuracy': 0.9626914763650033, 'Precision': 0.9999023914104441, 'Recall': 0.9626914763650033}


### <span style="color:#FF00FF">Classification report</span>

In [11]:
class_report = classification_report(y_test,y_pred, digits=4)
print(class_report)

              precision    recall  f1-score   support

        DEPT     1.0000    0.9374    0.9677      2748
        ROLE     0.9995    0.9781    0.9887      1965
       SKILL     1.0000    0.9693    0.9844      5928

   micro avg     0.9999    0.9627    0.9809     10641
   macro avg     0.9998    0.9616    0.9803     10641
weighted avg     0.9999    0.9627    0.9809     10641



### <span style="color:#FF00FF">Save Model and metrics</span>

In [12]:
SPACY_objects = {
    "spacy_metrics" : SPACY_metrics,
    "spacy_model" : jdnlp
}

pickle_out = open( "../models/SPACY_objects.pkl", "wb" )
pickle.dump(SPACY_objects, pickle_out)

### <span style="color:#FF00FF">Load Model and Test</span>

In [13]:
pickle_in = open("../models/SPACY_objects.pkl", "rb" )
SPACY_obj = pickle.load(pickle_in)

### <span style="color:#FF00FF">Predict on new JD</span>

In [14]:
jdnlp = SPACY_obj.get("spacy_model")

# Testing the model
JD = """description scope role piramal pharma solutions pps chief information officer cio role provide vision leadership developing implementing information technology initiatives align vision piramal pharma solutions pps businesses pps cio charter build competitive edge business proactively building world class high quality innovative technology digital analytics solutions global operations job overview strategy innovation strategic business partner create enhanced digital technology vision enterprise identify opportunities differentiated technology capabilities solutions p roactively recommend solutions business functional leadership team considering business vision industry trends bringing outside perspective p ush bar technology innovation imbibing cutting edge technological innovations global benchmarks blue sky thinking create user friendly technologies offering great experience cts champion change agent accelerating organizational changes required create sustain enterprise technology capabilities cts thought leader emerging digital business models technologies articulating digital future enterprise role internally externally enable business growth enable inorganic business growth merger acquisition leading due diligence driving integration post acquisition lead strategic operational planning implementation achieve business goals fostering innovation prioritizing initiatives coordinating evaluation deployment management current future systems across organization coordinate facilitate consultation relevant business stakeholders define business systems requirements new technology implementations planning execution partner various site ho teams manage project portfolio relate selection acquisition development implementation major information systems defines governance mechanism metrics review progress technology projects business case achievement technology budget company provide upfront estimates costs various heads keep track spends ensure best roi company technology investment maintaining balance frugality financial discipline adequately gearing future growth b uild ecosystem group technology teams partners including startups product vendors develop implement technology solutions b uild future read technology team attracting retaining upskilling industry best talent compliance information security collaborate piramal quality e compliance qec team ensure quality compliance per defined sops guidelines accordance 21 cfr part 11 gamp guidelines collaborate information security team ensure adherence information security guidelines processes skills abilities exceptional inter personal skills enabling engagement levels across leadership skills including ability manage large team understanding strategy business technology application levels environment priorities goals quickly change evolve also skills think strategically including developing information security strategies interpreting handling complex information acting political sensitivity driving engaging positively change sound understanding portfolio program project management track record delivering enabling large scale complex change programs qualifications experience delivering strategy delivery essential across multiple organisations desirable experience working pharma business senior position essential experience procuring managing large complex outcome based contracts interdependencies experience working senior management team develop business focussed strategies effectively support business needs experience technologies sap salesforce bi pharma quality applications preferred experience digital technologies ar vr rpa chatbots ai ml etc preferred experience joint procurement market testing outsourcing well negotiating quality cost effective services experience successfully implementing strategy business planning evidence delivering high quality customer focussed services experience contributing development implementation effective management information systems aid decision making process evidence contribution major transformation building teams time change"""

doc = jdnlp(JD)
print("Entities", set([(ent.text, ent.label_) for ent in doc.ents]))

colors = {"SKILL":"#FFF380","ROLE":"#8EEBEC","DEPT":"#E7A1B0"}
options = {'ents': ['SKILL', 'ROLE','DEPT'], 'colors':colors}
displacy.render(doc,style='ent',jupyter=True, options=options)

Entities {('leadership', 'ROLE'), ('track record', 'SKILL'), ('lead', 'ROLE'), ('compliance', 'ROLE'), ('deployment', 'SKILL'), ('pharma', 'DEPT'), ('application', 'SKILL'), ('information systems', 'SKILL'), ('bi', 'SKILL'), ('information security', 'DEPT'), ('senior management', 'ROLE'), ('support', 'SKILL'), ('sap', 'SKILL'), ('salesforce', 'SKILL'), ('applications', 'SKILL'), ('development', 'SKILL'), ('information technology', 'DEPT'), ('operations', 'DEPT'), ('developing', 'SKILL'), ('testing', 'SKILL'), ('project management', 'ROLE'), ('decision making', 'SKILL'), ('budget', 'SKILL'), ('analytics', 'DEPT')}
