### <span style="color:#800000">NER with BERT</span>

### <span style="color:#FF00FF">Import libraries</span>

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from seqeval.metrics import accuracy_score,precision_score, recall_score, f1_score, classification_report
from simpletransformers.ner import NERModel,NERArgs

### <span style="color:#FF00FF">Load the dataset</span>

In [2]:
df = pd.read_csv("../data/jobdescriptions_input.csv")

df.head()

Unnamed: 0,sentence_id,words,labels
0,0,manage,O
1,0,verticals,O
2,0,company,O
3,0,like,O
4,0,applications,I-SKILL


### <span style="color:#FF00FF">Number of unique words in the corpus and tags</span>

In [3]:
custom_labels = df["labels"].unique().tolist()
custom_labels

['O', 'I-SKILL', 'I-DEPT', 'I-ROLE', 'B-ROLE', 'B-SKILL', 'B-DEPT']

In [4]:
print("Unique words in corpus:", df['words'].nunique())
print("Unique tags in corpus:", df['labels'].nunique())

Unique words in corpus: 10165
Unique tags in corpus: 7


### <span style="color:#FF00FF">Split train and test data</span>

In [5]:
X = df[["sentence_id","words"]]
y = df["labels"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.2)

In [7]:
#building up train data and test data
train_data = pd.DataFrame({"sentence_id":X_train["sentence_id"],"words":X_train["words"],"labels":y_train})
test_data = pd.DataFrame({"sentence_id":X_test["sentence_id"],"words":X_test["words"],"labels":y_test})

### <span style="color:#FF00FF">Build and Train BERT model</span>

In [8]:
args = NERArgs()
args.num_train_epochs = 5
args.learning_rate = 1e-4
args.overwrite_output_dir =True
args.reprocess_input_data = True
args.train_batch_size = 32
args.eval_batch_size = 32

In [9]:
model = NERModel('bert', 'bert-base-cased',labels=custom_labels,args =args,use_cuda=False)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [10]:
model.train_model(train_data,eval_data = test_data,acc=accuracy_score)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=36.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=36.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=36.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=36.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=36.0, style=ProgressStyle(desc…





(180, 0.23168888203799726)

### <span style="color:#FF00FF">Model Evaluation</span>

In [11]:
result, model_outputs, wrong_predictions = model.eval_model(test_data)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=36.0, style=ProgressStyle(descri…




In [12]:
y_preds, _ = model.predict(test_data['words'])

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Prediction', max=956.0, style=ProgressStyle(descr…




In [13]:
y_test = [[ind] for ind in y_test]
y_pred = [list(idx.values()) for sub in y_preds for idx in sub]

accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred) 
recall =  recall_score(y_test,y_pred)
f1score = f1_score(y_test,y_pred)

BERT_metrics = {
    "F1-score" : f1score,
    "Accuracy" : accuracy,
    "Precision" : precision,
    "Recall" : recall
}
print(BERT_metrics)

{'F1-score': 0.5599388379204893, 'Accuracy': 0.9104912028255608, 'Precision': 0.4874866879659212, 'Recall': 0.6576867816091954}


### <span style="color:#FF00FF">Classification report</span>

In [14]:
class_report = classification_report(y_test,y_pred, digits=4)
print(class_report)

              precision    recall  f1-score   support

        DEPT     0.4036    0.7191    0.5170       687
        ROLE     0.4690    0.4854    0.4771       515
       SKILL     0.5438    0.6871    0.6071      1582

   micro avg     0.4875    0.6577    0.5599      2784
   macro avg     0.4721    0.6305    0.5337      2784
weighted avg     0.4954    0.6577    0.5608      2784



In [15]:
pd.Series(np.array(y_pred).flatten()).value_counts()

O          26822
I-SKILL     1758
I-DEPT      1224
I-ROLE       529
B-SKILL      241
B-ROLE         4
dtype: int64

### <span style="color:#FF00FF">Predict on new JD</span>

In [16]:
JD="description scope role piramal pharma solutions pps chief information officer cio role provide vision leadership developing implementing information technology initiatives align vision piramal pharma solutions pps businesses pps cio charter build competitive edge business proactively building world class high quality innovative technology digital analytics solutions global operations job overview strategy innovation strategic business partner create enhanced digital technology vision enterprise identify opportunities differentiated technology capabilities solutions p roactively recommend solutions business functional leadership team considering business vision industry trends bringing outside perspective p ush bar technology innovation imbibing cutting edge technological innovations global benchmarks blue sky thinking create user friendly technologies offering great experience cts champion change agent accelerating organizational changes required create sustain enterprise technology capabilities cts thought leader emerging digital business models technologies articulating digital future enterprise role internally externally enable business growth enable inorganic business growth merger acquisition leading due diligence driving integration post acquisition lead strategic operational planning implementation achieve business goals fostering innovation prioritizing initiatives coordinating evaluation deployment management current future systems across organization coordinate facilitate consultation relevant business stakeholders define business systems requirements new technology implementations planning execution partner various site ho teams manage project portfolio relate selection acquisition development implementation major information systems defines governance mechanism metrics review progress technology projects business case achievement technology budget company provide upfront estimates costs various heads keep track spends ensure best roi company technology investment maintaining balance frugality financial discipline adequately gearing future growth b uild ecosystem group technology teams partners including startups product vendors develop implement technology solutions b uild future read technology team attracting retaining upskilling industry best talent compliance information security collaborate piramal quality e compliance qec team ensure quality compliance per defined sops guidelines accordance 21 cfr part 11 gamp guidelines collaborate information security team ensure adherence information security guidelines processes skills abilities exceptional inter personal skills enabling engagement levels across leadership skills including ability manage large team understanding strategy business technology application levels environment priorities goals quickly change evolve also skills think strategically including developing information security strategies interpreting handling complex information acting political sensitivity driving engaging positively change sound understanding portfolio program project management track record delivering enabling large scale complex change programs qualifications experience delivering strategy delivery essential across multiple organisations desirable experience working pharma business senior position essential experience procuring managing large complex outcome based contracts interdependencies experience working senior management team develop business focussed strategies effectively support business needs experience technologies sap salesforce bi pharma quality applications preferred experience digital technologies ar vr rpa chatbots ai ml etc preferred experience joint procurement market testing outsourcing well negotiating quality cost effective services experience successfully implementing strategy business planning evidence delivering high quality customer focussed services experience contributing development implementation effective management information systems aid decision making process evidence contribution major transformation building teams time change"
prediction, model_output = model.predict([JD])

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Prediction', max=1.0, style=ProgressStyle(descrip…




In [17]:
d =  list(np.concatenate(prediction).flat)

print("{:15}||{}".format("Word", "Prediction"))
print(30 * "=")
for k in d:
    for h,n in k.items():
      if n != "O":
        print("{:15}: {:5}".format(h, n))

Word           ||Prediction
pharma         : I-DEPT
information    : B-DEPT
leadership     : I-ROLE
developing     : I-SKILL
information    : B-DEPT
pharma         : I-DEPT
analytics      : I-DEPT
operations     : I-DEPT
functional     : I-SKILL
leadership     : I-ROLE


### <span style="color:#FF00FF">Save Model and metrics</span>

In [18]:
BERT_objects = {
    "bert_metrics" : BERT_metrics,
    "bert_model" : model
}

pickle_out = open( "../models/BERT_objects.pkl", "wb" )
pickle.dump(BERT_objects, pickle_out)

### <span style="color:#FF00FF">Load Model and Test</span>

In [19]:
pickle_in = open("../models/BERT_objects.pkl", "rb" )
BERT_obj = pickle.load(pickle_in)

bert_model = BERT_obj.get("bert_model")

In [20]:
JD="description scope role piramal pharma solutions pps chief information officer cio role provide vision leadership developing implementing information technology initiatives align vision piramal pharma solutions pps businesses pps cio charter build competitive edge business proactively building world class high quality innovative technology digital analytics solutions global operations job overview strategy innovation strategic business partner create enhanced digital technology vision enterprise identify opportunities differentiated technology capabilities solutions p roactively recommend solutions business functional leadership team considering business vision industry trends bringing outside perspective p ush bar technology innovation imbibing cutting edge technological innovations global benchmarks blue sky thinking create user friendly technologies offering great experience cts champion change agent accelerating organizational changes required create sustain enterprise technology capabilities cts thought leader emerging digital business models technologies articulating digital future enterprise role internally externally enable business growth enable inorganic business growth merger acquisition leading due diligence driving integration post acquisition lead strategic operational planning implementation achieve business goals fostering innovation prioritizing initiatives coordinating evaluation deployment management current future systems across organization coordinate facilitate consultation relevant business stakeholders define business systems requirements new technology implementations planning execution partner various site ho teams manage project portfolio relate selection acquisition development implementation major information systems defines governance mechanism metrics review progress technology projects business case achievement technology budget company provide upfront estimates costs various heads keep track spends ensure best roi company technology investment maintaining balance frugality financial discipline adequately gearing future growth b uild ecosystem group technology teams partners including startups product vendors develop implement technology solutions b uild future read technology team attracting retaining upskilling industry best talent compliance information security collaborate piramal quality e compliance qec team ensure quality compliance per defined sops guidelines accordance 21 cfr part 11 gamp guidelines collaborate information security team ensure adherence information security guidelines processes skills abilities exceptional inter personal skills enabling engagement levels across leadership skills including ability manage large team understanding strategy business technology application levels environment priorities goals quickly change evolve also skills think strategically including developing information security strategies interpreting handling complex information acting political sensitivity driving engaging positively change sound understanding portfolio program project management track record delivering enabling large scale complex change programs qualifications experience delivering strategy delivery essential across multiple organisations desirable experience working pharma business senior position essential experience procuring managing large complex outcome based contracts interdependencies experience working senior management team develop business focussed strategies effectively support business needs experience technologies sap salesforce bi pharma quality applications preferred experience digital technologies ar vr rpa chatbots ai ml etc preferred experience joint procurement market testing outsourcing well negotiating quality cost effective services experience successfully implementing strategy business planning evidence delivering high quality customer focussed services experience contributing development implementation effective management information systems aid decision making process evidence contribution major transformation building teams time change"
prediction, model_output = bert_model.predict([JD])

d =  list(np.concatenate(prediction).flat)

print("{:15}||{}".format("Word", "Prediction"))
print(30 * "=")
for k in d:
    for h,n in k.items():
      if n != "O":
        print("{:15}: {:5}".format(h, n))


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Prediction', max=1.0, style=ProgressStyle(descrip…


Word           ||Prediction
pharma         : I-DEPT
information    : B-DEPT
leadership     : I-ROLE
developing     : I-SKILL
information    : B-DEPT
pharma         : I-DEPT
analytics      : I-DEPT
operations     : I-DEPT
functional     : I-SKILL
leadership     : I-ROLE
