## NAMED ENTITY RECOGNITION - CASE STUDY

### DATA ANALYSIS

In [1]:
import pandas as pd
import numpy as np

In [2]:
### Load the dataset
data = pd.read_csv('ner_dataset.csv',encoding='Windows-1252')

In [3]:
### No. of records
data.shape

(1048575, 4)

In [4]:
### we are concerned with word and tag columns only so we filter it out
data = data[['Word','Tag']]

In [5]:
### No. of null records
data.isnull().sum()

Word    10
Tag      0
dtype: int64

In [6]:
### Null records don't have any word associated
data[data['Word'].isna()]

Unnamed: 0,Word,Tag
197658,,O
256026,,O
257069,,O
571211,,O
613777,,O
747019,,O
901758,,O
903054,,O
944880,,O
1003438,,O


In [7]:
### So we can safely drop these records
data = data.dropna().reset_index(drop=True)

In [8]:
### Delving to granular level
# Count rows per tag (most frequent first) and express each count as a percentage of all rows.
# The column names are set explicitly so the result does not depend on how the installed
# pandas version names the output of value_counts().
data_tag_count = data['Tag'].value_counts().rename_axis('Tag').reset_index(name='count')
# Vectorised share: the total is computed once instead of once per row inside a lambda.
data_tag_count['percentage'] = data_tag_count['count'] / data_tag_count['count'].sum() * 100

In [9]:
### Distribution of tags in data
data_tag_count

Unnamed: 0,Tag,count,percentage
0,O,887898,84.67744
1,B-geo,37644,3.590049
2,B-tim,20333,1.939126
3,B-org,20143,1.921006
4,I-per,17251,1.645201
5,B-per,16990,1.62031
6,I-org,16784,1.600664
7,B-gpe,15870,1.513497
8,I-geo,7414,0.707062
9,I-tim,6528,0.622565


In [10]:
### Distribution of Individual tag corresponding to the words
def data_analysis(data):
    """Print, for every tag in ``data``, its 30 most frequent words.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Word`` column and a ``Tag`` column.

    Returns
    -------
    None
        The per-tag tables (word, count, percentage of that tag's rows) are
        printed; nothing is returned.
    """
    separator = '**************************************************************************'
    # Take the tags from the frame that was passed in (most frequent first) instead of
    # reading a module-level table, so the function works on any frame it is given.
    for tag in data['Tag'].value_counts().index:
        print(tag)
        print(separator)
        # Explicit axis/column names keep this independent of the pandas version's
        # default naming for value_counts().
        word_counts = (
            data.loc[data['Tag'] == tag, 'Word']
            .value_counts()
            .rename_axis('Word')
            .reset_index(name='count')
        )
        # The total is computed once (vectorised) rather than once per row.
        word_counts['percentage'] = word_counts['count'] / word_counts['count'].sum() * 100
        print(word_counts[:30])
        print(separator)
    return

In [11]:
data_analysis(data=data)

O
**************************************************************************
    Word  count  percentage
0    the  52342    5.895047
1      .  47761    5.379109
2      ,  32476    3.657627
3     in  26199    2.950677
4     of  25118    2.828929
5     to  23095    2.601087
6      a  20479    2.306459
7    and  19228    2.165564
8    The  11031    1.242372
9     's  10530    1.185947
10   for   8311    0.936031
11   has   7216    0.812706
12    is   6746    0.759772
13    on   6706    0.755267
14  that   6246    0.703459
15  have   5485    0.617751
16  with   5380    0.605925
17  said   5321    0.599281
18   was   4878    0.549387
19  says   4640    0.522583
20    by   4482    0.504788
21  from   4441    0.500170
22    at   4343    0.489133
23   say   4178    0.470550
24    as   4106    0.462441
25    an   3845    0.433045
26   are   3718    0.418742
27     "   3610    0.406578
28  were   3519    0.396329
29  will   3400    0.382927
*******************************************************

In [12]:
### Distribution of words across multiple tags
data.groupby(['Word']).agg({'Tag':'nunique'}).reset_index(drop=False).sort_values(by='Tag',ascending=False)[:30]

Unnamed: 0,Word,Tag
27699,of,11
11256,New,10
22,-,10
11612,Olympics,10
11839,Pakistan,10
5853,Egypt,9
8880,Katrina,9
12012,Peace,9
10909,Muslim,9
4658,Christmas,9


In [13]:
### Distribution of the top 20 words that appear with multiple tags
# Rank words by the number of distinct tags they occur with, highest first.
tag_variety = (
    data.groupby(['Word'])
    .agg({'Tag': 'nunique'})
    .reset_index(drop=False)
    .sort_values(by='Tag', ascending=False)
)
separator = '**************************************************************************'
# For each of the 20 top-ranked words, print its distinct (Word, Tag) rows.
for word in tag_variety['Word'][:20].values:
    print(word)
    print(separator)
    word_rows = data[data.Word == word]
    print(word_rows.drop_duplicates().reset_index(drop=True))
    print(separator)

of
**************************************************************************
   Word    Tag
0    of      O
1    of  I-org
2    of  I-tim
3    of  I-art
4    of  I-gpe
5    of  B-tim
6    of  I-geo
7    of  I-per
8    of  B-geo
9    of  I-eve
10   of  B-org
**************************************************************************
New
**************************************************************************
  Word    Tag
0  New  B-geo
1  New  I-geo
2  New  B-tim
3  New  B-org
4  New  I-org
5  New  B-eve
6  New      O
7  New  B-gpe
8  New  I-art
9  New  I-tim
**************************************************************************
-
**************************************************************************
  Word    Tag
0    -      O
1    -  I-tim
2    -  B-org
3    -  I-geo
4    -  I-org
5    -  I-per
6    -  B-tim
7    -  B-geo
8    -  B-per
9    -  I-art
**************************************************************************
Olympics
********************************************

### BASELINE MODEL - LINEAR AND TREE MODELS

In [14]:
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.ensemble import *
from sklearn.preprocessing import *
from nltk.stem import WordNetLemmatizer

In [15]:
import warnings
warnings.filterwarnings('ignore')

In [16]:
### Drop the duplicates records as it won't be useful in learning any new information
data = data.drop_duplicates().reset_index(drop=True)

In [17]:
### Preprocess the data by converting it to lower case
data['Word'] = data['Word'].str.lower()

In [18]:
### Special characters and stop words cannot be removed as some of them are associated with tags and we don't want to lose it

In [19]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [20]:
### Lemmatize the data
lemmatizer = WordNetLemmatizer()
data['Word'] = data['Word'].apply(lambda x: lemmatizer.lemmatize(x))

In [21]:
### One hot encode the words to convert it into numbers
ohe = pd.get_dummies(data['Word'],prefix='Word')

In [22]:
### Concat the ohe record with the main data
data = pd.concat([data,ohe],axis=1).drop_duplicates().reset_index(drop=True)

In [23]:
data.shape

(38631, 29007)

In [25]:
### Split the dataset into train, test and validation set
train, test = train_test_split(data.drop(columns=['Word']), test_size=0.3, random_state=10, stratify=data.Tag)
test, val = train_test_split(test, test_size=0.3, random_state=11, stratify=test.Tag)

In [26]:
print("Train data:",train.shape, "\nTest data:",test.shape, "\nValidation data:",val.shape)

Train data: (27041, 29006) 
Test data: (8113, 29006) 
Validation data: (3477, 29006)


In [27]:
X_train, y_train = train.drop(columns=['Tag']), train.Tag
X_test, y_test = test.drop(columns=['Tag']), test.Tag
X_val, y_val = val.drop(columns=['Tag']), val.Tag

In [28]:
print(X_train.shape, y_train.shape)

(27041, 29005) (27041,)


In [29]:
### Logistic Regression

In [30]:
# Logistic-regression baseline on the one-hot word features, with balanced class weights.
# NOTE(review): max_iter=5 is far below scikit-learn's default of 100, and warnings are
# globally silenced by warnings.filterwarnings('ignore') earlier in this notebook, so a
# ConvergenceWarning would not be shown here — confirm the solver actually converges.
lr_model = LogisticRegression(C=0.3,max_iter=5,class_weight='balanced')
lr_model1 = lr_model.fit(X_train,y_train)  # fit() returns the fitted estimator
y_train_pred = lr_model1.predict(X_train)
y_val_pred = lr_model1.predict(X_val)
# Weighted F1 on the training split, then on the validation split.
print(f1_score(y_train,y_train_pred,average='weighted'))
print(f1_score(y_val,y_val_pred,average='weighted'))

0.05661021983882371
0.00015456589546751726


In [31]:
print(classification_report(y_train,lr_model1.predict(X_train)))

              precision    recall  f1-score   support

       B-art       0.42      0.90      0.57       193
       B-eve       0.33      0.82      0.47        71
       B-geo       0.00      0.00      0.00      2098
       B-gpe       0.01      0.89      0.02       256
       B-nat       0.41      1.00      0.58        27
       B-org       0.00      0.00      0.00      2049
       B-per       0.00      0.00      0.00      2094
       B-tim       0.65      0.72      0.68       747
       I-art       0.39      0.91      0.54       155
       I-eve       0.30      0.98      0.46        57
       I-geo       0.48      0.82      0.60       743
       I-gpe       0.25      1.00      0.40        28
       I-nat       0.41      1.00      0.58        13
       I-org       0.00      0.00      0.00      2015
       I-per       0.00      0.00      0.00      2767
       I-tim       0.44      0.90      0.59       475
           O       0.00      0.00      0.00     13253

    accuracy              

In [32]:
print(classification_report(y_val,lr_model1.predict(X_val)))

              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        25
       B-eve       0.00      0.00      0.00         9
       B-geo       0.00      0.00      0.00       270
       B-gpe       0.01      0.79      0.02        33
       B-nat       0.00      0.00      0.00         3
       B-org       0.00      0.00      0.00       263
       B-per       0.00      0.00      0.00       269
       B-tim       0.00      0.00      0.00        96
       I-art       0.00      0.00      0.00        20
       I-eve       0.00      0.00      0.00         8
       I-geo       0.00      0.00      0.00        95
       I-gpe       0.00      0.00      0.00         4
       I-nat       0.00      0.00      0.00         2
       I-org       0.00      0.00      0.00       259
       I-per       0.00      0.00      0.00       356
       I-tim       0.00      0.00      0.00        61
           O       0.00      0.00      0.00      1704

    accuracy              

In [33]:
print(classification_report(y_test,lr_model1.predict(X_test)))

              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        58
       B-eve       0.00      0.00      0.00        22
       B-geo       0.00      0.00      0.00       630
       B-gpe       0.01      0.66      0.01        77
       B-nat       0.00      0.00      0.00         8
       B-org       0.00      0.00      0.00       615
       B-per       0.00      0.00      0.00       628
       B-tim       0.00      0.00      0.00       224
       I-art       0.00      0.00      0.00        47
       I-eve       0.00      0.00      0.00        17
       I-geo       0.00      0.00      0.00       223
       I-gpe       0.00      0.00      0.00         8
       I-nat       0.00      0.00      0.00         4
       I-org       0.00      0.00      0.00       604
       I-per       0.00      0.00      0.00       830
       I-tim       0.00      0.00      0.00       142
           O       0.00      0.00      0.00      3976

    accuracy              

In [34]:
### Random forest model

In [35]:
rf_model = RandomForestClassifier(n_jobs=-1, bootstrap=True, oob_score=True, random_state=5, class_weight='balanced',
                                  n_estimators=100, max_depth=10, min_samples_split=10)
rf_model1 = rf_model.fit(X_train,y_train)
y_train_pred = rf_model1.predict(X_train)
y_val_pred = rf_model1.predict(X_val)
print(f1_score(y_train,y_train_pred,average='weighted'))
print(f1_score(y_val,y_val_pred,average='weighted'))

0.03098117435218505
0.011335298800703714


In [36]:
print(classification_report(y_train,rf_model1.predict(X_train)))

              precision    recall  f1-score   support

       B-art       0.40      0.51      0.45       193
       B-eve       0.31      0.70      0.43        71
       B-geo       0.00      0.00      0.00      2098
       B-gpe       0.40      0.38      0.39       256
       B-nat       0.41      0.96      0.58        27
       B-org       0.00      0.00      0.00      2049
       B-per       0.08      0.97      0.15      2094
       B-tim       0.64      0.03      0.06       747
       I-art       0.43      0.55      0.49       155
       I-eve       0.30      0.93      0.45        57
       I-geo       0.51      0.03      0.06       743
       I-gpe       0.25      0.96      0.39        28
       I-nat       0.41      1.00      0.58        13
       I-org       0.00      0.00      0.00      2015
       I-per       0.00      0.00      0.00      2767
       I-tim       0.42      0.11      0.18       475
           O       0.00      0.00      0.00     13253

    accuracy              

In [37]:
print(classification_report(y_val,rf_model1.predict(X_val)))

              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        25
       B-eve       0.00      0.00      0.00         9
       B-geo       0.00      0.00      0.00       270
       B-gpe       0.00      0.00      0.00        33
       B-nat       0.00      0.00      0.00         3
       B-org       0.00      0.00      0.00       263
       B-per       0.08      0.99      0.15       269
       B-tim       0.00      0.00      0.00        96
       I-art       0.00      0.00      0.00        20
       I-eve       0.00      0.00      0.00         8
       I-geo       0.00      0.00      0.00        95
       I-gpe       0.00      0.00      0.00         4
       I-nat       0.00      0.00      0.00         2
       I-org       0.00      0.00      0.00       259
       I-per       0.00      0.00      0.00       356
       I-tim       0.00      0.00      0.00        61
           O       0.00      0.00      0.00      1704

    accuracy              

In [38]:
print(classification_report(y_test,rf_model1.predict(X_test)))

              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        58
       B-eve       0.00      0.00      0.00        22
       B-geo       0.00      0.00      0.00       630
       B-gpe       0.00      0.00      0.00        77
       B-nat       0.00      0.00      0.00         8
       B-org       0.00      0.00      0.00       615
       B-per       0.08      0.97      0.14       628
       B-tim       0.00      0.00      0.00       224
       I-art       0.00      0.00      0.00        47
       I-eve       0.00      0.00      0.00        17
       I-geo       0.00      0.00      0.00       223
       I-gpe       0.00      0.00      0.00         8
       I-nat       0.00      0.00      0.00         4
       I-org       0.00      0.00      0.00       604
       I-per       0.00      0.00      0.00       830
       I-tim       0.00      0.00      0.00       142
           O       0.00      0.00      0.00      3976

    accuracy              

In [39]:
### Catboost

In [40]:
pip -q install catboost

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [41]:
from catboost import *
from sklearn.utils.class_weight import compute_class_weight

In [42]:
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

In [43]:
class_weights

{'B-art': 8.24169460530326,
 'B-eve': 22.40347970173985,
 'B-geo': 0.7581730499635507,
 'B-gpe': 6.213465073529412,
 'B-nat': 58.91285403050109,
 'B-org': 0.7763040794648752,
 'B-per': 0.759621327040845,
 'B-tim': 2.12938026616269,
 'I-art': 10.26223908918406,
 'I-eve': 27.90608875128999,
 'I-geo': 2.1408439553479535,
 'I-gpe': 56.80882352941177,
 'I-nat': 122.35746606334841,
 'I-org': 0.7894030068603124,
 'I-per': 0.574863411211973,
 'I-tim': 3.3487306501547986,
 'O': 0.12002165991273896}

In [44]:
cb_model = CatBoostClassifier(verbose=False, class_weights=class_weights, grow_policy='SymmetricTree', iterations=500,
                                max_depth=6, learning_rate=0.1, l2_leaf_reg=1.9)
cb_model1 = cb_model.fit(X_train,y_train)
y_train_pred = cb_model1.predict(X_train)
y_val_pred = cb_model1.predict(X_val)
print(f1_score(y_train,y_train_pred,average='weighted'))
print(f1_score(y_val,y_val_pred,average='weighted'))

0.35598699989922256
0.324078936742648


In [45]:
print(classification_report(y_train,cb_model1.predict(X_train)))

              precision    recall  f1-score   support

       B-art       0.44      0.54      0.48       193
       B-eve       0.33      0.82      0.47        71
       B-geo       0.75      0.02      0.04      2098
       B-gpe       0.49      0.11      0.17       256
       B-nat       0.41      1.00      0.58        27
       B-org       0.68      0.02      0.03      2049
       B-per       0.63      0.02      0.04      2094
       B-tim       0.67      0.01      0.02       747
       I-art       0.40      0.82      0.54       155
       I-eve       0.30      0.98      0.46        57
       I-geo       0.45      0.02      0.04       743
       I-gpe       0.25      1.00      0.40        28
       I-nat       0.41      1.00      0.58        13
       I-org       0.73      0.01      0.02      2015
       I-per       0.93      0.01      0.03      2767
       I-tim       0.44      0.02      0.03       475
           O       0.51      0.98      0.67     13253

    accuracy              

In [46]:
print(classification_report(y_val,cb_model1.predict(X_val)))

              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        25
       B-eve       0.00      0.00      0.00         9
       B-geo       0.00      0.00      0.00       270
       B-gpe       0.00      0.00      0.00        33
       B-nat       0.00      0.00      0.00         3
       B-org       0.00      0.00      0.00       263
       B-per       0.00      0.00      0.00       269
       B-tim       0.00      0.00      0.00        96
       I-art       0.00      0.00      0.00        20
       I-eve       0.00      0.00      0.00         8
       I-geo       0.00      0.00      0.00        95
       I-gpe       0.00      0.00      0.00         4
       I-nat       0.00      0.00      0.00         2
       I-org       0.00      0.00      0.00       259
       I-per       0.00      0.00      0.00       356
       I-tim       0.00      0.00      0.00        61
           O       0.50      0.98      0.66      1704

    accuracy              

In [47]:
print(classification_report(y_test,cb_model1.predict(X_test)))

              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        58
       B-eve       0.00      0.00      0.00        22
       B-geo       0.00      0.00      0.00       630
       B-gpe       0.00      0.00      0.00        77
       B-nat       0.00      0.00      0.00         8
       B-org       0.00      0.00      0.00       615
       B-per       0.00      0.00      0.00       628
       B-tim       0.00      0.00      0.00       224
       I-art       0.00      0.00      0.00        47
       I-eve       0.00      0.00      0.00        17
       I-geo       0.00      0.00      0.00       223
       I-gpe       0.00      0.00      0.00         8
       I-nat       0.00      0.00      0.00         4
       I-org       0.00      0.00      0.00       604
       I-per       0.00      0.00      0.00       830
       I-tim       0.00      0.00      0.00       142
           O       0.50      0.98      0.66      3976

    accuracy              

In [48]:
### We observe that none of the baseline models performs well, because the same word can be associated with multiple tags

### TRANSFORMER MODEL

In [76]:
### NER LLM Model using Transformers

In [4]:
pip install -q accelerate==0.21.0 transformers==4.35.* trl==0.4.7 bitsandbytes==0.41.* evaluate seqeval

In [5]:
from datasets import *
from transformers import (AutoTokenizer,
                          DataCollatorForTokenClassification,
                          AutoModelForTokenClassification, TrainingArguments, Trainer)
import evaluate

In [6]:
### Load the dataset
data = pd.read_csv('ner_dataset.csv',encoding='Windows-1252')

In [7]:
### Convert the tag into numbers
data['Tags'] = LabelEncoder().fit(data.Tag).transform(data.Tag)

In [8]:
LabelEncoder().fit(data.Tag).classes_

array(['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per',
       'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org',
       'I-per', 'I-tim', 'O'], dtype=object)

In [9]:
### Distribution of Tag and their number
data[['Tag','Tags']].value_counts()

Tag    Tags
O      16      887908
B-geo  2        37644
B-tim  7        20333
B-org  5        20143
I-per  14       17251
B-per  6        16990
I-org  13       16784
B-gpe  3        15870
I-geo  10        7414
I-tim  15        6528
B-art  0          402
B-eve  1          308
I-art  8          297
I-eve  9          253
B-nat  4          201
I-gpe  11         198
I-nat  12          51
Name: count, dtype: int64

In [10]:
### We want to use the entire sentence as input to maintain the coherence
# Propagate the last seen sentence id down to the rows where it is missing
# (presumably only the first word of each sentence carries the id — verify against the CSV).
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
data['Sentence #'] = data['Sentence #'].ffill()

In [11]:
### Select only the restricted columns
data = data[['Sentence #','Word','Tags']]

In [12]:
### Check for any null values
data.isnull().sum()

Sentence #     0
Word          10
Tags           0
dtype: int64

In [13]:
### We can drop the null or keep it
data['Word'] = data['Word'].fillna('')

In [14]:
### Convert the records into list based on the sentences
# Group once by sentence id and collect each sentence's words and tag ids as Python lists.
sentence_groups = data.groupby(['Sentence #'])
# word_data keeps the sentence id as a regular column next to the list of words.
word_data = sentence_groups['Word'].apply(list).to_frame().reset_index()
# tags_data holds only the list of tag ids, in the same (group-key) row order as word_data.
tags_data = sentence_groups['Tags'].apply(list).reset_index(drop=True).to_frame()

In [15]:
data_final = pd.concat([word_data,tags_data],axis=1)#.to_dict('records')
data_final = data_final.rename(columns={'Sentence #':'sentence_id'})
data_final['sentence_id'] = data_final['sentence_id'].str.replace(r'Sentence: ','')

In [16]:
### Split the data into train and test
train, test = train_test_split(data_final, test_size=0.3, random_state=10)

In [17]:
### Convert the dataset into Hugging Face dataset format for data ingestion
# preserve_index=False keeps the shuffled pandas index out of the dataset in the first place.
# Removing '__index_level_0__' afterwards fails when that column was never created
# (e.g. when the frame has a plain RangeIndex).
train_dataset = Dataset.from_pandas(train, preserve_index=False)
test_dataset = Dataset.from_pandas(test, preserve_index=False)

In [18]:
train_dataset, test_dataset

(Dataset({
     features: ['sentence_id', 'Word', 'Tags'],
     num_rows: 33571
 }),
 Dataset({
     features: ['sentence_id', 'Word', 'Tags'],
     num_rows: 14388
 }))

In [19]:
### We are utilising the tiny bert model based on research paper - https://arxiv.org/abs/1909.10351
model_name = 'huawei-noah/TinyBERT_General_4L_312D'

In [20]:
### Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [24]:
### Conversion of words into tokens
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tag ids with the produced tokens.

    Parameters
    ----------
    examples : mapping
        A batch providing ``"Word"`` (one list of words per sentence) and
        ``"Tags"`` (one list of integer tag ids per sentence, one id per word).

    Returns
    -------
    The tokenizer output for the batch with an added ``"labels"`` entry: for each
    sentence, a list with one value per token. Only the first token of every word
    carries that word's tag id; all other positions are set to -100, the value
    that ``compute_metrics`` filters out.
    """
    tokenized_inputs = tokenizer(examples["Word"], truncation=True, max_length=512, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["Tags"]):
        # word_ids maps every token position to the index of the word it came from
        # (None for tokens that do not belong to any word).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Token without a source word (special tokens such as [CLS] / [SEP]): mask it.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a new word: it carries the word's tag id.
                label_ids.append(label[word_idx])
            else:
                # Continuation sub-token of the same word: mask it so each word is scored once.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [25]:
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/33571 [00:00<?, ? examples/s]

Map:   0%|          | 0/14388 [00:00<?, ? examples/s]

In [26]:
### Data collator to batch and pad the tokenized examples on the fly
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [27]:
### Labelling of tags
# Tag names listed in the LabelEncoder's class order; a tag's position is its integer id.
id2label = dict(enumerate([
    'B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim',
    'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim',
    'O',
]))
# Reverse lookup derived from id2label so the two mappings cannot drift apart.
label2id = {tag: tag_id for tag_id, tag in id2label.items()}

In [28]:
### Load the model with number of token classes and labelling tags
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=17, id2label=id2label, label2id=label2id)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
seqeval = evaluate.load("seqeval")

In [30]:
label_list = ['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim', 'O']

In [31]:
### Compute metrics to evaluate the performance at each epoch
import numpy as np

def compute_metrics(p):
    """Score token-classification predictions with seqeval, skipping masked positions.

    ``p`` is unpacked into (predictions, labels). The predictions are reduced with
    an argmax over axis 2; every position whose label equals -100 is dropped, and
    the remaining ids are translated to tag names through ``label_list``.

    Returns a dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``
    taken from seqeval's ``overall_*`` results.
    """
    raw_scores, gold_ids = p
    predicted_ids = np.argmax(raw_scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the (prediction, label) pairs whose label is not the -100 mask value.
        kept_pairs = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept_pairs])
        true_labels.append([label_list[gold_id] for _, gold_id in kept_pairs])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return dict(
        precision=results["overall_precision"],
        recall=results["overall_recall"],
        f1=results["overall_f1"],
        accuracy=results["overall_accuracy"],
    )

In [32]:
### Training arguments
training_args = TrainingArguments(
    output_dir="logs",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    eval_accumulation_steps=1,
    num_train_epochs=20,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    optim="paged_lion_32bit",
    save_strategy="epoch",
    max_grad_norm=0.3,
    warmup_ratio=0.01
)

In [35]:
### Wrap everything into Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [36]:
### Train the model
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4784,0.439376,0.229642,0.263325,0.245333,0.872272
2,0.3734,0.339269,0.452619,0.449538,0.451073,0.906086
3,0.3519,0.328201,0.568435,0.53866,0.553147,0.916832
4,0.3536,0.339158,0.588512,0.547608,0.567324,0.917565
5,0.365,0.349413,0.619187,0.535123,0.574094,0.916489
6,0.3652,0.345027,0.625492,0.567199,0.594921,0.919229
7,0.36,0.342575,0.605503,0.589375,0.597331,0.92123
8,0.3614,0.343006,0.624368,0.594429,0.609031,0.921931
9,0.3599,0.344467,0.6033,0.589078,0.596104,0.921525
10,0.3641,0.341465,0.604365,0.599334,0.601839,0.922947


TrainOutput(global_step=41980, training_loss=0.38392003365162725, metrics={'train_runtime': 1388.657, 'train_samples_per_second': 483.503, 'train_steps_per_second': 30.231, 'total_flos': 821288975724354.0, 'train_loss': 0.38392003365162725, 'epoch': 20.0})

In [37]:
### Save the model objects for inference and production
trainer.save_model('ner_model')

In [48]:
# from huggingface_hub import notebook_login
# notebook_login()

In [82]:
# trainer.push_to_hub('sun00009/ner_model')

### LLM INFERENCE

In [38]:
### Creating prediction pipeline for inference

In [39]:
from transformers import pipeline
import torch

In [40]:
### Load the trained model
ner_model = AutoModelForTokenClassification.from_pretrained('ner_model')

In [41]:
### Create a pipeline for inference
ner_pipe = pipeline('ner', model=ner_model, tokenizer=tokenizer)

In [42]:
inputs = " ". join(test_dataset[0]['Word'])

In [43]:
ner_pipe(inputs)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'B-geo',
  'score': 0.36887354,
  'index': 3,
  'word': 'kyrgyzstan',
  'start': 13,
  'end': 23},
 {'entity': 'I-per',
  'score': 0.24938515,
  'index': 14,
  'word': 'lo',
  'start': 81,
  'end': 83},
 {'entity': 'B-gpe',
  'score': 0.49053234,
  'index': 32,
  'word': 'russian',
  'start': 159,
  'end': 166}]

In [44]:
inputs = tokenizer(inputs, return_tensors="pt")

In [45]:
with torch.no_grad():
    logits = ner_model(**inputs).get('logits')

In [46]:
predictions = torch.argmax(logits, dim=2)

In [47]:
# Translate predicted class ids to tag names with the label map stored on the loaded
# inference model (ner_model), not the training-time `model` object, so this cell
# also works when only the saved model has been loaded.
predicted_token_class = [ner_model.config.id2label[t.item()] for t in predictions[0]]
# Drop the first and last positions, which belong to the [CLS] / [SEP] special tokens.
print(predicted_token_class[1:-1])

['O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O']


In [48]:
print(tokenized_test_dataset[0]['Word'])

['Officials', 'in', 'Kyrgyzstan', 'say', 'they', 'have', 'detained', 'about', '50', 'people', 'after', 'hundreds', 'of', 'looters', 'tore', 'through', 'a', 'Kurdish', 'village', 'to', 'protest', 'the', 'rape', 'of', 'a', 'four', 'year-old', 'Russian', 'girl', '.']


In [49]:
print(tokenized_test_dataset[0]['Tags'])

[16, 16, 2, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 3, 16, 16]


In [50]:
print(id2label)

{0: 'B-art', 1: 'B-eve', 2: 'B-geo', 3: 'B-gpe', 4: 'B-nat', 5: 'B-org', 6: 'B-per', 7: 'B-tim', 8: 'I-art', 9: 'I-eve', 10: 'I-geo', 11: 'I-gpe', 12: 'I-nat', 13: 'I-org', 14: 'I-per', 15: 'I-tim', 16: 'O'}


## DEPLOYMENT AND DEMO

### GRADIO

In [52]:
pip -q install gradio

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m86.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.1/318.1 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m145.0/145.0 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m105.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.9/129.9 kB[0m [31m16

In [53]:
import gradio as gr

In [66]:
### Wrap the inference pipeline
# Tokenizer/model pairs already loaded, keyed by hub id, so that the weights
# are fetched and deserialised once instead of on every Gradio request.
_NER_COMPONENTS = {}


def _get_ner_components(model_id='sun00009/ner_model'):
    """Return the ``(tokenizer, model)`` pair for ``model_id``, loading it on first use."""
    if model_id not in _NER_COMPONENTS:
        _NER_COMPONENTS[model_id] = (
            AutoTokenizer.from_pretrained(model_id),
            AutoModelForTokenClassification.from_pretrained(model_id),
        )
    return _NER_COMPONENTS[model_id]


def ner_inference(input_text):
    """Predict one NER tag per tokenizer token of ``input_text``.

    Args:
        input_text: Raw text to tag.

    Returns:
        A list of tag names, one per token produced by the tokenizer, with the
        first and last positions dropped (the original code strips them; these
        are presumably the special start/end tokens added by the tokenizer).
    """
    tokenizer, ner_model = _get_ner_components()
    inputs = tokenizer(input_text, return_tensors="pt")
    with torch.no_grad():
        logits = ner_model(**inputs).logits
    predictions = torch.argmax(logits, dim=2)
    # Decode with the label map of the model that produced the logits, not
    # with whatever global `model` happens to be alive in the kernel.
    predicted_token_class = [ner_model.config.id2label[t.item()] for t in predictions[0]]
    return predicted_token_class[1:-1]

In [67]:
### Build the Gradio UI: a text box for the input and a text box for the tags
input_comp = gr.Textbox(type='text', label='Input Text')
output_comp = gr.Textbox(type='text', label='Output Text')
output = gr.Interface(
    fn=ner_inference,
    inputs=[input_comp],
    outputs=output_comp,
    css="footer {visibility: hidden}",
)

In [68]:
### Launch for demo inference; share=True also exposes the app on a public URL
output.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://15cd154e8131921e12.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




### AWS

In [1]:
pip -q install sagemaker

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.2/82.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [2]:
!pip -q install aws configure
!pip -q install awscli

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.3/160.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.9/225.9 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m283.7/283.7 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for aws (setup.py) ... [?25l[?25hdone
  Building wheel for configure (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver

In [3]:
### Setup variables for local machine
# Prompt for the keys instead of pasting them into the notebook source, so
# that secrets are never saved with (or shared through) the .ipynb file.
from getpass import getpass

text = f'''
[default]
aws_access_key_id = {getpass('AWS access key id: ')}
aws_secret_access_key = {getpass('AWS secret access key: ')}
region = us-east-1
'''

In [4]:
### Write the credentials profile to disk
import os

path = "/content/awscli.ini"
with open(path, 'w') as f:
    f.write(text)
# The file contains secret keys: make it readable/writable by the owner only.
os.chmod(path, 0o600)

In [5]:
# !cat /content/awscli.ini

In [6]:
### Point the AWS SDK / CLI at the credentials file written above.
# The variable has to be set through os.environ: a `!export ...` line runs in
# a throw-away subshell and changes neither the kernel nor later `!` commands.
import os
path = "/content/awscli.ini"
os.environ['AWS_SHARED_CREDENTIALS_FILE'] = path
print(os.environ['AWS_SHARED_CREDENTIALS_FILE'])

/content/awscli.ini


In [7]:
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel
from sagemaker import image_uris

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [8]:
### SageMaker session backed by a default boto3 session
sg_session = sagemaker.Session(boto_session=boto3.session.Session())

In [9]:
### IAM role ARN passed to the SageMaker model below (hard-coded; replace <id> with the AWS account id)
sg_role = 'arn:aws:iam::<id>:role/sagemaker_ner_model'

In [10]:
### Hugging Face repository to package, and the S3 URI (in the session's default bucket) of its archive
repository = "sun00009/ner_model"
model_id = repository.rsplit("/", 1)[-1]
s3_location = f"s3://{sg_session.default_bucket()}/custom_inference/{model_id}/model.tar.gz"

In [11]:
### Create custom inference for desired output
%%writefile code/inference.py

from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
import torch

def inference_gen(data):
    """Predict one NER tag per tokenizer token of ``data['inputs']``.

    Args:
        data: Request payload; the text to tag is read from its ``'inputs'`` key.

    Returns:
        A list of tag names, one per token produced by the tokenizer, with the
        first and last positions dropped (presumably the special start/end
        tokens added by the tokenizer).

    NOTE(review): the endpoint response recorded later in this notebook is the
    stock ``ner`` pipeline output rather than a list of tags, which suggests
    the serving container never calls this function. Confirm which handler
    names (e.g. ``model_fn`` / ``predict_fn``) the SageMaker Hugging Face
    inference toolkit looks for in ``code/inference.py``.
    """
    ### Custom Inference
    tokenizer = AutoTokenizer.from_pretrained('sun00009/ner_model')
    model = AutoModelForTokenClassification.from_pretrained('sun00009/ner_model')
    input_text = data['inputs']
    inputs = tokenizer(input_text, return_tensors="pt")
    with torch.no_grad():
        # Use the model loaded above; `ner_model` does not exist in this module.
        logits = model(**inputs).get('logits')
    predictions = torch.argmax(logits, dim=2)
    predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
    return predicted_token_class[1:-1]

Writing code/inference.py


In [12]:
### Enable Git LFS, then clone the model repository from the Hugging Face Hub
!git lfs install
!git clone https://huggingface.co/$repository

Git LFS initialized.
Cloning into 'ner_model'...
remote: Enumerating objects: 21, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 21 (delta 1), reused 0 (delta 0), pack-reused 3 (from 1)[K
Unpacking objects: 100% (21/21), 317.43 KiB | 4.41 MiB/s, done.


In [13]:
### Copy the custom inference script into the cloned repository as its code/ folder
!cp -r code/ $model_id/code

In [14]:
### Archive from inside the repository so its files sit at the root of model.tar.gz.
### Note: %cd changes the kernel's working directory for all following cells.
%cd $model_id
!tar zcvf model.tar.gz *

/content/ner_model
code/
code/inference.py
config.json
model.safetensors
README.md
runs/
runs/Jun24_09-32-21_ee951b32348a/
runs/Jun24_09-32-21_ee951b32348a/events.out.tfevents.1719221555.ee951b32348a.15911.0
runs/Jun24_08-35-03_ee951b32348a/
runs/Jun24_08-35-03_ee951b32348a/events.out.tfevents.1719220504.ee951b32348a.433.1
runs/Jun24_08-35-03_ee951b32348a/events.out.tfevents.1719218104.ee951b32348a.433.0
special_tokens_map.json
tokenizer_config.json
tokenizer.json
vocab.txt


In [15]:
### Upload the packaged model archive to the S3 location computed earlier
!aws s3 cp model.tar.gz $s3_location

upload: ./model.tar.gz to s3://sagemaker-us-east-1-186529496743/custom_inference/ner_model/model.tar.gz


In [20]:
### Environment for serving straight from the Hugging Face Hub: model id and task
hub = dict(
    HF_MODEL_ID='sun00009/ner_model',
    HF_TASK='ner',
)

In [None]:
### Option A: model definition configured through the `hub` environment (no S3 archive)
huggingface_model = HuggingFaceModel(
    env=hub,
    # model_data=s3_location,
    image_uri=image_uris.retrieve(
        framework='huggingface',
        region='us-east-1',
        image_scope='inference',
        version='4.37.0',
        py_version='py310',
        base_framework_version='pytorch2.1.0',
        sagemaker_session=sg_session,
        instance_type='ml.g4dn.xlarge',
    ),
    role=sg_role,
)

In [22]:
### Option B: model definition backed by the model.tar.gz archive uploaded to S3
huggingface_model = HuggingFaceModel(
    # env=hub,
    model_data=s3_location,
    image_uri=image_uris.retrieve(
        framework='huggingface',
        region='us-east-1',
        image_scope='inference',
        version='4.37.0',
        py_version='py310',
        base_framework_version='pytorch2.1.0',
        sagemaker_session=sg_session,
        instance_type='ml.g4dn.xlarge',
    ),
    role=sg_role,
)

In [23]:
# Create a SageMaker endpoint for the model on a single instance
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type='ml.g4dn.xlarge',
)

-----------!

In [24]:
### Request payload: the text to tag goes under the "inputs" key
data = {"inputs": "My name is Jack. I went to America."}

In [25]:
### Send the payload to the deployed endpoint
preds = predictor.predict(data)

In [26]:
### Show the endpoint response
print(preds)

[{'entity': 'B-per', 'score': 0.2726770341396332, 'index': 4, 'word': 'jack', 'start': 11, 'end': 15}, {'entity': 'B-geo', 'score': 0.28789615631103516, 'index': 9, 'word': 'america', 'start': 27, 'end': 34}]


In [27]:
### Delete the endpoint once inference is done, to avoid additional cost
predictor.delete_endpoint()