#### Training custom NERDA NER Model

In [1]:
import sys
import os
import requests
import json
import time
import datetime
import pytz

import numpy as np
import pandas as pd
import pickle
import torch

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Notebook-wide pandas display settings: at most 50 rows and 10 columns are
# rendered, and cell text is never truncated (max_colwidth=None).
for option_name, option_value in (
    ('display.max_rows', 50),
    ('display.max_columns', 10),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

In [3]:
# Shell out to nvidia-smi to show the GPU, driver and CUDA version on this machine.
!nvidia-smi

Thu Oct 27 02:53:14 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    31W /  70W |      0MiB / 15360MiB |     10%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# Report whether PyTorch can see a CUDA device. The device queries raise when
# no CUDA device is usable, so they only run after is_available() says yes;
# on a GPU machine the printed lines are the same four as before.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.current_device())
    print(torch.cuda.device_count())
    print(torch.cuda.get_device_name(0))

True
0
1
Tesla T4


In [5]:
# !pip install NERDA

In [6]:
from NERDA.models import NERDA

#### Copy files to local FS from GCP bucket

In [7]:
def list_files(dir_name):
    """Print ``<name>  --> <size in bytes>`` for each visible file in a directory.

    Only regular files directly inside ``dir_name`` are listed; sub-directories
    and names starting with '.' are skipped. If the directory cannot be read,
    a one-line error message is printed instead of raising.
    """
    try:
        # Collect everything first so nothing is printed if a lookup fails.
        sizes = []
        for entry in os.listdir(dir_name):
            full_path = os.path.join(dir_name, entry)
            if not os.path.isfile(full_path):
                continue
            if entry.startswith('.'):
                continue
            sizes.append((entry, os.stat(full_path).st_size))

        for entry, n_bytes in sizes:
            print(entry, ' -->', n_bytes)
    except OSError as e:
        print("Error: %s : %s" % (dir_name, e.strerror))

In [8]:
path_ner = '/home/jupyter/data/ner'
path_ner_model = '/home/jupyter/data/ner/ner_model_saved'

os.makedirs(path_ner, exist_ok=True)
os.makedirs(path_ner_model, exist_ok=True)

In [9]:
def get_gcs_data (bucket_name, folder_name, file_name, path_local, timeout=60):
    """Download one object from Google Cloud Storage into a local directory.

    The object is fetched with a plain HTTPS GET from
    ``https://storage.googleapis.com/<bucket_name>/<folder_name>/<file_name>``
    and written to ``<path_local>/<file_name>``.

    Args:
        bucket_name: Name of the GCS bucket.
        folder_name: Folder (object prefix) inside the bucket.
        file_name: Object name; also used as the local file name.
        path_local: Local directory to write into; it must already exist.
        timeout: Seconds handed to ``requests.get`` so that an unresponsive
            server cannot block the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never saved as if it were the requested data.
        OSError: If the local file cannot be written.
    """
    url = 'https://storage.googleapis.com/' + bucket_name + '/' + folder_name + '/' + file_name
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # Context manager guarantees the handle is flushed and closed.
    with open(path_local + '/' + file_name, 'wb') as local_file:
        local_file.write(r.content)

In [10]:
# Source bucket/folder and local destination for the NER data files.
bucket_name, folder_name = 'msca-bdp-data-open', 'ner'
file_name = ['train.txt', 'test.txt']
path_local = path_ner

os.makedirs(path_local, exist_ok=True)

# Fetch each file in turn and report it once the call returns.
for file in file_name:
    get_gcs_data(bucket_name, folder_name, file, path_local)
    print(f'Downloaded: {file}')

Downloaded: train.txt
Downloaded: test.txt


#### Train custom model

In [11]:
%%time

# Parser settings shared by both tab-separated "tag<TAB>token" files.
conll_read_options = dict(
    sep='\t',
    header=None,
    names=['tag', 'text'],
    skip_blank_lines=False,  # keep blank lines as all-NaN rows (sentence separators)
    quoting=3,               # csv.QUOTE_NONE: a '"' token is data, not the start of a quoted field
    keep_default_na=False,   # tokens such as 'null', 'nan' or 'NA' stay as text ...
    na_values=[''],          # ... only a truly empty field counts as missing
)

train_df = pd.read_csv(path_ner + '/train.txt', **conll_read_options)
validate_df = pd.read_csv(path_ner + '/test.txt', **conll_read_options)

print(f'Records to train: {train_df.shape[0]}')
print(f'Records to validate: {validate_df.shape[0]}')

Records to train: 109266
Records to validate: 27129
CPU times: user 39.9 ms, sys: 11.9 ms, total: 51.8 ms
Wall time: 49.5 ms


In [12]:
def conll_to_sent(df_tokens):
    """Group token-per-row data into one row per sentence.

    Args:
        df_tokens: DataFrame with a ``tag`` and a ``text`` column, one token
            per row. A row whose ``tag`` is missing ends the current sentence;
            rows whose ``text`` is missing are dropped from the result.

    Returns:
        A tuple ``(df_sentences, dict_sentences)``:

        * ``df_sentences`` - DataFrame with columns ``sentences`` and ``tags``;
          each cell holds the list of tokens / tags of one sentence.
        * ``dict_sentences`` - ``{'sentences': [...], 'tags': [...]}`` with the
          same content as plain nested lists.

    The input frame is left unmodified (no ``sent_id`` column is added to it).
    """
    # A sentence starts on the first row and on every row that follows a row
    # with a missing tag; the running count of those starts is the sentence id.
    sent_id = df_tokens['tag'].shift(1).isna().cumsum()

    # assign() returns a new frame, so the caller's DataFrame is not touched.
    tokens = df_tokens.assign(sent_id=sent_id)
    tokens = tokens[tokens['text'].notna()]

    # Collect the tokens and tags of each sentence into lists (grouped once).
    by_sentence = tokens.groupby('sent_id')
    sentence_tokens = by_sentence['text'].apply(list)
    sentence_tags = by_sentence['tag'].apply(list)

    df_sentences = pd.DataFrame(
        list(zip(sentence_tokens, sentence_tags)),
        columns=['sentences', 'tags'],
    )

    dataset = {
        'sentences': df_sentences['sentences'].to_list(),
        'tags': df_sentences['tags'].to_list(),
    }
    # JSON round-trip yields plain nested lists that share nothing with the frame.
    dict_sentences = json.loads(json.dumps(dataset))

    return df_sentences, dict_sentences

In [13]:
%%time

train_sentences, train_dict = conll_to_sent(train_df)
validate_sentences, validate_dict = conll_to_sent(validate_df)

# Tag scheme: every distinct tag seen in the training rows, excluding 'O'
# and missing values.
tags_to_drop = ['O']
keep_tag = train_df['tag'].notna() & ~train_df['tag'].isin(tags_to_drop)
tag_scheme_df = train_df[keep_tag]
tag_scheme = tag_scheme_df['tag'].unique().tolist()

CPU times: user 893 ms, sys: 12.4 ms, total: 906 ms
Wall time: 904 ms


In [14]:
# Preview the first ten sentences alongside their per-token tags.
train_sentences.head(10)

Unnamed: 0,sentences,tags
0,"[what, movies, star, bruce, willis]","[O, O, O, B-ACTOR, I-ACTOR]"
1,"[show, me, films, with, drew, barrymore, from, the, 1980s]","[O, O, O, O, B-ACTOR, I-ACTOR, O, O, B-YEAR]"
2,"[what, movies, starred, both, al, pacino, and, robert, deniro]","[O, O, O, O, B-ACTOR, I-ACTOR, O, B-ACTOR, I-ACTOR]"
3,"[find, me, all, of, the, movies, that, starred, harold, ramis, and, bill, murray]","[O, O, O, O, O, O, O, O, B-ACTOR, I-ACTOR, O, B-ACTOR, I-ACTOR]"
4,"[find, me, a, movie, with, a, quote, about, baseball, in, it]","[O, O, O, O, O, O, O, O, O, O, O]"
5,"[what, movies, have, mississippi, in, the, title]","[O, O, O, B-TITLE, O, O, O]"
6,"[show, me, science, fiction, films, directed, by, steven, spielberg]","[O, O, B-GENRE, I-GENRE, I-GENRE, O, O, B-DIRECTOR, I-DIRECTOR]"
7,"[do, you, have, any, thrillers, directed, by, sofia, coppola]","[O, O, O, O, B-GENRE, O, O, B-DIRECTOR, I-DIRECTOR]"
8,"[what, leonard, cohen, songs, have, been, used, in, a, movie]","[O, B-SONG, I-SONG, I-SONG, O, O, O, O, O, O]"
9,"[show, me, films, elvis, films, set, in, hawaii]","[O, O, O, B-ACTOR, O, B-PLOT, I-PLOT, I-PLOT]"


In [15]:
# Pretrained checkpoint to fine-tune; other candidates are left commented out.
# transformer = 'studio-ousia/luke-base'
# transformer = 'studio-ousia/luke-large'
# transformer = 'bert-base-uncased'
# transformer = 'google/bigbird-roberta-base'
# transformer = 'distilroberta-base'
# transformer = 'roberta-base'
transformer = 'distilbert-base-uncased'

# hyperparameters for training
training_hyperparameters = dict(
    epochs=8,
    warmup_steps=400,
    train_batch_size=8,  # very sensitive to batch size; running 8 by default
    learning_rate=0.0001,
)

In [16]:
# Configure the NERDA model around the chosen transformer checkpoint, the
# train/validation sentence dicts and the tag scheme built earlier.
# NOTE(review): max_len = 512 is far longer than the short queries in the
# training preview - presumably an upper bound on tokens per sentence;
# confirm against the NERDA documentation.
ner_model = NERDA(dataset_training = train_dict,
              dataset_validation = validate_dict,
              tag_scheme = tag_scheme,
              tag_outside = 'O',
              max_len = 512,
              transformer = transformer,
              dropout = 0.1,
              hyperparameters = training_hyperparameters)

Device automatically set to: cuda


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
%%time

# Run training; the log shows train and validation loss after each of the epochs.
ner_model.train()




 Epoch 1 / 8


100% 1222/1222 [08:21<00:00,  2.44it/s]
100% 306/306 [00:42<00:00,  7.14it/s]


Train Loss = 0.4277833937754955 Valid Loss = 0.838315933066256

 Epoch 2 / 8


100% 1222/1222 [08:20<00:00,  2.44it/s]
100% 306/306 [00:43<00:00,  7.11it/s]


Train Loss = 0.19406158904851387 Valid Loss = 1.0125437369463612

 Epoch 3 / 8


100% 1222/1222 [08:19<00:00,  2.45it/s]
100% 306/306 [00:42<00:00,  7.16it/s]


Train Loss = 0.13499252143407045 Valid Loss = 0.4218207932464638

 Epoch 4 / 8


100% 1222/1222 [08:19<00:00,  2.44it/s]
100% 306/306 [00:42<00:00,  7.17it/s]


Train Loss = 0.10113219294551098 Valid Loss = 0.3614133909033471

 Epoch 5 / 8


100% 1222/1222 [08:20<00:00,  2.44it/s]
100% 306/306 [00:42<00:00,  7.21it/s]


Train Loss = 0.07140038578345025 Valid Loss = 0.34318833258208437

 Epoch 6 / 8


100% 1222/1222 [08:19<00:00,  2.45it/s]
100% 306/306 [00:42<00:00,  7.21it/s]


Train Loss = 0.05055675172470297 Valid Loss = 0.3593575039676283

 Epoch 7 / 8


100% 1222/1222 [08:18<00:00,  2.45it/s]
100% 306/306 [00:42<00:00,  7.22it/s]


Train Loss = 0.03443510408420649 Valid Loss = 0.3599668693250529

 Epoch 8 / 8


100% 1222/1222 [08:19<00:00,  2.45it/s]
100% 306/306 [00:42<00:00,  7.19it/s]

Train Loss = 0.024429278908931512 Valid Loss = 0.3381986507437925
CPU times: user 16min 7s, sys: 56min 14s, total: 1h 12min 21s
Wall time: 1h 12min 20s





'Model trained successfully'

#### Save model

In [18]:
# Persist the whole trained NERDA object. Some checkpoint names contain '/'
# (e.g. 'studio-ousia/luke-base'), which points the target path into a
# sub-directory that does not exist yet, so create the parent directory first.
# For names without '/', this is a no-op and the file name is unchanged.
model_file = path_ner_model + '/' + transformer + '_ner_model.pt'
os.makedirs(os.path.dirname(model_file), exist_ok=True)
torch.save(ner_model, model_file)

In [19]:
# Copy the saved model files to the GCS bucket; cp -n skips objects that already exist there.
!gsutil -m -q cp -n '/home/jupyter/data/ner/ner_model_saved/*' 'gs://msca-bdp-data/ner_models/'

In [20]:
# List the saved model files with their sizes in bytes.
list_files(path_ner_model)

bert-base-uncased_ner_model.pt  --> 268993141
distilbert-base-uncased_ner_model.pt  --> 268998259


#### Evaluate model

In [21]:
# !gsutil -m -q cp -n 'gs://msca-bdp-data/ner_models/*' '/home/jupyter/data/ner/ner_model_saved/' 

In [22]:
# ner_model = torch.load(path_ner_model + '/' + transformer + '_ner_model.pt')

In [23]:
# Per-tag F1 / precision / recall on the validation sentences.
ner_model.evaluate_performance(validate_dict)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Level,F1-Score,Precision,Recall
0,B-ACTOR,0.942249,0.930372,0.954433
1,I-ACTOR,0.93908,0.930524,0.947796
2,B-YEAR,0.952982,0.963121,0.943056
3,B-TITLE,0.889083,0.87307,0.905694
4,B-GENRE,0.946484,0.935315,0.957923
5,I-GENRE,0.795,0.893258,0.716216
6,B-DIRECTOR,0.914869,0.948235,0.883772
7,I-DIRECTOR,0.912134,0.947826,0.879032
8,B-SONG,0.693069,0.744681,0.648148
9,I-SONG,0.803653,0.88,0.739496


### Check model performance on text sentences

In [24]:
# Smoke test on a lower-cased query; predict_text returns a (tokens, tags) pair of nested lists.
text = '''what movies star cameron diaz'''
ner_model.predict_text(text)

([['what', 'movies', 'star', 'cameron', 'diaz']],
 [['O', 'O', 'O', 'B-ACTOR', 'I-ACTOR']])

In [25]:
def single_sent_test(text):
    """Tag ``text`` with the global ``ner_model`` and return one row per token.

    The first two elements of the ``predict_text`` result (nested lists of
    words and of tags) are flattened and placed side by side in a DataFrame
    with columns ``words`` and ``tags``, indexed 0..n-1.
    """
    prediction = ner_model.predict_text(text)
    words_nested, tags_nested = prediction[0], prediction[1]

    # explode() turns each inner list into one row per element and, with
    # ignore_index=True, renumbers the rows from zero.
    word_rows = pd.DataFrame({'words': words_nested}).explode('words', ignore_index=True)
    tag_rows = pd.DataFrame({'tags': tags_nested}).explode('tags', ignore_index=True)

    # Both frames share the same positional index, so the join pairs word i with tag i.
    return word_rows.join(tag_rows, how='left')

In [26]:
# Probe with a cased news headline, unlike the lower-cased queries in the training preview.
text = '''Brendan Fraser cast as villain in upcoming Batgirl film... after the actor wrapped Martin Scorsese film Killers of the Flower Moon'''
single_sent_test(text=text)

Unnamed: 0,words,tags
0,Brendan,B-ACTOR
1,Fraser,I-ACTOR
2,cast,O
3,as,O
4,villain,O
5,in,O
6,upcoming,O
7,Batgirl,B-PLOT
8,film,O
9,...,O


In [27]:
# Probe with a headline containing typographic quotes and a colon.
text = '''‘The Unforgivable’ Trailer: Sandra Bullock Is an Ex-Con Seeking Redemption in Netflix Drama'''
single_sent_test(text=text)

Unnamed: 0,words,tags
0,‘,O
1,The,O
2,Unforgivable,B-RATINGS_AVERAGE
3,’,O
4,Trailer,B-TRAILER
5,:,O
6,Sandra,B-ACTOR
7,Bullock,I-ACTOR
8,Is,O
9,an,O


In [28]:
# Same Brendan Fraser headline as probed earlier, repeated verbatim.
text = '''Brendan Fraser cast as villain in upcoming Batgirl film... after the actor wrapped Martin Scorsese film Killers of the Flower Moon'''
single_sent_test(text=text)

Unnamed: 0,words,tags
0,Brendan,B-ACTOR
1,Fraser,I-ACTOR
2,cast,O
3,as,O
4,villain,O
5,in,O
6,upcoming,O
7,Batgirl,B-PLOT
8,film,O
9,...,O


In [29]:
# Same headline with the two names swapped: 'Martin Scorsese' now leads the sentence.
text = '''Martin Scorsese cast as villain in upcoming Batgirl film... after the actor wrapped Brendan Fraser film Killers of the Flower Moon'''
single_sent_test(text=text)

Unnamed: 0,words,tags
0,Martin,B-DIRECTOR
1,Scorsese,I-DIRECTOR
2,cast,O
3,as,O
4,villain,O
5,in,O
6,upcoming,O
7,Batgirl,B-PLOT
8,film,O
9,...,O


In [30]:
# Same headline with the leading name replaced by 'Nick Kadochnikov'.
text = '''Nick Kadochnikov cast as villain in upcoming Batgirl film... after the actor wrapped Martin Scorsese film Killers of the Flower Moon'''
single_sent_test(text=text)

Unnamed: 0,words,tags
0,Nick,B-DIRECTOR
1,Kadochnikov,I-DIRECTOR
2,cast,O
3,as,O
4,villain,O
5,in,O
6,upcoming,O
7,Batgirl,B-PLOT
8,film,O
9,...,O


In [31]:
# Same headline with 'Nick Kadochnikov' in place of 'Martin Scorsese' after 'wrapped'.
text = '''Brendan Fraser cast as villain in upcoming Batgirl film... after the actor wrapped Nick Kadochnikov film Killers of the Flower Moon'''
single_sent_test(text=text)

Unnamed: 0,words,tags
0,Brendan,B-ACTOR
1,Fraser,I-ACTOR
2,cast,O
3,as,O
4,villain,O
5,in,O
6,upcoming,O
7,Batgirl,B-PLOT
8,film,O
9,...,O


In [32]:
# Timestamp of this run, expressed in the US/Central time zone.
datetime.datetime.now(pytz.timezone('US/Central')).strftime("%a, %d %B %Y %H:%M:%S")

'Wed, 26 October 2022 23:06:36'