# BERT

Clone the BERT-sklearn repository

In [0]:
!git clone -b master https://github.com/charles9n/bert-sklearn
%cd bert-sklearn
!pip install .

Import dependencies and load Train + Test data

In [0]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

# Training split, decoded as Latin-1.
data = pd.read_csv('Train_set.csv', encoding='latin1')

# Choose the test set: keep exactly one of the two reads below active.

test = pd.read_csv('Test_set.csv')
# test = pd.read_csv('Zero-shot.csv')

Sentence Getter

In [0]:
class SentenceGetter(object):
    """Group a one-token-per-row frame into sentences.

    ``data`` must provide the columns ``"Sentence #"``, ``"Word"``, ``"POS"``
    and ``"Tag"``. All rows sharing a ``"Sentence #"`` value form one sentence,
    stored as a list of ``(word, pos, tag)`` tuples with the word coerced to
    ``str``.

    Attributes:
        n_sent: 1-based counter consumed by ``get_next``.
        data: the frame passed to the constructor.
        empty: set to ``False`` at construction; not read inside this class.
        grouped: result of the group-by, indexed by sentence id.
        sentences: the grouped values as a plain list, in group order.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # One (word, pos, tag) triple per row of the group.
            words = s["Word"].values.tolist()
            pos_tags = s["POS"].values.tolist()
            tags = s["Tag"].values.tolist()
            return [(str(w), p, t) for w, p, t in zip(words, pos_tags, tags)]

        # groupby sorts its keys by default, so the sentence order follows the
        # sorted sentence ids (string order when the ids are strings).
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence keyed ``"Sentence: <n_sent>"`` and advance.

        Returns:
            The list of ``(word, pos, tag)`` tuples for the current counter
            value, or ``None`` when no sentence with that id exists (the
            counter is left untouched in that case).
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing sentence id means "no more sentences"; any other
            # error is a real problem and is allowed to propagate.
            return None
        self.n_sent += 1
        return s

In [0]:
# One getter per split; each groups its frame into sentences on construction.
getter = SentenceGetter(data)
test_getter = SentenceGetter(test)

In [0]:
# Keep only the word (first field) of every (word, pos, tag) triple.
sentences = [
    [word for word, _pos, _tag in triples]
    for triples in getter.sentences
]
test_sentences = [
    [word for word, _pos, _tag in triples]
    for triples in test_getter.sentences
]

In [0]:
# Keep only the tag (third field) of every (word, pos, tag) triple.
labels = [
    [tag for _word, _pos, tag in triples]
    for triples in getter.sentences
]
test_labels = [
    [tag for _word, _pos, tag in triples]
    for triples in test_getter.sentences
]

In [0]:
# The data is already split into words, so each sentence is simply copied
# into a fresh list of its tokens.
tokenized_texts = [list(sent) for sent in sentences]
test_tokenized_texts = [list(sent) for sent in test_sentences]


Define Test and Train data

In [0]:
# Inputs are the token lists, targets are the per-token tag lists.
X_train, y_train = tokenized_texts, labels
X_test, y_test = test_tokenized_texts, test_labels

In [0]:
import random

# Shuffle the training sentences together with their label sequences so the
# pairs stay aligned. The pairs are rebuilt from the unshuffled
# tokenized_texts / labels rather than from X_train / y_train, so re-running
# this cell always produces the same order instead of reshuffling data that
# has already been shuffled.
random.seed(23)
train_pairs = list(zip(tokenized_texts, labels))
random.shuffle(train_pairs)
X_train, y_train = zip(*train_pairs)

The model

In [0]:
import os
import math
import random
import csv
import sys

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score
import statistics as stats

from bert_sklearn import BertClassifier
from bert_sklearn import BertRegressor
from bert_sklearn import BertTokenClassifier
from bert_sklearn import load_model

In [0]:
# Distinct tags found in the training data. They are sorted (by their string
# form, so a non-string entry cannot break the comparison) because the
# iteration order of a set of strings changes from one interpreter start to
# the next, which would make the label order non-reproducible.
label_list = sorted(set(data.Tag.to_list()), key=str)

In [0]:
%%time
# define model

# Choose between BERT or SciBERT

model = BertTokenClassifier(bert_model='bert-base-cased',
# model = BertTokenClassifier(bert_model='scibert-scivocab-cased',
                            max_seq_length=178,
                            epochs=3,
                            gradient_accumulation_steps=4,
                            learning_rate=5e-5,
                            train_batch_size=16,
                            eval_batch_size=16,
                            validation_fraction=0., 
                            label_list=label_list,                           
                            ignore_label=['O'])


print(model)

# finetune model
model.fit(np.array(X_train), np.array(y_train))

# # score model
f1_test = model.score(X_test, y_test, 'macro')
print("Test f1: %0.02f"%(f1_test))

# make predictions
y_preds = model.predict(np.array(X_test))



Evaluation

In [0]:
!pip install seqeval

In [0]:
from nervaluate import Evaluator
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

In [0]:
# seqeval classification report comparing the gold test labels with the predictions.
print(classification_report(y_test, y_preds))

In [0]:
# nervaluate evaluator over the gold and predicted tag sequences, given as lists.
# NOTE(review): tags=[''] looks like a placeholder -- presumably this should
# list the entity type names to evaluate; confirm against the tag set in the data.
evaluator = Evaluator(y_test, y_preds, tags= [''], loader='list')

In [0]:
# Overall results and the per-tag breakdown.
# NOTE(review): the number of values returned by evaluate() may depend on the
# installed nervaluate version (the install is unpinned) -- confirm.
results, results_per_tag = evaluator.evaluate()

In [0]:
# Display the overall evaluation results as the cell output.
results