In [3]:
import numpy as np
import sys
sys.path.append("../src")
sys.path.append("../")
from bert_embedding import *
from datasets import load_dataset

# Data

## 1. SEC filings

In [3]:
test_link ="https://github.com/juand-r/entity-recognition-datasets/tree/master/data/SEC-filings/CONLL-format/data/test/FIN3"
train_link = "https://github.com/juand-r/entity-recognition-datasets/tree/master/data/SEC-filings/CONLL-format/data/train/FIN5.txt"

In [19]:
# !wget https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/test/FIN3.txt -P ../data/sec_ner

In [20]:
# !wget https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/train/FIN5.txt -P ../data/sec_ner

## 2. BTC

In [21]:
# !wget https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/BTC/CONLL-format/data/h.conll -P ../data/ner_btc

## 3. Wikigold

In [23]:
# !wget https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/wikigold.conll.txt -P ../data/ner_wikigold

# Data Clean

In [71]:
# load data
def load_ner_data(path, separator = " "):
    with open(path) as f:
        text = f.read().split("\n\n")

    output = []
    for line in text:
        feature_label = []
        line = line.split("\n")
        for entry in line:
            feature_label.append(tuple(entry.split(separator)))
        output.append(feature_label)
    return output

# get words and tags
def unique_words_tags(data):
    unique_words = []
    unique_tags = []
    for sent in data:
        unique_words.extend(list(set(np.array(sent)[:,0])))
        unique_tags.extend(list(set(np.array(sent)[:,-1])))
        
    return set(unique_words), set(unique_tags)

# get words and tags distributions
def distributions_words_tags(data):
    unique_words = {}
    unique_tags = {}
    for i in range(len(data)-1):
        sent = data[i]
        for t in sent:
            word = t[0]
            tag = t[-1]
            
            if word in unique_words:
                unique_words[word] += 1
            else:
                unique_words[word] = 1
                
            if tag in unique_tags:
                unique_tags[tag] += 1
            else:
                unique_tags[tag] = 1
                
    return unique_words, unique_tags

def sent_to_tuple(sent):
    ner_tags = sent['ner_tags']
    pos_tags = sent['pos_tags']
    tokens = sent['tokens']
    sent_list = []
    for i in range(len(sent['ner_tags'])):
        sent_list.append((tokens[i], pos_list[pos_tags[i]], label_list[ner_tags[i]]))
    return sent_list

## Conll2003

In [5]:
from datasets import load_dataset
dataset = load_dataset('conll2003')

Reusing dataset conll2003 (/Users/yuchen.zhang/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


In [55]:
label_list = dataset['train'].features['ner_tags'].feature.names
pos_list = dataset['train'].features['pos_tags'].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [56]:
def sent_to_tuple(sent):
    ner_tags = sent['ner_tags']
    pos_tags = sent['pos_tags']
    tokens = sent['tokens']
    sent_list = []
    for i in range(len(sent['ner_tags'])):
        sent_list.append((tokens[i], pos_list[pos_tags[i]], label_list[ner_tags[i]]))
    return sent_list

In [57]:
sent_to_tuple(dataset['train'][0])

[('EU', 'NNP', 'B-ORG'),
 ('rejects', 'VBZ', 'O'),
 ('German', 'JJ', 'B-MISC'),
 ('call', 'NN', 'O'),
 ('to', 'TO', 'O'),
 ('boycott', 'VB', 'O'),
 ('British', 'JJ', 'B-MISC'),
 ('lamb', 'NN', 'O'),
 ('.', '.', 'O')]

In [58]:
conll = [sent_to_tuple(dataset['train'][x]) for x in range(len(dataset['train']))]

In [59]:
conll[0]

[('EU', 'NNP', 'B-ORG'),
 ('rejects', 'VBZ', 'O'),
 ('German', 'JJ', 'B-MISC'),
 ('call', 'NN', 'O'),
 ('to', 'TO', 'O'),
 ('boycott', 'VB', 'O'),
 ('British', 'JJ', 'B-MISC'),
 ('lamb', 'NN', 'O'),
 ('.', '.', 'O')]

In [63]:
unique_words_tags(conll)[1]

{'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'}

## Tech

In [65]:
import re

In [66]:
tech = load_ner_data("/Users/yuchen.zhang/Documents/Projects/domain-adaptation-nlp/data/ner_tech/tech_test.txt")

In [67]:
def transform_label(sent):
    return [(t[0], re.sub("E-","I-",re.sub("S-","B-",t[1]))) for t in sent]

In [68]:
tech = [transform_label(x) for x in tech]

In [69]:
unique_words_tags(tech)[1]

{'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'}

## SEC

In [41]:
sec_path = "../data/ner_sec/FIN5.txt"
sec = load_ner_data(sec_path)

In [42]:
print(len(sec))
# sec[1]

1170


In [84]:
unique_words_tags(sec)[1]

{'', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'}

## BTC

In [43]:
btc = []
for data in ["a", "b", "e", "f", "g", "h"]:
    btc.extend(load_ner_data("../data/ner_btc/" + data + ".conll", "\t"))
print(len(btc))
# btc[1]

9345


In [85]:
unique_words_tags(btc)[1]

{'', 'B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O'}

## wiki

In [47]:
wiki = load_ner_data("../data/ner_wikigold/" + "wikigold" + ".conll.txt", " ")
print(len(wiki))
print(wiki[1][0][-1])

1842
O


In [48]:
for data in [conll2003, wiki, sec, btc]:
    print(distributions_words_tags(data)[1])
# only wiki and sec works here

{'O': 170524, 'I-ORG': 10001, 'I-MISC': 4556, 'I-PER': 11128, 'I-LOC': 8286, 'B-LOC': 11, 'B-MISC': 37, 'B-ORG': 24}
{'I-MISC': 1392, 'O': 32721, 'I-ORG': 1958, 'I-PER': 1634, 'I-LOC': 1447}
{'O': 39485, 'I-ORG': 384, 'I-LOC': 356, 'I-PER': 783, 'I-MISC': 7}
{'O': 131814, 'B-LOC': 2822, 'B-PER': 7928, 'B-ORG': 4135, 'I-ORG': 1176, 'I-PER': 1554, 'I-LOC': 958, '': 5}


In [49]:
for sent in conll:
    for t in sent:
        if t[-1] =="":
            print(sent)

# other sandbox

In [21]:
def sent2labels(sent):
    return [label for token, label in sent]

In [22]:
def sent2labels1(sent):
    return [t[-1] for t in sent]

In [17]:
sent2labels(wiki[1])

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'I-MISC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'I-MISC',
 'I-MISC',
 'O',
 'I-MISC',
 'O']

In [370]:
sent2labels1(wiki[0])

['I-MISC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'I-MISC',
 'O',
 'O',
 'O',
 'I-ORG',
 'I-ORG',
 'I-ORG',
 'I-ORG',
 'O']

In [26]:
def word2features(sent, i):
    """
    The function generates all features
    for the word at position i in the
    sentence.
    """
    word = sent[i][0]
    f = tokenize_encode_bert_sentences_sample(tokenizer_d, model_d, word)[0]
    features = {}
    for j in range(len(f)):
        features[str(j)] = f[j]
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [24]:
labels = [sent2labels1(s) for s in wiki]

In [29]:
features = [sent2features(s) for s in wiki]

In [None]:
features

In [392]:
# features[0:2]

In [438]:
haha = tokenize_encode_bert_sentences_sample(tokenizer_d, model_d, "haha")

In [437]:
a = sec[-2:]
a

[[('/', 'NN', '-', 'O'),
  ('s', 'NNS', '-', 'O'),
  ('/', ':', '-', 'O'),
  ('Bing', 'VBG', '-', 'I-PER'),
  ('Yu', 'NNP', '-', 'I-PER')],
 [('',)]]

In [442]:
# haha[0]
# haha_f = {}
# for j in range(len(haha[0])):
#     haha_f[str(j)] = haha[0][j]
# haha_f

# Model

https://towardsdatascience.com/named-entity-recognition-ner-meeting-industrys-requirement-by-applying-state-of-the-art-deep-698d2b3b4ede
https://www.depends-on-the-definition.com/sequence-tagging-lstm-crf/

In [28]:
tokenizer_d = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model_d = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [317]:
words_wiki, tags = unique_words_tags(wiki)
words_sec, tags = unique_words_tags(sec)
print(len(words_wiki), len(words_sec), len(words_wiki | words_sec))

8506 3513 10694


In [284]:
sent_lens = [len(x) for x in wiki]
print(max([len(x) for x in wiki]), max([len(x) for x in sec]))

144 413


In [460]:
max_len = 200
words_wiki.remove("")
words_sec.remove("")
word2idx = {w: i + 1 for i, w in enumerate(words_wiki | words_sec)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [291]:
from keras.utils import to_categorical
y_wiki = [to_categorical(i, num_classes=len(tags)) for i in y]
y_wiki[0][0]

y_sec = [to_categorical(i, num_classes=len(tags)) for i in y]
y_sec[0][0]

array([[1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.]], dtype=float32)

In [453]:
from sklearn.model_selection import train_test_split
train_wiki, test_wiki = train_test_split(wiki)

In [506]:
words_list.index("The")

3340

In [449]:
encoded_ner_corpus = tokenize_encode_bert_sentences(tokenizer_d, model_d, words_list, "../data/all_bert/encoded_ner_corpus")

In [504]:
tokenize_encode_bert_sentences_sample(tokenizer_d, model_d, ["The"])[0][:10]

array([-0.24184825, -0.14191352,  0.09101695, -0.10449045, -0.07760043,
       -0.04267633,  0.00831679,  0.27026916, -0.18425259, -0.22167444],
      dtype=float32)

In [503]:
encoded_ner_corpus[word2idx["The"]-1][:10]

array([-0.30622771, -0.13204339,  0.0283046 , -0.07296999, -0.01897933,
       -0.09043107,  0.10871619,  0.03657816, -0.14900076, -0.30661121])

In [477]:
ner_bert = np.load("../data/all_bert/encoded_ner_corpus.npy")

In [502]:
if (ner_bert[word2idx["Randall"]-1][:10] != encoded_ner_corpus[word2idx["Randall"]-1][:10]).any():
    print("hha")
else:
    print("right")

right


In [510]:
word2idx = {w: i for i, w in enumerate(words_list)}

In [512]:
word2idx['The']

3340

In [513]:
words = list(words_wiki | words_sec)

In [516]:
words.sort()
len(words), len(words_list)

(10693, 10693)

# Multiclass

In [15]:
>>> import numpy as np
>>> from sklearn.multiclass import OneVsRestClassifier
>>> from sklearn.svm import SVC
>>> X = np.array([
...     [10, 10],
...     [8, 10],
...     [-5, 5.5],
...     [-5.4, 5.5],
...     [-20, -20],
...     [-15, -20]
... ])
>>> y = np.array([0, 0, 1, 1, 2, 2])
>>> clf = OneVsRestClassifier(SVC(kernel="linear")).fit(X, y)
>>> clf.predict([[-19, -20], [9, 9], [-5, 5]])

array([2, 0, 1])

In [18]:
for i in range(3):
    print(clf.estimators_[i].coef_)

[[0.13738441 0.04755614]]
[[-0.22718445  0.21206928]]
[[-0.02586172 -0.0686952 ]]


In [86]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import BertTokenizer, BertModel, TokenClassificationPipeline

In [82]:
tokenizer_cased = BertTokenizer.from_pretrained('bert-base-cased')
model_cased = BertModel.from_pretrained('bert-base-cased')
tokenizer_ner = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model_ner = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

In [83]:
tokenizer_ner(["I am happy"])

{'input_ids': [[101, 146, 1821, 2816, 102]], 'token_type_ids': [[0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1]]}

In [118]:
encoded = tokenizer_cased(["I am happy"], return_tensors='pt', truncation=True, padding=True)
encoded

{'input_ids': tensor([[ 101,  146, 1821, 2816,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [120]:
model_cased(**encoded)[0]

tensor([[[ 0.3082,  0.1167, -0.0885,  ..., -0.0279,  0.3513, -0.1974],
         [ 0.1656, -0.2242,  0.1945,  ...,  0.3512,  0.0907,  0.4365],
         [ 0.1831,  0.0775, -0.1243,  ...,  0.3164, -0.1820,  0.2935],
         [ 0.2433,  0.0329,  0.1429,  ...,  0.4797,  0.1774,  0.1627],
         [ 0.5956,  0.4143, -0.5277,  ...,  0.2868,  0.6934, -0.1289]]],
       grad_fn=<NativeLayerNormBackward>)

In [124]:
model_ner.bert(**encoded)[0]

tensor([[[-0.1893, -0.3310,  0.7314,  ...,  0.4139,  0.4107,  0.1432],
         [-0.2228, -0.4651,  1.3479,  ...,  0.3291, -0.1585,  0.7715],
         [-0.2823, -0.3869,  1.1078,  ...,  0.4386,  0.1224,  0.4917],
         [-0.1279, -0.4046,  0.9786,  ...,  0.3558,  0.1769,  0.4220],
         [-0.8327, -0.3331,  0.8985,  ..., -0.5422,  0.1240, -0.2639]]],
       grad_fn=<NativeLayerNormBackward>)

# Test NER BERT

In [125]:
tokenizer_ner = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model_ner = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER").bert

In [139]:
encoded_input = tokenizer_ner("happy", return_tensors='pt', truncation=True, padding=True)
output = model_ner(**encoded_input)[0]
tokenizer_ner.convert_ids_to_tokens(encoded_input['input_ids'][0])

['[CLS]', 'happy', '[SEP]']

In [140]:
encoded_input1 = tokenizer_ner("happily", return_tensors='pt', truncation=True, padding=True)
output1 = model_ner(**encoded_input)[0]
tokenizer_ner.convert_ids_to_tokens(encoded_input1['input_ids'][0])

['[CLS]', 'happily', '[SEP]']

In [143]:
encoded_input_cased = tokenizer_cased("happy", return_tensors='pt', truncation=True, padding=True)
output_cased = model_cased(**encoded_input_cased)[0]
tokenizer_cased.convert_ids_to_tokens(encoded_input_cased['input_ids'][0])

['[CLS]', 'happy', '[SEP]']

In [144]:
encoded_input_cased1 = tokenizer_cased("happily", return_tensors='pt', truncation=True, padding=True)
output_cased1 = model_cased(**encoded_input_cased)[0]
tokenizer_cased.convert_ids_to_tokens(encoded_input_cased1['input_ids'][0])

['[CLS]', 'happily', '[SEP]']