In [122]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import numpy as np
from datasets import load_dataset, load_metric
import matplotlib.pyplot as plt
import pickle
from sklearn.linear_model import LogisticRegression
from seqeval.metrics import classification_report

In [74]:
def get_embedding(text,tokenizer,model,is_split_into_words=True,layer=12):
    """Return one hidden-state layer of `model` for `text` as a NumPy array.

    Args:
        text: Input passed straight to `tokenizer` (a list of words when
            `is_split_into_words` is True, otherwise a plain string).
        tokenizer: Callable tokenizer; called with `return_tensors="pt"`.
        model: Model whose output exposes `hidden_states` (i.e. it must have
            been loaded with `output_hidden_states=True`).
        is_split_into_words: Forwarded to the tokenizer.
        layer: Index into `outputs.hidden_states`. The default of 12 keeps the
            original behaviour (presumably the last encoder layer of a
            12-layer BERT — confirm when using another architecture).

    Returns:
        float64 array holding `hidden_states[layer]` of the first (only) batch
        element, one row per tokenizer position (special tokens included).
        Convert back with `torch.from_numpy(output).float()`.
    """
    inputs = tokenizer(text, return_tensors="pt", is_split_into_words=is_split_into_words)
    # Only the hidden states are needed: no dummy labels / loss, and no
    # autograd graph for what is pure inference.
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.hidden_states[layer][0]
    # float64 matches what the previous `.tolist()` round trip produced.
    return hidden.detach().cpu().numpy().astype(np.float64)
    
def ht_lr(train_features, train_labels, test_features):
    """Align target-domain (test) features to the source (train) domain.

    A logistic regression fitted on the source data pseudo-labels the target
    rows; the most confidently positive and most confidently negative 10% are
    used to estimate the target's positive-minus-negative direction, which
    `hh_lr` then maps onto the corresponding source direction.

    Args:
        train_features: Source feature rows, shape (n_train, dim).
        train_labels: Binary labels (1 = positive, 0 = negative), one per row.
        test_features: Target feature rows, shape (n_test, dim).

    Returns:
        Array of transformed target features, same shape as `test_features`.
    """
    train_features = np.asarray(train_features)
    train_labels = np.asarray(train_labels)
    test_features = np.asarray(test_features)

    lr_clf = LogisticRegression(max_iter = 10000)
    lr_clf.fit(train_features, train_labels)

    # Column 0 is the probability of the first class in `lr_clf.classes_`
    # (label 0 = negative), so ascending order puts likely positives first.
    neg_prob = lr_clf.predict_proba(test_features)[:, 0]
    order = np.argsort(neg_prob, kind="stable")

    # Take 10% from each end, but never zero rows: with fewer than 10 samples
    # the old `[:0]` / `[-0:]` slices gave an empty positive set and the whole
    # data set as negatives, which produced NaN means.
    n_select = max(1, len(test_features) // 10)
    targetPos = test_features[order[:n_select]]
    targetNeg = test_features[order[-n_select:]]

    sourcePos = train_features[train_labels == 1]
    sourceNeg = train_features[train_labels == 0]

    # Positive-minus-negative direction in each domain.
    v = np.mean(sourcePos, axis=0) - np.mean(sourceNeg, axis=0)
    u = np.mean(targetPos, axis=0) - np.mean(targetNeg, axis=0)
    # Domain centroids used for the final translation.
    c1 = np.mean(test_features, axis=0)
    c2 = np.mean(np.concatenate([sourcePos, sourceNeg], axis=0), axis=0)

    return hh_lr(u, v, c1, c2, test_features)


def hh_lr(u, v, c1, c2, points):
    """Map `points` so that direction `u` / centre `c1` land on `v` / `c2`.

    Steps: scale by |v|/|u|, apply the Householder reflection that sends the
    unit vector of `u` onto the unit vector of `v`, then translate so the
    transformed `c1` coincides with `c2`.

    Args:
        u: Direction vector in the space of `points`, shape (dim,).
        v: Direction vector in the reference space, shape (dim,).
        c1: Centre of `points`, shape (dim,).
        c2: Centre of the reference data, shape (dim,).
        points: Rows to transform, shape (n, dim).

    Returns:
        Array of transformed points, shape (n, dim).

    Raises:
        ValueError: If `u` or `v` has zero length (no direction to align).
    """
    u = np.asarray(u)
    v = np.asarray(v)
    c1 = np.asarray(c1)
    c2 = np.asarray(c2)
    points = np.asarray(points)

    u_mag = np.linalg.norm(u)
    v_mag = np.linalg.norm(v)
    if u_mag == 0 or v_mag == 0:
        # Previously this divided by zero and silently returned NaN/inf.
        raise ValueError("hh_lr requires non-zero direction vectors u and v")
    u_unit = u / u_mag
    v_unit = v / v_mag

    # Scaling so pos-neg vectors have the same magnitude
    scaled_points = points * v_mag / u_mag
    scaled_c1 = c1 * v_mag / u_mag

    # getting dimension of vector space
    k = len(c2)

    # Isometric linear transformation: Householder reflection about the
    # hyperplane orthogonal to (u_unit - v_unit).
    w = u_unit - v_unit
    w_sq = np.inner(w, w)
    if w_sq < np.finfo(np.float64).eps ** 2:
        # Directions already coincide: the reflection is undefined (0/0),
        # and no rotation is needed.
        A = np.eye(k)
    else:
        A = np.eye(k) - (2 * (np.outer(w, w) / w_sq))

    # applying isometric transformation
    points_after_isometric = scaled_points @ A.T
    c1_after_isometric = scaled_c1 @ A.T

    # translation
    return points_after_isometric + (c2 - c1_after_isometric)

In [3]:
# BERT token-classification checkpoint trained on CoNLL2003.
NER_CHECKPOINT = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
# output_hidden_states=True makes every forward pass expose `hidden_states`,
# which get_embedding reads.
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT, output_hidden_states=True)

In [168]:
# BERT example: run one sentence through the model and look at loss / logits.
inputs = tokenizer("embedding is good", return_tensors="pt")
seq_len = inputs["input_ids"].size(1)
# Dummy label 1 at every position, batch size 1.
labels = torch.ones((1, seq_len), dtype=torch.long)

outputs = model(**inputs, labels=labels)
loss, logits = outputs.loss, outputs.logits
# logits are equal to model.classifier(outputs.hidden_states[12])
# and to model.classifier(torch.from_numpy(get_embedding("embedding is good",tokenizer,model,is_split_into_words=False)).float())

In [174]:
# Display the tokenizer's sub-tokens (special tokens included) for the example input ids.
tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

['[CLS]', 'em', '##bed', '##ding', 'is', 'good', '[SEP]']

In [5]:
# Load CoNLL2003 and read the NER tag names (list position = tag id).
datasets = load_dataset("conll2003")
ner_feature = datasets["train"].features["ner_tags"]
label_list = ner_feature.feature.names

Reusing dataset conll2003 (C:\Users\zhw027\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


In [37]:
# Display the NER tag names taken from the dataset's `ner_tags` feature.
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [6]:
# Display the first training example to inspect the record layout.
datasets["train"][0]

{'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'id': '0',
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.']}

In [7]:
# String NER tags per sentence: each integer tag id is looked up in label_list.
conll_tags_train = [[label_list[tag_id] for tag_id in example['ner_tags']] for example in datasets["train"]]
conll_tags_val = [[label_list[tag_id] for tag_id in example['ner_tags']] for example in datasets["validation"]]
conll_tags_test = [[label_list[tag_id] for tag_id in example['ner_tags']] for example in datasets["test"]]

# Word lists per sentence, in the same order as the tag lists above.
conll_tokens_train = [example['tokens'] for example in datasets["train"]]
conll_tokens_val = [example['tokens'] for example in datasets["validation"]]
conll_tokens_test = [example['tokens'] for example in datasets["test"]]

In [72]:
def _embed_sentences(sentences):
    """Embed every sentence (a list of words) with get_embedding.

    Prints the sentence index every 100 sentences as a progress indicator and
    returns one embedding array per sentence, in input order.
    """
    embeddings = []
    for i, sentence_tokens in enumerate(sentences):
        embeddings.append(get_embedding(sentence_tokens,tokenizer,model))
        if i%100 == 0:
            print(i)
    return embeddings


# One helper call per split instead of three copies of the same loop.
conll_emb_train = _embed_sentences(conll_tokens_train)
conll_emb_val = _embed_sentences(conll_tokens_val)
conll_emb_test = _embed_sentences(conll_tokens_test)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400


In [73]:
# Bundle words, string tags and embeddings of every split into one dictionary.
conll = {
    'train': {
        'tokens': conll_tokens_train,
        'tags': conll_tags_train,
        'emb': conll_emb_train,
    },
    'val': {
        'tokens': conll_tokens_val,
        'tags': conll_tags_val,
        'emb': conll_emb_val,
    },
    'test': {
        'tokens': conll_tokens_test,
        'tags': conll_tags_test,
        'emb': conll_emb_test,
    },
}

In [32]:
## Save pickle: cache the CoNLL2003 tokens, tags and embeddings on disk.
with open("../data/ner/CoNLL2003.pickle","wb") as pickle_file:
    pickle.dump(conll, pickle_file)

# ## Load pickle (only unpickle files you created yourself)
# with open("../data/ner/CoNLL2003.pickle","rb") as pickle_file:
#     conll = pickle.load(pickle_file)



In [177]:
# NOTE(review): this cell's source looks truncated. A bare `[0]` evaluates to the
# list [0], yet the recorded output is the word list of the first CoNLL training
# sentence. Presumably the original indexed a token list
# (e.g. conll['train']['tokens'][0]) — confirm and restore.
[0]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

# load tech_test data

In [180]:
# Total number of embedding rows (one per tokenizer position) over all training sentences.
sum(len(sentence_emb) for sentence_emb in conll['train']['emb'])

300677

In [146]:
# Parse the target-domain file: one "<token> <tag>" pair per line, with an
# empty line separating sentences.
tech_test = {'test': {'tokens': [], 'tags': [], 'emb': []}}

with open('../data/ner/tech_test.txt', 'r') as f:
    tokens = []
    tags = []
    for line in f:
        if line == '\n':
            # Sentence boundary: store the collected sentence and start a new one.
            tech_test['test']['tokens'].append(tokens)
            tech_test['test']['tags'].append(tags)
            tokens = []
            tags = []
            continue

        # Strip only the line terminator. The old blind `[:-1]` removed a real
        # tag character when the line had no trailing newline.
        fields = line.rstrip('\n').split(' ')
        if fields[0] == '':
            continue  # line starts with a space: no token to read

        tokens.append(fields[0])
        tag = fields[1]
        # Map E-/S- prefixed tags onto I-/B- so they use the B/I/O scheme of label_list.
        if tag[:1] == 'E':
            tag = 'I'+tag[1:]
        elif tag[:1] == 'S':
            tag = 'B'+tag[1:]
        tags.append(tag)

    # Keep the final sentence when the file does not end with an empty line
    # (it used to be dropped silently).
    if tokens:
        tech_test['test']['tokens'].append(tokens)
        tech_test['test']['tags'].append(tags)

In [147]:
# Embed every target-domain sentence. The list is built locally and assigned
# once, so re-running this cell replaces the embeddings instead of appending
# a second copy to the list created in the loading cell.
_sentence_embs = []
for i, sentence_tokens in enumerate(tech_test['test']['tokens']):
    _sentence_embs.append(get_embedding(sentence_tokens,tokenizer,model))
    if i%100 == 0:
        print(i)
tech_test['test']['emb'] = _sentence_embs

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900


In [148]:
## Save pickle: cache the tech_test tokens, tags and embeddings on disk.
with open("../data/ner/tech_test.pickle","wb") as pickle_file:
    pickle.dump(tech_test, pickle_file)

# ## Load pickle (only unpickle files you created yourself)
# with open("../data/ner/tech_test.pickle","rb") as pickle_file:
#     tech_test = pickle.load(pickle_file)

# Householder Transformation

In [201]:
# Display the tag names again; the cell below runs one alignment per entry.
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [282]:
def _flatten_split(split):
    """Flatten one data split to corpus-level lists.

    Args:
        split: Dict with per-sentence lists under 'tokens', 'tags' and 'emb'.

    Returns:
        Tuple (all_tokens, all_tags, all_subtokens, first_subtoken_idx, all_emb):
        words and tags concatenated over sentences, the sub-token sequence
        with [CLS]/[SEP] around every sentence, for each word the position of
        its first sub-token in that sequence, and the embedding rows
        concatenated over sentences.
    """
    all_tokens = []
    all_tags = []
    all_subtokens = []
    first_subtoken_idx = []
    position = 0
    for sent_idx, sent_tokens in enumerate(split['tokens']):
        all_tokens += sent_tokens
        all_tags += split['tags'][sent_idx]
        position += 1  # [CLS] in front of the sentence
        sent_subtokens = []
        for token in sent_tokens:
            ids = tokenizer([token], return_tensors="pt",is_split_into_words=True)["input_ids"][0]
            # Drop the [CLS]/[SEP] the tokenizer adds around the single word.
            pieces = tokenizer.convert_ids_to_tokens(ids)[1:-1]
            first_subtoken_idx.append(position)
            position += len(pieces)
            sent_subtokens += pieces
        position += 1  # [SEP] after the sentence
        all_subtokens += ['[CLS]'] + sent_subtokens + ['[SEP]']

    all_emb = [token_emb for sent in split['emb'] for token_emb in sent]
    return all_tokens, all_tags, all_subtokens, first_subtoken_idx, all_emb


test_features_ht = {}
train_data = conll['train']
test_data = tech_test['test']

# preprocessing
## source (CoNLL) sub-tokens, word -> first-sub-token map and embedding rows
(train_all_tokens, train_all_tags, train_all_subtokens,
 train_subtoken_map, train_all_emb) = _flatten_split(train_data)

## target (tech_test) sub-tokens, word -> first-sub-token map and embedding rows
(test_all_tokens, test_all_tags, test_all_subtokens,
 test_subtoken_map, test_all_emb) = _flatten_split(test_data)

# Every word is represented by the embedding of its first sub-token. These
# matrices do not depend on the label, so they are built once instead of
# once per label.
train_features = np.array([train_all_emb[map_idx] for map_idx in train_subtoken_map])
test_features = np.array([test_all_emb[map_idx] for map_idx in test_subtoken_map])

# divide label by label: one-vs-rest labels and one alignment per tag
for target_label in label_list:
    train_labels = np.array([1 if tag == target_label else 0 for tag in train_all_tags])
    test_labels = [1 if tag == target_label else 0 for tag in test_all_tags]

    # householder transformation
    test_features_ht[target_label] = ht_lr(train_features, train_labels, test_features)
    print(target_label)

O
B-PER
I-PER
B-ORG
I-ORG
B-LOC
I-LOC
B-MISC
I-MISC


In [283]:
# Sanity check: feature and label counts should match within each domain.
for sized in (train_features, train_labels, test_features, test_labels):
    print(len(sized))

203621
203621
54070
54070


In [284]:
## Save pickle
with open("../data/ner/test_features_ht.pickle","wb") as fw:
    pickle.dump(test_features_ht, fw)

# ## Load pickle
# with open("../data/ner/test_features_ht.pickle","rb") as fr:
#     test_features_ht = pickle.load(fr)

# label prediction

In [293]:
# with ht
# For every word and every label: classify the embedding aligned for that
# label and keep only that label's softmax probability; the predicted tag is
# the label with the highest such probability.
sm = torch.nn.Softmax(dim = 0)  # created once instead of once per classifier call
prediction_ht = []
with torch.no_grad():  # inference only, no autograd bookkeeping needed
    for index in range(len(test_features_ht['O'])):
        tag_prob_layer = []
        for tag_index, target_label in enumerate(label_list):
            token_emb = test_features_ht[target_label][index] # get ht-ed token embedding
            token_emb = torch.from_numpy(token_emb).float() # make it as torch
            logit = model.classifier(token_emb) # pass emb into finetuned classifier to get logit
            softmax = sm(logit)
            tag_prob_layer.append(softmax.tolist()[tag_index]) # only take probability of target tag
        predict_tag_index = np.argmax(tag_prob_layer)
        predict_tag = label_list[predict_tag_index]
        prediction_ht.append(predict_tag)
        if index%10000 == 0:
            print(index)

0
10000
20000
30000
40000
50000


In [306]:
# Number of gold 'I-LOC' tags in the target test data.
test_all_tags.count('I-LOC')

140

In [290]:
# Per-entity-type precision / recall / F1 for the Householder-aligned predictions.
# NOTE(review): the whole corpus is passed as one sequence, so an entity could be
# chained across a sentence boundary — confirm this is intended.
report_ht = classification_report([test_all_tags], [prediction_ht])
print(report_ht)

              precision    recall  f1-score   support

         LOC       0.02      0.01      0.02       489
        MISC       0.09      0.15      0.11       365
         ORG       0.22      0.48      0.30       873
         PER       0.01      0.01      0.01      1094

   micro avg       0.13      0.17      0.15      2821
   macro avg       0.08      0.16      0.11      2821
weighted avg       0.09      0.17      0.11      2821



In [296]:
# without ht
# Baseline: classify the raw (untransformed) first-sub-token embedding of every word.
tech_test_emb = [test_all_emb[i] for i in test_subtoken_map]
sm = torch.nn.Softmax(dim = 0)  # created once instead of once per word
prediction = []
with torch.no_grad():  # inference only, no autograd bookkeeping needed
    for index in range(len(tech_test_emb)):
        token_emb = tech_test_emb[index] # get raw token embedding (no ht applied)
        token_emb = torch.from_numpy(token_emb).float() # make it as torch
        logit = model.classifier(token_emb) # pass emb into finetuned classifier to get logit
        softmax = sm(logit)
        predict_tag_index = np.argmax(softmax.tolist())
        predict_tag = label_list[predict_tag_index]
        prediction.append(predict_tag)
        if index%10000 == 0:
            print(index)

0
10000
20000
30000
40000
50000


In [297]:
# Per-entity-type precision / recall / F1 for the baseline (no Householder alignment).
# NOTE(review): the whole corpus is passed as one sequence, so an entity could be
# chained across a sentence boundary — confirm this is intended.
report_baseline = classification_report([test_all_tags], [prediction])
print(report_baseline)

              precision    recall  f1-score   support

         LOC       0.01      0.02      0.01       489
        MISC       0.10      0.17      0.12       365
         ORG       0.01      0.02      0.01       873
         PER       0.00      0.00      0.00      1094

   micro avg       0.03      0.03      0.03      2821
   macro avg       0.03      0.05      0.04      2821
weighted avg       0.02      0.03      0.02      2821

