### I tried the BASIC BERT model and tune the bias, the accuracy is 0.6. In this model, the tokenized sentence is : [sep] sentence1 [sep] sentence2 [sep]. One potential change could be [sep] sentence1 [sep] word [sep] sentence2 [sep]. But tokenize three sentences needed to be done by hand. I also believe there is another form of BERT model

In [79]:
import json
import torch
import transformers as tf
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

import warnings 
warnings.filterwarnings('ignore')

In [2]:
with open('./data/train.jsonl', 'r') as js_file:
    js_list = list(js_file)
line=js_list[0]
sample=json.loads(line)
sample

{'word': 'place',
 'sentence1': 'Do you want to come over to my place later?',
 'sentence2': 'A political system with no place for the less prominent groups.',
 'idx': 0,
 'label': False,
 'start1': 31,
 'start2': 27,
 'end1': 36,
 'end2': 32,
 'version': 1.1}

In [33]:
'''
Some settings
Load sentence in to numpy. The symbol '+' here denotes seperation
'''
label=np.array([json.loads(i)["label"] for i in js_list]) #load label
sen1=np.array([json.loads(i)["sentence1"] for i in js_list])
sen2=np.array([json.loads(i)["sentence2"] for i in js_list])
sep=np.array([' + ']*len(sen1))# sep array
sen_full=np.char.add(sen1,sep)
sen_full=np.char.add(sen_full,sen2)
words=np.array([json.loads(i)["word"] for i in js_list])

In [36]:
sen_extra=np.char.add(sen1,sep)
sen_extra=np.char.add(sen_extra,words)
sen_extra=np.char.add(sen_extra,sep)
sen_extra=np.char.add(sen_extra,sen1)
sen_extra[0]

'Do you want to come over to my place later? + place + Do you want to come over to my place later?'

In [81]:
#Load pre trained BERT in the package transformer
model_class, tokenizer_class, pretrained_weights=(tf.BertModel, tf.BertTokenizer, 'bert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [82]:
'''
Tokenize two sentences with int
'''
tokenized_sen=np.array([tokenizer.encode\
                        (sen1[i],sen2[i],add_special_tokens=True) \
                        for i in range(5000)])
tokenized_sen_extra=np.array([tokenizer.encode\
                              (sen_extra[i],add_special_tokens=True) \
                        for i in range(5000)])

In [6]:
'''
Padding
Align the length to the max length
'''
def find_max(npa):
    max_len = 0
    for i in npa:
        if len(i) > max_len:
            max_len = len(i)
    return(max_len)
def padding(npa):
    max_len=find_max(npa)
    return (np.array([i + [0]*(max_len-len(i)) for i in npa]))

In [7]:
token_sen_afp=padding(tokenized_sen)
mask = np.where(token_sen_afp != 0, 1, 0)#1 is non padding part 0 o.w
inp = torch.tensor(token_sen_afp).to(torch.int64)#convert to tensor and in64 for the BERT transformer
mask = torch.tensor(mask)

In [88]:
'''
I export the features as txt, unannotate the code if you want to run it. 
Require at least 4GB RAM and I cost about half an hour to run.
'''

"""
with torch.no_grad():
    outcome = model(inp,mask)

features = outcome[0][:,0,:].numpy()

np.savetxt('CLS', features)
"""

features=np.loadtxt('CLS')

In [89]:
features.shape

(5000, 768)

## Logistics Regression for Train

In [198]:
'Regularization '
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(features, label)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 5.263252631578947}
best scrores:  0.6532


In [207]:
c=grid_search.best_params_['C']
c

5.263252631578947

In [208]:
lgR = LogisticRegression(penalty='l2',C=c,max_iter=1000,fit_intercept = False)#here's our train
lgR.fit(features, label)

LogisticRegression(C=5.263252631578947, fit_intercept=False, max_iter=1000)

In [210]:
lgR.score(features,label)

0.7662

## DEV

In [55]:
with open('./data/dev.jsonl', 'r') as test_file:
    test_list = list(test_file)
t_line=test_list[0]
sample=json.loads(t_line)
sample

{'word': 'pull',
 'sentence1': 'The pull of the current.',
 'sentence2': 'Iron fillings drawn by the pull of a magnet.',
 'idx': 5000,
 'label': True,
 'start1': 4,
 'start2': 27,
 'end1': 8,
 'end2': 31,
 'version': 1.1}

In [59]:
with open('./data/test.jsonl', 'r') as dev_file:
    dev_list = list(dev_file)
dev_line=dev_list[0]
devsample=json.loads(dev_line)
devsample

{'word': 'class',
 'sentence1': 'An emerging professional class.',
 'sentence2': 'Apologizing for losing your temper, even though you were badly provoked, showed real class.',
 'idx': 0,
 'label': False,
 'start1': 25,
 'start2': 85,
 'end1': 30,
 'end2': 90,
 'version': 1.1}

In [60]:
dsen1=np.array([json.loads(i)["sentence1"] for i in dev_list])
dsen2=np.array([json.loads(i)["sentence2"] for i in dev_list])
dev_label=np.array([json.loads(i)["label"] for i in dev_list]) 

In [95]:
dev_tokenized_sen=np.array([tokenizer.encode\
                        (dsen1[i],dsen2[i],add_special_tokens=True) \
                        for i in range(len(dsen1))])
dev_token_sen_afp=padding(dev_tokenized_sen)
dev_mask = np.where(dev_token_sen_afp != 0, 1, 0)
dev_inp = torch.tensor(dev_token_sen_afp).to(torch.int64)
dev_mask = torch.tensor(dev_mask)

In [97]:
with torch.no_grad():
    dev_outcome = model(dev_inp,dev_mask)

In [99]:
dev_features = dev_outcome[0][:,0,:].numpy()

In [232]:
def AW2S(features,label,threshold,model): #another way to score
    a=(model.predict_proba(features)[:,1]>threshold) == label
    b=pow(a,2)
    return(sum(b)/len(b))

def searchBias(linespace,features,label,model):
    maxscore=0
    maxbias=0
    for i in linespace:
        score=AW2S(features,label,i,model)
        if(score>maxscore):
            maxscore=score
            maxbias=i
    print('MaxScore is ',maxscore)
    return(maxbias)

sg=np.linspace(0.0001, 1, 50)

bias=searchBias(sg,dev_features,dev_label,lgR)

MaxScore is  0.6018808777429467


## TEST

In [15]:
tsen1=np.array([json.loads(i)["sentence1"] for i in test_list])
tsen2=np.array([json.loads(i)["sentence2"] for i in test_list])
test_label=np.array([json.loads(i)["label"] for i in test_list]) 

In [112]:
test_tokenized_sen=np.array\
([tokenizer.encode(tsen1[i],tsen2[i],add_special_tokens=True) for i in range(len(tsen1))]);

In [17]:
test_token_sen_afp=padding(test_tokenized_sen)
test_mask = np.where(test_token_sen_afp != 0, 1, 0)
test_inp = torch.tensor(test_token_sen_afp).to(torch.int64)
test_mask = torch.tensor(test_mask)

In [18]:
test_inp.shape

torch.Size([638, 60])

In [19]:
with torch.no_grad():
    test_outcome = model(test_inp,test_mask)

In [20]:
test_features = test_outcome[0][:,0,:].numpy()

In [213]:
lgR.score(test_features,test_label)#with BERT model, no-tunning, the best accuracy is about 60%

0.5893416927899686

In [233]:
AW2S(test_features,test_label,bias,lgR)

0.6018808777429467