In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig
from transformers import AutoTokenizer, AutoConfig
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import transformers
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import re
import random

import json
import nltk
import csv
from nltk.tokenize import word_tokenize

torch.__version__, torch.cuda.is_available(), transformers.__version__

('1.7.1+cu110', True, '4.2.2')

In [258]:
MAX_LEN = 100
BATCH_SIZE = 64
# PRETRAIN_MODEL = 'bert-base-cased' #'../input/pytorch-bert-ner-2/pytorch_bert_ner_model'
TRUNCATING_TYPE = 'pre'
PADDING_TYPE = 'post'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_gpu = torch.cuda.device_count()
print(device)
print(n_gpu)

# tokenizer = BertTokenizer.from_pretrained(PRETRAIN_MODEL)
tokenizer = AutoTokenizer.from_pretrained('C:\\Users\\stick\\kaggle\\tokenizer')
model = BertForTokenClassification.from_pretrained('C:\\Users\\stick\\kaggle\\pytorch_bert_ner_model_v2')

# model = BertForTokenClassification.from_pretrained('../input/pytorch-bert-ner/pytorch_bert_ner_model')
model = model.to(device)

tag_values = ['O', 'B-D', 'I-D', 'PAD']

def clean_text(txt):
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower())

def correct_word_broken(tu):
    for i in range(len(tu)):
        (token, tag) = tu[i]
        if i > 0:
            (previous_token, previous_tag) = tu[i-1]
            if previous_tag == 'B-D' and token.startswith('##'):
                tu[i] = (token, previous_tag)
    return tu

def remove_super_tags(tags):
    new_set = set()
    for s in tags:
        remove = False
        for ss in tags:
            if ss in s and s > ss:
                remove = True
                break
        if not remove:
            new_set.add(s)

    return new_set

def get_dataset_tags(tokenized_sentence, pred_tags):
    dataset_names = set()
    dataset_name = ''
    tu = correct_word_broken(list(zip(tokenized_sentence, pred_tags)))
#     for t in tu:
#         print(t)
    found_start = False
    for (token, tag) in tu:
        if not found_start and tag == 'B-D' and not token.startswith('##'): # Found the starting position
            dataset_name += token
            found_start = True
            continue
        
        if found_start:
            if tag == 'B-D' or tag == 'I-D':
                if token.startswith('##'):
                    dataset_name += token.replace('##', '')
                else:
                    dataset_name += ' ' + token
            else:
                found_start = False
                dataset_name = dataset_name.strip()
                if (dataset_name[0] >= 'A' and dataset_name[0] <= 'Z' or dataset_name[0] >= 'a' and dataset_name[0] <= 'z') and not dataset_name.startswith('and '): 
                    dataset_names.add(clean_text(dataset_name))
                    dataset_name = ''
    return dataset_names

def is_sentence_worth_predict(words):
    count = 0
    for w in words:
        if len(w) > 0 and w[0] >= 'A' and w[0] <= 'Z':
            count += 1
            if count >= 3:
                return True
            
    return False

cuda
1


In [242]:
test_df = pd.read_csv(
    './sentence_test.csv',
    index_col=None,
    header='infer',
    keep_default_na=False,
#     names=['fid', 'sid', 'sentence', 'labels', 'dedup_labels'],
    dtype={
        'fid': 'str',
        'sid': 'str', 
        'sentence': 'str',
        'labels': 'str',
        'dedup_labels': 'str'
    },
)

In [243]:
class ScoringDataset:
    def __init__(self, fids, sids, input_ids, masks):
        self.fids = fids
        self.sids = sids
        self.input_ids = input_ids
        self.masks = masks
        
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, item):
        b_fids = self.fids[item]
        b_sids = self.sids[item]
        b_input_ids = self.input_ids[item]
        b_masks = self.masks[item]
        
#         print(b_fids)
#         print(b_sids)
#         print(b_input_ids)
#         print(b_tokenized_sentences)
#         print(b_masks)

        return {
            "b_fids": b_fids,
            "b_sids": b_sids,
            "b_input_ids": torch.tensor(b_input_ids, dtype=torch.long),
            "b_masks": torch.tensor(b_masks, dtype=torch.float)
        }

In [244]:
# Prepare bert tokenized sentences
tokenized_sentences = []
fids = []
sids = []

for index, row in test_df.iterrows():
    fid = row['fid']
    sid = row['sid']
    sentence = row['sentence']
    tokenized_sentence = tokenizer.tokenize(sentence)
    tokenized_sentences.append(tokenized_sentence)
    fids.append(fid)
    sids.append(sid)

input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_sentences], 
    maxlen=MAX_LEN, 
    dtype='long', 
    value=0.0, 
    truncating=TRUNCATING_TYPE, 
    padding=PADDING_TYPE
)

attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

scoring_dataset = ScoringDataset(
    fids=fids,
    sids=sids,
    input_ids=input_ids,
    masks=attention_masks,
)

scoring_dataloader = DataLoader(
    scoring_dataset,
    batch_size=BATCH_SIZE,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (1506 > 512). Running this sequence through the model will result in indexing errors


In [245]:
len(scoring_dataloader), len(test_df)

(755, 48312)

In [246]:
for d in scoring_dataloader:
    print(d)
    break

{'b_fids': ['dc107d04-1082-426a-b616-e755486c8627', '430aa11c-0283-411b-8edc-08f5df3db258', '14848a9b-13a7-41c3-b49f-ba725695e497', '5c3de175-0bcf-4165-a5c3-c3ced789d2f6', 'da369ee6-2d60-402c-b3bf-05f44df77c78', 'c342d0c4-abe5-4c33-af32-257ef120d932', '7fd5e4c7-4577-4b2a-8f3f-2849ccb92ec6', '5509b730-b7d5-459f-aa6b-72d84a81e97f', '170113f9-399c-489e-ab53-2faf5c64c5bc', '81eaf522-1f9f-48b6-9cfa-8e586b8d664a', '930f4366-2fcd-49a8-b75e-6b0864cb2b3e', '4a2bd823-e642-4ad9-b8e6-796ae4db9734', '170113f9-399c-489e-ab53-2faf5c64c5bc', 'ccb0c3a8-9492-4c28-aa38-25a1fb40d6d5', 'be69c37f-95de-429e-ba8c-70a698fcae03', '5f735753-787c-4c42-bd04-877f8a8c5464', '0cbde738-c072-47e7-92b5-b6ccb5661f2d', '5ef54d76-d304-4fd3-8ea3-cc89ea261405', '0d939d09-bd11-4c97-840e-43076458f11d', 'd8621959-e8cd-479d-9182-b5d741de5b8d', 'e1aac6e3-c1d0-498c-a2bd-a7b97464590d', '34a240e7-bd70-4d96-a9f6-007db33d6bd8', 'a282c913-0477-442d-8006-690ca3b5e34c', 'e63cf45b-4da7-49d1-8d55-698f4fbb9bd7', '61bdb1f4-b3ea-4d12-b4b3-400

In [272]:
def predict_dl(scoring_dataloader):
    predicted_tags = []
    print(f'In total {len(scoring_dataloader)} batches to process')
    processed = 0
    model.eval()
    for batch in scoring_dataloader:
        b_fids = batch['b_fids']
        b_sids = batch['b_sids']
        b_input_ids = batch['b_input_ids'].to(device).to(torch.int64)
        b_input_masks = batch['b_masks'].to(device).to(torch.int64)
        
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_masks)
        logits = outputs[0].detach().cpu().numpy()
#         print(logits)
        predictions = [list(p) for p in np.argmax(logits, axis=2)]
#         print(predictions)
        pred_tags = []
        for r in predictions:
            pred_tags.append([tag_values[ri] for ri in r])
    
        for fid, sid, pt in zip(b_fids, b_sids, pred_tags):
            sentence = test_df[test_df['sid']==sid]['sentence'].values[0]
            if 'B-D' in pt or 'I-D' in pt:
                tokenized_sentence = tokenizer.tokenize(sentence)
                dataset_names = get_dataset_tags(tokenized_sentence, pt)
                predicted_tags.append((fid, sid, '|'.join(dataset_names)))
            else:
                if "Alzheimer's Disease Neuroimaging Initiative (ADNI)" in sentence:
                    predicted_tags.append((fid, sid, clean_text("Alzheimer's Disease Neuroimaging Initiative (ADNI)")))
                elif 'ADNI' in sentence:
                    predicted_tags.append((fid, sid, 'adni'))
                else:
                    predicted_tags.append((fid, sid, ''))
                
        processed += 1
        if processed % 100 == 0:
            print(f'{processed} processed')
            
    return predicted_tags

In [270]:
def predict_df(test_df, debug=False):
    predicted_tags = []
    print(f'In total {len(test_df)} sentences to process')
    processed = 0
    for index, row in test_df.iterrows():
        sentence = row['sentence']
        
        words = word_tokenize(sentence)
#         if not is_sentence_worth_predict(words):
#             continue  
        tokenized_sentence = []
        for w in words:
            tokenized_word = tokenizer.tokenize(w)
            tokenized_sentence.extend(tokenized_word)

        input_ids = pad_sequences(
            [tokenizer.convert_tokens_to_ids(tokenized_sentence)], 
            maxlen=MAX_LEN, 
            dtype='long', 
            value=0.0, 
            truncating=TRUNCATING_TYPE, 
            padding=PADDING_TYPE
        )

        attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

        input_ids = torch.tensor(input_ids, dtype=torch.long)
        attention_masks = torch.tensor(attention_masks, dtype=torch.float)

        model.eval()
        b_input_ids = input_ids.to(device).to(torch.int64)
        b_input_mask = attention_masks.to(device).to(torch.int64)

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        logits = outputs[0].detach().cpu().numpy()
        predictions = np.argmax(logits, axis=2)[0]
        pred_tags = [tag_values[w] for w in predictions]
        if debug:
            for p1, p2 in zip(tokenized_sentence, pred_tags):
                print(p1, p2)
        if 'B-D' in pred_tags or 'I-D' in pred_tags:
            dataset_names = get_dataset_tags(tokenized_sentence, pred_tags)
            predicted_tags.append('|'.join(dataset_names))
#             pred_dataset_tags.update(dataset_names)
#             print(f'{s}: {dataset_names}')
        else:
            if "Alzheimer's Disease Neuroimaging Initiative (ADNI)" in sentence:
                predicted_tags.append((fid, sid, clean_text("Alzheimer's Disease Neuroimaging Initiative (ADNI)")))
            elif 'ADNI' in sentence:
                predicted_tags.append((fid, sid, 'adni'))
            else:
                predicted_tags.append((fid, sid, ''))
        
        processed += 1
        if processed % 1000 == 0:
            print(f'{processed} processed')
      
    return predicted_tags
#     return remove_super_tags(pred_dataset_tags)

In [271]:
import time

start = time.time()
pred_v1 = predict_df(test_df)
end = time.time()
print(f'dataframe verion runtime: {end-start}')

In total 48312 sentences to process
1000 processed
2000 processed
3000 processed
4000 processed
5000 processed
6000 processed
7000 processed
8000 processed
9000 processed
10000 processed
11000 processed
12000 processed
13000 processed
14000 processed
15000 processed
16000 processed
17000 processed
18000 processed
19000 processed
20000 processed
21000 processed
22000 processed
23000 processed
24000 processed
25000 processed
26000 processed
27000 processed
28000 processed
29000 processed
30000 processed
31000 processed
32000 processed
33000 processed
34000 processed
35000 processed
36000 processed
37000 processed
38000 processed
39000 processed
40000 processed
41000 processed
42000 processed
43000 processed
44000 processed
45000 processed
46000 processed
47000 processed
48000 processed
dataframe verion runtime: 774.8628721237183
In total 755 batches to process


UnboundLocalError: local variable 'sentence' referenced before assignment

In [273]:
start = time.time()
pred_v2 = predict_dl(scoring_dataloader)
end = time.time()
print(f'dataloader verion runtime: {end-start}')

test_df['predicted'] = pred_v1

cleaned_dedup_labels = []
for index, row in test_df.iterrows():
    dedup_labels = row['dedup_labels']
    if dedup_labels == '':
        cleaned_dedup_labels.append(dedup_labels)
    else:
        dedup_label_list = dedup_labels.split('||')
        cleaned_dedup_labels.append('|'.join([clean_text(dl) for dl in dedup_label_list]))
test_df['cleaned_dedup_labels'] = cleaned_dedup_labels

pred_v2_df = pd.DataFrame.from_records(pred_v2, columns =['fid', 'sid', 'pred_v2'])
combined = pd.merge(test_df, pred_v2_df, on=["fid", "sid"])

In total 755 batches to process
100 processed
200 processed
300 processed
400 processed
500 processed
600 processed
700 processed
dataloader verion runtime: 313.8321304321289


In [278]:
has_results = combined[combined['cleaned_dedup_labels'] != '']
same = has_results[has_results.cleaned_dedup_labels == has_results.pred_v2]
diff = has_results[has_results.cleaned_dedup_labels != has_results.pred_v2]

In [279]:
print(len(same)/(len(same)+len(diff)), len(same), len(diff))

0.9257270693512304 8276 664


In [214]:
pred_v2_df[pred_v2_df['sid'] == 'S410090']

Unnamed: 0,fid,sid,pred_v2
42065,d2be42c9-2895-4ca8-8ab1-9079123e2984,S410090,


In [216]:
len(combined[combined.predicted == combined.pred_v2])

48308

In [69]:
diff[diff['sid'] == 'S341009'].cleaned_dedup_labels.tolist()[0]

'international best track archive for climate stewardship|ibtracs'

In [280]:
diff.sample(100)

Unnamed: 0,fid,sid,sentence,labels,dedup_labels,predicted,cleaned_dedup_labels,pred_v2
26309,12620b5b-ca11-40fb-880b-096fff3d94c4,S604797,"Of them, the main are: the Baltimore Longitudi...",Baltimore Longitudinal Study of Aging||Baltimo...,Baltimore Longitudinal Study of Aging (BLSA),,baltimore longitudinal study of aging blsa,
21490,c08d58e8-a21b-4e8a-b816-8923251ea9c2,S697761,We also examined ASA children's health and dev...,Early Childhood Longitudinal Study,Early Childhood Longitudinal Study,"(4ef2b977-2e2d-41f5-8ed6-3f44c691f620, S794046, )",early childhood longitudinal study,
25926,6e3137f2-1cd7-47ba-b1c4-b32e242aac58,S1953813,The following data sets for the period 1982-20...,Optimum Interpolation Sea Surface Temperature,Optimum Interpolation Sea Surface Temperature,"(4ef2b977-2e2d-41f5-8ed6-3f44c691f620, S794046, )",optimum interpolation sea surface temperature,
44315,4a2bd823-e642-4ad9-b8e6-796ae4db9734,S3030914,Survey of Earned DoctoratesSex; age; race-ethn...,Survey of Earned Doctorates,Survey of Earned Doctorates,"(4ef2b977-2e2d-41f5-8ed6-3f44c691f620, S794046, )",survey of earned doctorates,
13780,66a3dbe4-352b-4dad-a492-1e039df41520,S2522772,The TC record is taken from the International ...,IBTrACS,IBTrACS,ibtracs|international best track archive for c...,ibtracs,ibtracs|international best track archive for c...
...,...,...,...,...,...,...,...,...
2073,676acdae-e2f1-498a-9d40-cea8b531ea67,S2353187,"As in our prior work , we used tensor-based mo...",ADNI,ADNI,,adni,
7798,2cc7485e-194d-41d4-8359-a6446cc50f75,S2517292,The filling rate of TCs after making landfall ...,IBTrACS,IBTrACS,,ibtracs,
29013,147b87df-e46c-4d8a-a03a-00d7da262ac0,S3084963,"This report documents the design, development,...",Early Childhood Longitudinal Study,Early Childhood Longitudinal Study,"(4ef2b977-2e2d-41f5-8ed6-3f44c691f620, S794046, )",early childhood longitudinal study,
29355,8bd335ea-1351-4fba-99f8-53669c3c921d,S2083516,[31] [32] [33] All C-11 Pittsburgh compound B ...,ADNI,ADNI,pittsburgh compound,adni,pittsburgh compound


In [268]:
combined[combined['sid']=='S1119852'].sentence.values[0]

"Model performance is evaluated by using a simulation study and two sets of data of Alzheimer's disease patients (one from the memory-clinic-based Amsterdam Dementia Cohort and one from the Alzheimer's Disease Neuroimaging Initiative (ADNI) database)."

In [235]:
sample_df = combined[combined['sid']=='S6759109']
sample_df

Unnamed: 0,fid,sid,sentence,labels,dedup_labels,predicted,cleaned_dedup_labels,pred_v2
46399,f53dcc84-0b71-4efa-b434-625fd0596226,S6759109,The researchers noticed the genome sequence of...,genome sequence of SARS-CoV-2,genome sequence of SARS-CoV-2,,genome sequence of sars cov 2,


In [264]:
single = combined[combined['sid']=='S3351012']
r = predict_df(single,debug=True)
r

In total 1 sentences to process
Relations O
##hip O
among O
weights O
, O
universe O
flags B-D
, I-D
populations I-D
, I-D
and I-D
respond I-D
##ents O
: O
2002 O
– O
06 O
Weight O
B O
##Y O
##ST O
##U O
##W O
##T O
B O
##Y O
##EX O
##P O
##W O
##T O
F1 O
##P O
##NL O
##W O
##T O
Universe O
flag O
G O
##10 O
##CO O
##H O
##RT O
G O
##10 O
##CO O
##H O
##RT O
G O
##10 O
##CO O
##H O
##RT O
Population O
A O
— O
Spring O
2002 O
10th O
- O
grade O
##r O
A O
— O
Spring O
2002 O
10th O
- O
grade O
##r O
A O
— O
Spring O
2002 O
10th O
- O
grade O
##r O
Re O
##sp O
##ond O
##ent O
Full O
##y O
or O
partially O
completed O
question O
##naire O
in O
2002 O
Full O
##y O
or O
partially O
completed O
question O
##naire O
in O
2002 O
or O


['flags populations and respond']

Unnamed: 0,fid,sid,sentence,labels,dedup_labels,predicted,cleaned_dedup_labels,pred_v2
0,dc107d04-1082-426a-b616-e755486c8627,S2682021,SeaWiFS ORM-derived g i ͑443͒ as a function of...,,,,,
1,430aa11c-0283-411b-8edc-08f5df3db258,S4365895,Foreign direct investment (FDI): Ownership or ...,,,,,
2,14848a9b-13a7-41c3-b49f-ba725695e497,S772899,"As Sorokin (1959, p. 8) wrote, 2 At the presen...",,,,,
3,5c3de175-0bcf-4165-a5c3-c3ced789d2f6,S1088866,worth noting when using information from clini...,ADNI,ADNI,adni,adni,adni
4,da369ee6-2d60-402c-b3bf-05f44df77c78,S856664,The authors presented an electroencephalogram ...,,,,,
...,...,...,...,...,...,...,...,...
48307,7c3a786f-9fae-4b32-a036-603b96f98354,S1955800,The first mode (CEOF1) is the deep overturning...,,,,,
48308,4e07d309-3f36-427c-95c5-89d3d88ad5e1,S1404663,"The model is additive, which means in practice...",,,,,
48309,a5e4464b-1ad9-4890-a411-8d84de0aadaa,S1088230,"(2015) , who found that APOE was not significa...",,,,,
48310,a3698b97-2893-49a2-ade8-f53d1fef069b,S2002726,The disease diagnosis machine can therefore be...,,,,,
