In [7]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig
from transformers import AutoTokenizer, AutoConfig
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import transformers
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import re
import random

import json
import nltk
import csv
from nltk.tokenize import word_tokenize

torch.__version__, torch.cuda.is_available(), transformers.__version__

('1.7.0', True, '4.5.1')

In [8]:
MAX_LEN = 100
BATCH_SIZE = 32
# PRETRAIN_MODEL = 'bert-base-cased' #'../input/pytorch-bert-ner-2/pytorch_bert_ner_model'
TRUNCATING_TYPE = 'pre'
PADDING_TYPE = 'post'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_gpu = torch.cuda.device_count()
print(device, n_gpu)

# tokenizer = BertTokenizer.from_pretrained(PRETRAIN_MODEL)
tokenizer = AutoTokenizer.from_pretrained('../input/tokenizer/tokenizer')
model = BertForTokenClassification.from_pretrained('../input/pytorch-bert-ner-model-v2/pytorch_bert_ner_model_v2')

# model = BertForTokenClassification.from_pretrained('../input/pytorch-bert-ner/pytorch_bert_ner_model')
model = model.to(device)

tag_values = ['O', 'B-D', 'I-D', 'PAD']

# Kaggle provided func to clean the dataset names
def clean_text(txt):
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower())

def correct_word_broken(tu):
    for i in range(len(tu)):
        (token, tag) = tu[i]
        if i > 0:
            (previous_token, previous_tag) = tu[i-1]
            if previous_tag == 'B-D' and token.startswith('##'):
                tu[i] = (token, previous_tag)
    return tu

def remove_super_tags(tags):
    new_set = set()
    for s in tags:
        remove = False
        for ss in tags:
            if ss in s and s > ss:
                remove = True
                break
        if not remove:
            new_set.add(s)

    return new_set

def get_dataset_tags(tokenized_sentence, pred_tags):
    dataset_names = set()
    dataset_name = ''
    tu = correct_word_broken(list(zip(tokenized_sentence, pred_tags)))
    found_start = False
    for (token, tag) in tu:
        if not found_start and tag == 'B-D' and not token.startswith('##'): # Found the starting position
            dataset_name += token
            found_start = True
            continue
        
        if found_start:
            if tag == 'B-D' or tag == 'I-D':
                if token.startswith('##'):
                    dataset_name += token.replace('##', '')
                else:
                    dataset_name += ' ' + token
            else:
                found_start = False
                dataset_name = dataset_name.strip()
                if len(dataset_name) > 0 and (dataset_name[0] >= 'A' and dataset_name[0] <= 'Z' or dataset_name[0] >= 'a' and dataset_name[0] <= 'z') and not dataset_name.startswith('and '):    
                    dataset_names.add(clean_text(dataset_name))
                    dataset_name = ''
    return dataset_names

def is_sentence_worth_predict(words):
    count = 0
    for w in words:
        if len(w) > 0 and w[0] >= 'A' and w[0] <= 'Z':
            count += 1
            if count >= 3:
                return True
            
    return False

cuda 1


In [4]:
# To reduce the scoring time, use pytorch's dataloader to load in sentences and use batch model for prediction
class ScoringDataset:
    def __init__(self, fids, sids, input_ids, masks):
        self.fids = fids
        self.sids = sids
        self.input_ids = input_ids
        self.masks = masks
        
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, item):
        b_fids = self.fids[item]
        b_sids = self.sids[item]
        b_input_ids = self.input_ids[item]
        b_masks = self.masks[item]
        
#         print(b_fids)
#         print(b_sids)
#         print(b_input_ids)
#         print(b_tokenized_sentences)
#         print(b_masks)

        return {
            "b_fids": b_fids,
            "b_sids": b_sids,
            "b_input_ids": torch.tensor(b_input_ids, dtype=torch.long),
            "b_masks": torch.tensor(b_masks, dtype=torch.float)
        }

In [16]:
# This func takes dataloader as input for prediction
def predict_dl(scoring_dataloader, sentence_df):
    predicted_tags = []
    print(f'In total {len(scoring_dataloader)} batches to process')
    processed = 0
    model.eval()
    for batch in scoring_dataloader:
        b_fids = batch['b_fids']
        b_sids = batch['b_sids']
        b_input_ids = batch['b_input_ids'].to(device).to(torch.int64)
        b_input_masks = batch['b_masks'].to(device).to(torch.int64)
        
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_masks)
        logits = outputs[0].detach().cpu().numpy()
#         print(logits)
        predictions = [list(p) for p in np.argmax(logits, axis=2)]
#         print(predictions)
        pred_tags = []
        for r in predictions:
            pred_tags.append([tag_values[ri] for ri in r])
    
        for fid, sid, pt in zip(b_fids, b_sids, pred_tags):
            sentence = sentence_df[sentence_df['sid']==sid]['sentence'].values[0]
            if 'B-D' in pt or 'I-D' in pt:
                tokenized_sentence = tokenizer.tokenize(sentence)
                dataset_names = get_dataset_tags(tokenized_sentence, pt)
                predicted_tags.append((fid, sid, '|'.join(dataset_names)))
            else:
                if "Alzheimer's Disease Neuroimaging Initiative (ADNI)" in sentence:
                    predicted_tags.append((fid, sid, clean_text("Alzheimer's Disease Neuroimaging Initiative (ADNI)")))
                elif 'ADNI' in sentence:
                    predicted_tags.append((fid, sid, 'adni'))
                else:
                    predicted_tags.append((fid, sid, ''))
                
        processed += 1
        if processed % 100 == 0:
            print(f'{processed} processed')
     
    print('All batches processed')
    return predicted_tags

In [17]:
def run(sentences):
    sentence_df = pd.DataFrame.from_records(sentences, columns =['fid', 'sid', 'sentence'])
    
    # Prepare bert tokenized sentences
    tokenized_sentences = []
    fids = []
    sids = []

    for index, row in sentence_df.iterrows():
        fid = row['fid']
        sid = row['sid']
        sentence = row['sentence']
        tokenized_sentence = tokenizer.tokenize(sentence)
        tokenized_sentences.append(tokenized_sentence)
        fids.append(fid)
        sids.append(sid)

    input_ids = pad_sequences(
        [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_sentences], 
        maxlen=MAX_LEN, 
        dtype='long', 
        value=0.0, 
        truncating=TRUNCATING_TYPE, 
        padding=PADDING_TYPE
    )

    attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

    scoring_dataset = ScoringDataset(
        fids=fids,
        sids=sids,
        input_ids=input_ids,
        masks=attention_masks,
    )

    scoring_dataloader = DataLoader(
        scoring_dataset,
        batch_size=BATCH_SIZE,
    )
    
    pred = predict_dl(scoring_dataloader, sentence_df)
    pred_df = pd.DataFrame.from_records(pred, columns =['fid', 'sid', 'predicted'])
    combined = pd.merge(sentence_df, pred_df, on=["fid", "sid"])
    
    for index, row in combined.iterrows():
        fid = row['fid']
        predicted = row['predicted']
        if fid not in result_dict:
            result_dict[fid] = set()

        if predicted is not None and predicted > '':
            result_dict[fid].add(predicted)

In [18]:
import os
import json

sentences = []
sid = 0
result_dict = {}
for dirname, _, filenames in os.walk('/kaggle/input/coleridgeinitiative-show-us-the-data/test'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        fid = filename.replace('.json', '')
        print(f'====Processing {filename}====')
        with open(file_path) as json_file:
            data = json.load(json_file)
        
            for section in data:
                text = section['text']
                section_sentences = nltk.sent_tokenize(text)
                for sentence in section_sentences:
                    sentences.append((fid, f'S{sid}', sentence))
                    sid += 1
                    if len(sentences) >= BATCH_SIZE*1000:  # You can store all testing sentences in one go so break up
                        run(sentences)
                        sentences = []
                        
    if len(sentences) > 0:
        run(sentences)

====Processing 8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60.json====
====Processing 2100032a-7c33-4bff-97ef-690822c43466.json====
====Processing 2f392438-e215-4169-bebf-21ac4ff253e1.json====
====Processing 3f316b38-1a24-45a9-8d8c-4e05a42257c6.json====
In total 64 batches to process
All batches processed


In [19]:
with open('./submission.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    writer.writerow(['Id','PredictionString'])
    for key, value in result_dict.items():
        writer.writerow([key, '|'.join(value)])
print('All done')

All done
