In [None]:
 # This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory, then print one path per line.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
!pip install rich

Collecting rich
  Downloading rich-10.2.0-py3-none-any.whl (203 kB)
[K     |████████████████████████████████| 203 kB 4.5 MB/s eta 0:00:01
Collecting commonmark<0.10.0,>=0.9.0
  Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 4.0 MB/s  eta 0:00:01
Installing collected packages: commonmark, rich
Successfully installed commonmark-0.9.1 rich-10.2.0


In [5]:
import numpy as np # linear algebra
import pandas as pd
import json
import re
import os
import pandas as pd
import pickle
from tqdm import tqdm,trange
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score
import torch
from transformers import BertForTokenClassification, AdamW, BertTokenizerFast
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
#from rich.console import Console
from rich.progress import track
from tqdm import tqdm
from transformers import BertTokenizerFast

In [50]:
# Use the GPU when CUDA reports one as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Number of CUDA devices reported by torch.
n_gpu = torch.cuda.device_count()

In [8]:
# Input locations handed to SubmitPred.
# CSV read by SubmitPred.load_submission; its 'Id' column names the papers to process.
test_csv_path = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
# Directory holding one `<Id>.json` file per paper (opened in SubmitPred.run).
test_path  = '../input/coleridgeinitiative-show-us-the-data/test'
# Passed to BertForTokenClassification.from_pretrained as the model location.
model_path = '../input/showusdata2'
# Passed to BertTokenizerFast.from_pretrained as the tokenizer location.
tokenizer_path = '../input/huggingface-bert/bert-base-cased'

In [89]:
class SubmitPred:
    """Tag test papers with a BERT token classifier and build a submission file.

    Workflow: ``run()`` reads the sample submission, loads each paper's JSON,
    selects candidate sentences, predicts a B/I/O tag per word piece and stores
    the decoded phrases, joined with ``|``, in the ``PredictionString`` column.
    ``save_csv()`` then writes the frame to ``submission.csv``.
    """

    # A sentence is sent to the model only if it contains one of these substrings
    # (compared against the lower-cased sentence).
    _KEYWORDS = ('data', 'study')
    # Sentences with this many characters or fewer are discarded.
    _MIN_CHARS = 10

    def __init__(self,test_csv_path,test_path,model_path,tokenizer_path,batch_size=64):
        """Load the model and tokenizer and store the run configuration.

        Args:
            test_csv_path: CSV with an ``Id`` column listing the papers to process.
            test_path: Directory containing one ``<Id>.json`` file per paper.
            model_path: Location given to ``BertForTokenClassification.from_pretrained``.
            tokenizer_path: Location given to ``BertTokenizerFast.from_pretrained``.
            batch_size: Number of sentences per inference batch.
        """
        self.test_csv_path = test_csv_path
        self.test_path = test_path
        self.model = BertForTokenClassification.from_pretrained(
            model_path, num_labels=3, output_attentions=False, output_hidden_states=False)
        self.tokenizer = BertTokenizerFast.from_pretrained(tokenizer_path, do_lower_case=False)
        # Window size: in words for shorten_sentences, in word pieces for add_padding.
        self.MAX_LENGTH = 64
        # Number of words shared by two consecutive windows of a long sentence.
        self.OVERLAP = 20
        # Label index -> tag. NOTE(review): presumably mirrors the training label map - confirm.
        self.tag2str = {2: 'O', 1: 'I', 0: 'B'}
        self.batch_size = batch_size
        # Resolved here so run() does not depend on a notebook-level `device`
        # variable and still works on hosts without CUDA.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def load_submission(self):
        """Read the sample submission CSV into ``self.sample_submission``."""
        self.sample_submission = pd.read_csv(self.test_csv_path)

    def tokenize_sent(self,sentence):
        """Split ``sentence`` on whitespace and return the word pieces of every word, in order."""
        return [piece
                for word in sentence.split()
                for piece in self.tokenizer.tokenize(word)]

    @staticmethod
    def shorten_sentences(sentences, max_len, overlap):
        """Break sentences longer than ``max_len`` words into overlapping windows.

        Windows start every ``max_len - overlap`` words and hold at most
        ``max_len`` words. Sentences of ``max_len`` words or fewer are returned
        unchanged.

        Returns:
            A flat list of sentence strings.
        """
        step = max_len - overlap
        short_sentences = []
        for sentence in sentences:
            words = sentence.split()
            if len(words) > max_len:
                short_sentences.extend(
                    ' '.join(words[start:start + max_len])
                    for start in range(0, len(words), step))
            else:
                short_sentences.append(sentence)
        return short_sentences

    @staticmethod
    def clean_training_text(txt):
        """Replace every run of non-alphanumeric characters with one space and strip the ends."""
        return re.sub('[^A-Za-z0-9]+', ' ', str(txt)).strip()

    def add_padding(self,tokenized_sentences):
        """Pad or cut every token sequence to exactly ``self.MAX_LENGTH`` entries.

        Short sequences are filled with ``'[PAD]'`` at the end. Long sequences
        are cut at the end, so a trailing ``'[SEP]'`` can be dropped.
        """
        return pad_sequences(
            tokenized_sentences,
            value='[PAD]',
            dtype=object,
            maxlen=self.MAX_LENGTH,
            truncating='post',
            padding='post')

    @staticmethod
    def get_attention_mask(input_ids, ignore_tokens=(0, 101, 102)):
        """Return a float mask per sequence: 0.0 for ids in ``ignore_tokens``, 1.0 otherwise.

        NOTE(review): the default ids are presumably [PAD], [CLS] and [SEP] of
        the BERT vocabulary - confirm against the tokenizer in use.
        """
        return [[float(token not in ignore_tokens) for token in sent] for sent in input_ids]

    @staticmethod
    def jaccard_similarity(s1, s2):
        """Jaccard similarity of the distinct space-separated words of two strings.

        Both the intersection and the union are taken over sets of words, so a
        repeated word counts once.
        """
        words1 = set(s1.split(" "))
        words2 = set(s2.split(" "))
        intersection = len(words1 & words2)
        # split(" ") always yields at least one item, so the union is never zero.
        union = len(words1) + len(words2) - intersection
        return float(intersection) / union

    @staticmethod
    def add_start_end_tokens(tupled_sentence):
        """Insert ``'[CLS]'`` at the front and append ``'[SEP]'``; mutates and returns the list."""
        tupled_sentence.insert(0, '[CLS]')
        tupled_sentence.append('[SEP]')
        return tupled_sentence

    @staticmethod
    def _extract_phrases(token_ids, tags):
        """Return the runs of token ids whose tags are ``'B'`` or ``'I'``.

        A run ends at the first other tag or at the end of the sequence, so a
        phrase that reaches the last position is still returned. A ``'B'``
        directly after another tagged token extends the current run.
        """
        phrases = []
        current = []
        for token_id, tag in zip(token_ids, tags):
            if tag in ('B', 'I'):
                current.append(token_id)
            elif current:
                phrases.append(current)
                current = []
        if current:
            phrases.append(current)
        return phrases

    @staticmethod
    def _group_labels_by_paper(sentence_ids, sentence_tags, paper_lengths, decode):
        """Decode the tagged phrases and group them by paper.

        Args:
            sentence_ids: One list of token ids per sentence, all papers concatenated.
            sentence_tags: One list of tag strings per sentence, aligned with ``sentence_ids``.
            paper_lengths: Number of sentences contributed by each paper, in order.
            decode: Callable that turns a list of token ids into text.

        Returns:
            One list of decoded phrases per entry of ``paper_lengths``.
        """
        grouped = []
        start = 0
        for count in paper_lengths:
            stop = start + count
            labels = []
            # Slice both lists with the same offsets so they stay aligned.
            for ids, tags in zip(sentence_ids[start:stop], sentence_tags[start:stop]):
                labels.extend(decode(phrase)
                              for phrase in SubmitPred._extract_phrases(ids, tags))
            grouped.append(labels)
            start = stop
        return grouped

    def _candidate_sentences(self, paper):
        """Return the cleaned, windowed, keyword-filtered sentences of one paper."""
        cleaned = (self.clean_training_text(sentence)
                   for section in paper
                   for sentence in section['text'].split('.'))
        # dict.fromkeys removes duplicates while keeping document order, which
        # makes the output order reproducible between runs.
        unique = list(dict.fromkeys(cleaned))
        windows = self.shorten_sentences(unique, self.MAX_LENGTH, self.OVERLAP)
        return [sentence for sentence in windows
                if len(sentence) > self._MIN_CHARS
                and any(word in sentence.lower() for word in self._KEYWORDS)]

    def run(self):
        """Predict labels for every paper and fill ``PredictionString``.

        Side effects: loads ``self.sample_submission``, moves the model to
        ``self.device`` and overwrites the ``PredictionString`` column with the
        decoded phrases of each paper joined by ``|``.
        """
        self.load_submission()
        self.model.to(self.device)
        self.model.eval()

        paper_length = []
        sentences_e = []
        for paper_id in self.sample_submission['Id']:
            with open(f'{self.test_path}/{paper_id}.json', 'r', encoding='utf-8') as f:
                paper = json.load(f)
            ner_data = self._candidate_sentences(paper)
            sentences_e.extend(ner_data)
            paper_length.append(len(ner_data))

        tokenized_words = [self.tokenize_sent(sentence) for sentence in sentences_e]
        start_end = [self.add_start_end_tokens(sentence) for sentence in tokenized_words]
        padding_sentences = self.add_padding(start_end)
        input_ids = [self.tokenizer.convert_tokens_to_ids(list(text)) for text in padding_sentences]
        attention_mask = self.get_attention_mask(input_ids, ignore_tokens=[0, 101, 102])

        predict_data = TensorDataset(torch.tensor(input_ids), torch.tensor(attention_mask))
        predict_dataloader = DataLoader(predict_data, batch_size=self.batch_size)

        all_predictions = []
        for batch in predict_dataloader:
            b_input_ids, b_input_mask = (t.to(self.device) for t in batch)
            with torch.no_grad():
                output = self.model(b_input_ids, attention_mask=b_input_mask)
            # output[0] is indexed as (sentence, token, label); take the best label per token.
            label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
            all_predictions.extend(label_indices)

        all_preds_str = [[self.tag2str[token] for token in pred] for pred in all_predictions]
        final_predics = self._group_labels_by_paper(
            input_ids, all_preds_str, paper_length, self.tokenizer.decode)
        self.sample_submission['PredictionString'] = ["|".join(labels) for labels in final_predics]

    def save_csv(self):
        """Write ``self.sample_submission`` to ``submission.csv`` without the index."""
        self.sample_submission.to_csv('submission.csv', index=False)

In [90]:
# Build the predictor from the configured paths; batch_size keeps its default.
sub_pred = SubmitPred(
    test_csv_path=test_csv_path,
    test_path=test_path,
    model_path=model_path,
    tokenizer_path=tokenizer_path,
)

In [91]:
# Reads the sample submission, runs the model over the test papers and fills
# sub_pred.sample_submission['PredictionString'].
sub_pred.run()

In [84]:
# Writes sub_pred.sample_submission to 'submission.csv' (path relative to the working directory).
sub_pred.save_csv()

![](http://)