In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install rich

In [1]:
import numpy as np # linear algebra
import pandas as pd
import random
import json
import re
import os
import pandas as pd
import pickle
from tqdm import tqdm,trange
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score
import torch
from transformers import BertForTokenClassification, AdamW, BertTokenizerFast
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
#from rich.console import Console
#from rich.progress import track
from tqdm import tqdm
from transformers import BertTokenizerFast

In [2]:
# Select the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Number of CUDA devices visible to torch (0 on a CPU-only host).
n_gpu = torch.cuda.device_count()

In [3]:
# --- Model artefacts ---------------------------------------------------------
# Fine-tuned token-classification weights (alternative checkpoint: '../input/showusdata2').
model_path = '../input/5epochweights'
# Tokenizer files for the cased BERT base model.
tokenizer_path = '../input/huggingface-bert/bert-base-cased'

# --- Competition inputs ------------------------------------------------------
# Directory holding the test papers.
test_path  = '../input/coleridgeinitiative-show-us-the-data/test'
# Sample submission file shipped with the competition data.
test_csv_path = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'

In [13]:
class SubmitPred:
    """Tag the test papers with a BERT token classifier and build a submission frame.

    Typical use::

        sub_pred = SubmitPred(test_csv_path, test_path, model_path, tokenizer_path)
        sub_pred.run()       # fills ``sub_pred.submission``
        sub_pred.save_csv()  # writes ``submission.csv``

    Attributes set by ``run``:
        sample_submission: the frame read from ``test_csv_path``.
        submission: frame with columns ``Id`` and ``PredictionString``.
        final_predics: per paper, the decoded phrases before de-duplication.
        filtered: per paper, the ``"|"``-joined de-duplicated phrases.
    """

    # Token ids excluded from the attention mask and from decoded phrases.
    # NOTE(review): presumably the [PAD]/[CLS]/[SEP] ids of the bert-base-cased
    # vocabulary -- confirm against the tokenizer in use.
    SPECIAL_TOKEN_IDS = (0, 101, 102)

    # Two phrases scoring above this in ``jaccard_similarity`` count as duplicates.
    SIMILARITY_THRESHOLD = 0.70

    def __init__(self,test_csv_path,test_path,model_path,tokenizer_path,batch_size=64):
        """Load the model and tokenizer and store the run configuration.

        Args:
            test_csv_path: path of the sample-submission CSV.
            test_path: directory containing one JSON file per test paper.
            model_path: directory/identifier passed to
                ``BertForTokenClassification.from_pretrained``.
            tokenizer_path: directory/identifier passed to
                ``BertTokenizerFast.from_pretrained``.
            batch_size: number of sentences per inference batch.
        """
        self.test_csv_path = test_csv_path
        self.test_path = test_path
        self.model = BertForTokenClassification.from_pretrained(
            model_path,
            num_labels=3,
            output_attentions=False,
            output_hidden_states=False,
        )
        self.tokenizer = BertTokenizerFast.from_pretrained(tokenizer_path, do_lower_case=False)
        self.MAX_LENGTH = 64  # max words per window, also the padded token length
        self.OVERLAP = 20     # words shared by two consecutive windows
        # Label index -> tag. NOTE(review): assumed to match the encoding used
        # when the weights were trained -- confirm.
        self.tag2str = {2: 'O', 1: 'I', 0: 'B'}
        self.batch_size = batch_size
        # Resolve the device here so the class neither depends on a notebook
        # global nor requires CUDA to be present.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def load_submission(self):
        """Read the sample-submission CSV into ``self.sample_submission``."""
        self.sample_submission = pd.read_csv(self.test_csv_path)

    def tokenize_sent(self,sentence):
        """Split ``sentence`` on whitespace and return the concatenated tokenizer
        output of every word as one flat list of tokens."""
        tokenized_sentence = []
        for word in sentence.split():
            tokenized_sentence.extend(self.tokenizer.tokenize(word))
        return tokenized_sentence

    def read_and_create_csv(self):
        """Create ``self.submission`` with one ``Id`` row per JSON file in ``self.test_path``.

        File names are sorted so the row order is reproducible.
        """
        # NOTE(review): Ids keep the '.json' extension because ``run`` uses them
        # as file names; confirm this is the Id format the submission expects.
        all_test_papers = sorted(
            name for name in os.listdir(self.test_path) if name.endswith('.json')
        )
        self.submission = pd.DataFrame({'Id': all_test_papers})

    @staticmethod
    def shorten_sentences(sentences, max_len, overlap):
        """Split sentences longer than ``max_len`` words into overlapping windows.

        Args:
            sentences: iterable of strings.
            max_len: maximum number of whitespace-separated words per window.
            overlap: number of words shared by consecutive windows.

        Returns:
            A list of strings; sentences with at most ``max_len`` words are
            passed through unchanged.

        Raises:
            ValueError: if a sentence has to be split and ``overlap >= max_len``
                (the window could not advance).
        """
        step = max_len - overlap
        short_sentences = []
        for sentence in sentences:
            words = sentence.split()
            if len(words) <= max_len:
                short_sentences.append(sentence)
                continue
            if step <= 0:
                raise ValueError("overlap must be smaller than max_len")
            for start in range(0, len(words), step):
                short_sentences.append(' '.join(words[start:start + max_len]))
        return short_sentences

    @staticmethod
    def clean_training_text(txt):
        """Replace every run of non-alphanumeric characters with one space and strip."""
        return re.sub('[^A-Za-z0-9]+', ' ', str(txt)).strip()

    def add_padding(self,tokenized_sentences):
        """Pad or truncate every token list to ``self.MAX_LENGTH`` entries.

        Shorter lists are filled at the end with ``'[PAD]'``; longer lists are
        cut at the end.
        """
        # NOTE(review): truncation happens after '[SEP]' was appended, so a window
        # with more than MAX_LENGTH tokens loses its '[SEP]' -- confirm this
        # matches how the model was trained.
        return pad_sequences(
            tokenized_sentences,
            value='[PAD]',
            dtype=object,
            maxlen=self.MAX_LENGTH,
            truncating='post',
            padding='post',
        )

    @staticmethod
    def get_attention_mask(input_ids, ignore_tokens=SPECIAL_TOKEN_IDS):
        """Return, per sentence, ``0.0`` for ids in ``ignore_tokens`` and ``1.0`` otherwise."""
        return [[float(token not in ignore_tokens) for token in sent] for sent in input_ids]

    @staticmethod
    def jaccard_similarity(list1, list2):
        """Word-overlap score between two word lists.

        The intersection is taken over unique words while the union is derived
        from the list lengths, so repeated words lower the score. Two empty
        lists are treated as identical and score ``1.0``.
        """
        intersection = len(set(list1).intersection(list2))
        union = (len(list1) + len(list2)) - intersection
        if union == 0:
            # Both inputs are empty; avoid dividing by zero.
            return 1.0
        return float(intersection) / union

    @staticmethod
    def add_start_end_tokens(tupled_sentence):
        """Insert ``'[CLS]'`` at the front and ``'[SEP]'`` at the end, in place, and
        return the same list."""
        tupled_sentence.insert(0, '[CLS]')
        tupled_sentence.append('[SEP]')
        return tupled_sentence

    @staticmethod
    def _extract_phrases(token_ids, tags, special_ids=SPECIAL_TOKEN_IDS):
        """Group consecutive ``'B'``/``'I'``-tagged token ids into phrases.

        Ids listed in ``special_ids`` are left out of the phrase but do not end
        it. A phrase still open at the end of the sequence is returned as well.
        """
        phrases = []
        current = []
        for token_id, tag in zip(token_ids, tags):
            if tag == "I" or tag == "B":
                if token_id not in special_ids:
                    current.append(token_id)
            elif current:
                phrases.append(current)
                current = []
        if current:
            # The tagged span ran up to the last position of the window.
            phrases.append(current)
        return phrases

    @classmethod
    def _drop_near_duplicates(cls, preds, threshold=SIMILARITY_THRESHOLD):
        """Keep each phrase only if it scores at most ``threshold`` against every
        phrase kept before it; the first phrase is always kept."""
        kept = []
        for pred in preds:
            words = pred.split()
            # Compare against ALL kept phrases before deciding; never append
            # while iterating over ``kept``.
            is_duplicate = any(
                cls.jaccard_similarity(previous.split(), words) > threshold
                for previous in kept
            )
            if not is_duplicate:
                kept.append(pred)
        return kept

    def _candidate_sentences(self, paper):
        """Return the cleaned, windowed sentences of ``paper`` that mention 'data' or 'study'.

        ``paper`` is a list of sections, each with a ``'text'`` entry, which is
        split on ``'.'``.
        """
        cleaned = (
            self.clean_training_text(sentence)
            for section in paper
            for sentence in section['text'].split('.')
        )
        # De-duplicate while keeping first-seen order so runs are reproducible.
        unique_sentences = list(dict.fromkeys(cleaned))
        windows = self.shorten_sentences(unique_sentences, self.MAX_LENGTH, self.OVERLAP)
        windows = [window for window in windows if len(window) > 10]
        return [
            window for window in windows
            if any(word in window.lower() for word in ['data', 'study'])
        ]

    def run(self):
        """Run the whole pipeline and fill ``self.submission['PredictionString']``.

        Steps: read the sample submission, list the test papers, collect the
        candidate sentences of every paper, run the model over them in batches,
        decode tagged spans into phrases, drop phrases starting with ``'#'``
        and near-duplicates, and join what remains with ``'|'``.
        """
        self.load_submission()
        self.model.to(self.device)
        self.model.eval()
        self.read_and_create_csv()

        paper_length = []  # number of candidate sentences per paper, in Id order
        sentences_e = []   # candidate sentences of all papers, concatenated
        for paper_id in self.submission['Id']:
            with open(f'{self.test_path}/{paper_id}', 'r', encoding='utf-8') as f:
                paper = json.load(f)
            ner_data = self._candidate_sentences(paper)
            sentences_e.extend(ner_data)
            print(f"paper {paper_id} length: {len(ner_data)}")
            paper_length.append(len(ner_data))

        input_ids = []
        all_predictions = []
        if sentences_e:
            tokenized_words = [self.tokenize_sent(sentence) for sentence in sentences_e]
            start_end = [self.add_start_end_tokens(sentence) for sentence in tokenized_words]
            padding_sentences = self.add_padding(start_end)
            input_ids = [self.tokenizer.convert_tokens_to_ids(text) for text in padding_sentences]
            attention_mask = self.get_attention_mask(input_ids, ignore_tokens=self.SPECIAL_TOKEN_IDS)

            predict_data = TensorDataset(torch.tensor(input_ids), torch.tensor(attention_mask))
            predict_dataloader = DataLoader(predict_data, batch_size=self.batch_size)
            for b_input_ids, b_input_mask in predict_dataloader:
                b_input_ids = b_input_ids.to(self.device)
                b_input_mask = b_input_mask.to(self.device)
                with torch.no_grad():
                    output = self.model(b_input_ids, attention_mask=b_input_mask)
                # output[0] holds the per-token scores; take the best label per token.
                label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
                all_predictions.extend(label_indices)

        # Sentences were concatenated paper by paper, so walk them with an offset.
        final_predics = []
        offset = 0
        for pap_len in paper_length:
            labels = []
            paper_ids = input_ids[offset:offset + pap_len]
            paper_preds = all_predictions[offset:offset + pap_len]
            for sentence, pred in zip(paper_ids, paper_preds):
                tags = [self.tag2str[int(label)] for label in pred]
                for phrase in self._extract_phrases(sentence, tags, self.SPECIAL_TOKEN_IDS):
                    labels.append(self.tokenizer.decode(phrase))
            final_predics.append(labels)
            offset += pap_len

        # Drop phrases whose decoded text starts with '#'.
        final_predics = [
            [pred for pred in preds if not pred.startswith("#")]
            for preds in final_predics
        ]

        deduplicated = [self._drop_near_duplicates(preds) for preds in final_predics]

        self.final_predics = final_predics
        # str.join on an empty list yields '', so every row is a plain string.
        self.filtered = ["|".join(preds) for preds in deduplicated]
        self.submission['PredictionString'] = self.filtered
        print("Predictions Complete")

    def save_csv(self):
        """Write ``self.submission`` to ``submission.csv`` without the index column."""
        self.submission.to_csv('submission.csv', index=False)

In [14]:
# Build the predictor; the last argument is the inference batch size.
sub_pred = SubmitPred(
    test_csv_path,
    test_path,
    model_path,
    tokenizer_path,
    batch_size=256,
)

In [15]:
# Run the prediction pipeline; this fills sub_pred.submission['PredictionString'].
sub_pred.run()

paper 8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60.json length: 83
paper 2100032a-7c33-4bff-97ef-690822c43466.json length: 34
paper 2f392438-e215-4169-bebf-21ac4ff253e1.json length: 131
paper 3f316b38-1a24-45a9-8d8c-4e05a42257c6.json length: 98
Current Prediction too similar
Current Prediction too similar
Current Prediction too similar
Current Prediction too similar
Current Prediction too similar


In [16]:
# Write sub_pred.submission to submission.csv in the current working directory.
sub_pred.save_csv()

![](http://)