In [None]:
dev_articles_path = '/input/dev-articles/dev-articles/'
test_articles_path = '/input/testarticles/test-articles/'
data_path = '/input/data/'

In [1]:
from transformers import BertPreTrainedModel,BertModel, BertConfig
from torch import nn
from torch.nn import CrossEntropyLoss
class BertForTokenClassificationWeighted(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
#         self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])
#         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.cnn1 = nn.Conv1d(768, 128, kernel_size=3, padding=1)
        self.cnn2 = nn.Conv1d(128, config.num_labels, kernel_size=3, padding=1)
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = outputs[0]
        sequence_output=sequence_output.permute(0,2,1)
#         sequence_output = self.dropout(sequence_output)
        for i, dropout in enumerate(self.dropouts):
            if i == 0:
                logits = self.cnn2(self.cnn1((dropout(sequence_output))))
            else:
                logits += self.cnn2(self.cnn1(dropout(sequence_output)))
        logits=logits/ len(self.dropouts)
        logits=logits.permute(0,2,1)

#         logits = self.classifier(sequence_output)
#         print(logits.shape)
        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        if labels is not None:
            w = torch.tensor([1.,10.,1. ]).cuda()
            loss_fct = CrossEntropyLoss(weight=w)
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.reshape(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)


In [2]:
from transformers import BertTokenizer, BertConfig,WordpieceTokenizer
from transformers import DistilBertForTokenClassification,DistilBertTokenizer,DistilBertConfig
from transformers import RobertaForTokenClassification,RobertaTokenizer,RobertaConfig
from transformers import BertForTokenClassification,BertTokenizer,BertConfig
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW

import pandas as pd
import numpy as np
from tqdm import tqdm, trange

global max_len
data = pd.read_csv(data_path + "data_prop.csv")
data = data.fillna(method="ffill")

data['label'] = data['label'].astype(str)

class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 0
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p) for w, p in zip(s["word_corrected"].values.tolist(),
                                                           s["label"].values.tolist())]
        
        self.grouped = self.data.groupby("sent_id").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped[self.n_sent]
            self.n_sent += 1
            return s
        except:
            return None

getter = SentenceGetter(data)


sent = getter.sentences

for s in sent[0]:
    print(s[0].split()[0])
    

##keeping only the first word after removing the special characters
sentences = [" ".join([s[0].split()[0] for s in sent]) for sent in getter.sentences]
print(sentences[0])

labels = [[s[1] for s in sent] for sent in getter.sentences]
print(labels[0])

# tags_vals = list(set(data["label"].values))
# tag2idx = {t: i for i, t in enumerate(tags_vals)}

import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,WeightedRandomSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

MAX_LEN = 128
bs = 32

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

print(n_gpu)
print(torch.cuda.get_device_name(0))
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

### Now we tokenize all sentences


tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]


print(sentences[0])
print(tokenized_texts[0])

def reg_encoding(cleaned: list, labels: list, hash_token:list) -> list:
    label_l = []
    for oindex, x in enumerate(cleaned):
        #print(oindex)
        tlist = []        
        i=0
        j=0
        while i < len(x): 
            if x[i][0]=='#':        
                tlist.append(hash_token)
            else:
                #print(x[i])
                tlist.append(labels[oindex][j])
                j=j+1
            i=i+1
            
        label_l.append(tlist)
    return label_l

def reg_encoding_generic(sentences,labels, hash_token):
    tokens_all=[]
    labels_all=[]
    for (sentence,label) in zip(sentences,labels):
        tokens_per_sentence=[]
        labels_per_sentence=[]
        for word, label in zip(sentence.split(), label):
            word_tokens = tokenizer.tokenize(word)
            if len(word_tokens) > 0:
                tokens_per_sentence.extend(word_tokens)
                labels_per_sentence.extend([label] + [hash_token] * (len(word_tokens) - 1))
#                 labels_per_sentence.extend([label] + [label] * (len(word_tokens) - 1))

        
        tokens_all.append(tokens_per_sentence)
        labels_all.append(labels_per_sentence)
                
    return tokens_all,labels_all

def reg_encoding_modify(cleaned: list, labels: list, hash_token, end_token) -> list:
    label_l = []
    for oindex, x in enumerate(cleaned):
        #print(oindex)
        tlist = []        
        i=0
        j=0
        while i < len(x): 
            if x[i][0]=='#':        
                #tlist.append(hash_token)
                tlist.append(labels[oindex][j-1])
                
            else:
                #print(x[i])
                tlist.append(labels[oindex][j])
                j=j+1
            i=i+1
            
        label_l.append(tlist)
    return label_l


tokenized_texts,label_l = reg_encoding_generic(sentences,labels,'X')


flat_list = [item for sublist in label_l for item in sublist]

from collections import Counter


#label_l_modify = reg_encoding_modify(tokenized_texts,labels,'X','E')

# data['label'].unique()

tags_vals=['0','1','X']
tag2idx={'0':0,'1':1,'X':2}

# tags_vals=['0','1']
# tag2idx={'0':0,'1':1}



input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")


tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in label_l],maxlen=MAX_LEN, value =0, padding="post",dtype="long", truncating="post")

attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,random_state=2018, test_size=0.1)

sent_labels=np.zeros(tr_tags.shape[0],dtype=int)
for i in range(len(tr_tags)):
    if 1 in tr_tags[i,:]:
        sent_labels[i]=1
    
del input_ids,tags,attention_masks
print(tr_inputs.shape)

tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags) 
train_sampler = RandomSampler(train_data)
_,class_sample_count=np.unique( sent_labels , return_counts=True)   
class_sample_counts=list(class_sample_count)
class_sample_count = [10, 1, 20, 3, 4] # dataset has 10 class-1 samples, 1 class-2 samples, etc.
weights = 1. / torch.tensor(class_sample_counts, dtype=torch.float)
samples_weights = weights[sent_labels]
sampler = torch.utils.data.sampler.WeightedRandomSampler(samples_weights, num_samples=len(samples_weights),replacement=True)
train_dataloader = DataLoader(train_data, batch_size = bs, sampler = sampler)

# train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)


# model = DistilBertForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(tag2idx))
# model = RobertaForTokenClassification.from_pretrained("roberta-base", num_labels=len(tag2idx))

# model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))
model = BertForTokenClassificationWeighted.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))




model.cuda()

FULL_FINETUNING = True

if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=2, num_training_steps=5)
# opt = SWA(optimizer, swa_start=10, swa_freq=2, swa_lr=0.05)


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

import pickle



epochs = 7
max_grad_norm = 1.0
tmp=0


for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}
        # forward pass
        outputs = model(**inputs)
        loss = outputs[0]
#         loss = model(b_input_ids, token_type_ids=None,
#                      attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        
        # update parameters
        optimizer.step()
#         scheduler.step()
#         opt.step()
        model.zero_grad()
    
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
#     opt.swap_swa_sgd()

    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    pred=[]
    
    for batch in (valid_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}
        
        with torch.no_grad():
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
#             tmp_eval_loss = model(b_input_ids, token_type_ids=None,
#                                   attention_mask=b_input_mask, labels=b_labels)
#             logits = model(b_input_ids, token_type_ids=None,
#                            attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        
        #print(logits.shape)
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        #
        pred.append(list(p) for p in np.argmax(logits, axis=2))
        
        #
        true_labels.append(label_ids)
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    
    torch.save(model.state_dict(), 'bert_uncased_attentionmask.pth')
    
    from sklearn.metrics import f1_score
    tmp_f1=f1_score(pred_tags , valid_tags,average='macro')
    if tmp_f1>tmp:
        torch.save(model.state_dict(), 'bert_uncased_SI.pth')
    print("F1-Score: {}".format(f1_score(pred_tags , valid_tags,average='macro')))
    
 
    

south
florida
muslim
leader
sofian
zakkout
david
duke
day
south florida muslim leader sofian zakkout david duke day
['0', '0', '0', '0', '0', '0', '0', '0', '0']


Using TensorFlow backend.


1
Tesla P100-PCIE-16GB


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


south florida muslim leader sofian zakkout david duke day
['south', 'florida', 'muslim', 'leader', 'sofia', '##n', 'za', '##kko', '##ut', 'david', 'duke', 'day']
(14885, 128)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=361.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

Train loss: 0.5036991468506822
Validation loss: 0.6081043768387574
Validation Accuracy: 0.6469854608282343


Epoch:  14%|█▍        | 1/7 [03:22<20:13, 202.20s/it]

F1-Score: 0.4412132930614571
Train loss: 0.3058424625114054
Validation loss: 0.7097693873712649
Validation Accuracy: 0.7329933860085227


Epoch:  29%|██▊       | 2/7 [06:44<16:51, 202.34s/it]

F1-Score: 0.4578833283953121
Train loss: 0.2055117714352698
Validation loss: 0.958174603489729
Validation Accuracy: 0.7646859975961539


Epoch:  43%|████▎     | 3/7 [10:06<13:28, 202.24s/it]

F1-Score: 0.46052171257271723
Train loss: 0.138776995457485
Validation loss: 1.136403695322
Validation Accuracy: 0.7766100476671766


Epoch:  57%|█████▋    | 4/7 [13:28<10:06, 202.12s/it]

F1-Score: 0.4866751515363561
Train loss: 0.09758572739273182
Validation loss: 1.7093272203436265
Validation Accuracy: 0.8269153941761365


Epoch:  71%|███████▏  | 5/7 [16:50<06:44, 202.12s/it]

F1-Score: 0.49178370314466774
Train loss: 0.07719366211651989
Validation loss: 2.059458752664236
Validation Accuracy: 0.8518984067690122


Epoch:  86%|████████▌ | 6/7 [20:12<03:22, 202.03s/it]

F1-Score: 0.49916521697148014
Train loss: 0.060983501229826924
Validation loss: 1.7349001346872404
Validation Accuracy: 0.8194763610413024


Epoch: 100%|██████████| 7/7 [23:33<00:00, 201.93s/it]

F1-Score: 0.5273995019771422





Model Predictions

In [3]:
cnn1 = nn.Conv1d(768, 128, kernel_size=3, padding=1)
cnn2 = nn.Conv1d(128, 3, kernel_size=3, padding=1)

In [4]:
inp = torch.randn(32, 128,768)
inp=inp.permute(0,2,1)
inp.size()
output = cnn2(cnn1(inp))
output=output.permute(0,2,1)
print(inp.size()
,output.size())

torch.Size([32, 768, 128]) torch.Size([32, 128, 3])


In [5]:
#Loading the model
# model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))    

# model.load_state_dict(torch.load('/kaggle/output/bert-new/bert_uncased_SI.pth'))
# model.load_state_dict(torch.load('/kaggle/working/bert-new/bert_uncased_SI.pth'))


# model.cuda()



# Fetching test data

def text_preprocessing(text):
    text= text.lower()
    text= re.sub(r'[^a-z]',' ',text)
    text= ' '.join(text.split())
    return(text)
def get_wordchar_indicies(sent):
    k_l = list(sent)
    k_l_b = [0 if i==' ' else 1 for i in k_l]
    k_df = pd.DataFrame({'char':k_l, 'space_mark':k_l_b})
    k_df = k_df.reset_index()
    k_df['u1'] = k_df['space_mark'].diff()
    k_df['u1'].fillna(1, inplace=True)
    k_df.loc[k_df['u1']==1, 'u2']= k_df.loc[k_df['u1']==1, 'u1'].cumsum()
    k_df.loc[k_df['u1']==-1, 'u2']= k_df.loc[k_df['u1']==-1, 'u1']
    k_df['u2'] = k_df['u2'].ffill(axis=0)
    k_df = k_df[k_df['u2']!=-1]
    k_df_gb = pd.DataFrame(k_df.groupby(['u2'])['index'].min())
    k_df_gb['last_index_word'] = k_df.groupby(['u2'])['index'].max()
    k_df_gb = k_df_gb.reset_index().rename(columns={'u2':'word_index','index':'first_index_word'})
    try:
        k_df_gb['words'] = sent.split()
    except:
        print(k_df_gb, sent)
    return k_df_gb

def indices_sentence(article_id,path):
    f= open(path + 'article' + str(article_id) + '.txt',"r")
    indices={}
    start_index = 0
    for i, line in enumerate(f):
        indices[i] = {}
        indices[i]['article_id']=article_id
        indices[i]['span_present'] = 0
        indices[i]['sentence'] = line
        indices[i]['start_index'] = start_index
        indices[i]['end_index'] = start_index + len(line)
        start_index = indices[i]['end_index']   
        
        if line == '\n':
            indices[i]['word_st_index'] = [0]
            indices[i]['word_en_index'] = [0]
        else:
            wordchar_df = get_wordchar_indicies(line)
            indices[i]['word_st_index'] = list(wordchar_df['first_index_word'])
            indices[i]['word_en_index'] = list(wordchar_df['last_index_word'])        
        
    return indices
def get_test_sentences(article_ids, dev_id, articles_path):
    se_dict=indices_sentence(article_ids[dev_id],articles_path)
    test_sent=[]
    for i in range(len(se_dict)):
                   test_sent.append(se_dict[i]['sentence'])
    return se_dict, test_sent



def prep_test_dataloader(article_ids, dev_id, articles_path):
    se_dict,test_sentences=get_test_sentences(article_ids,dev_id,articles_path)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    
    tokenized_test_sentences = [tokenizer.tokenize(sent) for sent in test_sentences]
    input_test_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_test_sentences],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    attention_test_masks = [[float(i>0) for i in ii] for ii in input_test_ids]
    test_inputs = torch.tensor(input_test_ids)

    test_masks = torch.tensor(attention_test_masks)
    test_data = TensorDataset(test_inputs, test_masks)

    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)
    return test_dataloader,test_sentences


def inverse_reg_enco(sentences,labels_tok):
    label_inverse=[]
    for i,sent in enumerate(sentences) :
        label_o=[]
        word_count=0
        tok_count=0
        for word in sent.split():            
            #print(word)
            tok=tokenizer.tokenize(word)
            tok_count=len(tok)+tok_count
            #print(tok_count)
            if tok_count>MAX_LEN:
                break    
            #print(word_count)
            if len(tok)>1:
                #print('True')
                label_word_token=labels_tok[i][word_count:word_count+len(tok)]
                #print(label_word_token)
                if '1' in label_word_token:
                    label_o.append('1')
                else:
                    label_o.append('0')
                word_count=word_count+len(tok)
            else:
                #print(labels_tok[i][word_count])
                label_o.append(labels_tok[i][word_count])
                word_count=word_count+1    
        label_inverse.append(label_o)
    return label_inverse

# out_label_list = [[] for _ in range(out_label_ids.shape[0])]
# preds_list = [[] for _ in range(out_label_ids.shape[0])]
# def inverse_reg_enco_modify(sentences,labels_tok):
#     for i in range(labels_tok.shape[0]):
#         for j in range(labels_tok.shape[1]):
#             if labels_tok[i, j] != 2:
#                 out_label_list[i].append(label_map[out_label_ids[i][j]])
#                 preds_list[i].append(label_map[preds[i][j]])
    
    
    

def thresh_logit(logits,thresh):
    predictions_per_batch=[]
    for i in range(logits.shape[0]):
        predictions_per_sentence=[]
        for j in range(logits.shape[1]):
            if (np.argmax(logits[i][j])==1 and logits[i][j][1]>=thresh):
                predictions_per_sentence.append(1)
            else:
                predictions_per_sentence.append(0)
        predictions_per_batch.append(predictions_per_sentence)
    return predictions_per_batch 


#from seqeval.metrics import f1_score
def predict_bert(model,test_dataloader,test_sentences,thresh,threshold=False):
    
    model.eval()
    predictions = []
    l=[]
    #true_labels = []
    #eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask = batch
        inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
        with torch.no_grad():
            #tmp_eval_loss = model(b_input_ids, token_type_ids=None,
            #                      attention_mask=b_input_mask, labels=b_labels)
#             logits = model(b_input_ids, token_type_ids=None,
#                            attention_mask=b_input_mask)
            outputs = model(**inputs)
            logits = outputs[0]
            

        m=torch.nn.Softmax(dim=2)
        logits_softmax=m(logits)
        logits = logits.detach().cpu().numpy()
        logits_softmax = logits_softmax.detach().cpu().numpy()
        #print(logits.shape)
        #print(logits_softmax.shape)
        #l.extend(logits_softmax)
        if threshold:
            predictions_per_batch=thresh_logit(logits,thresh)
            predictions.extend(predictions_per_batch)
        else:
            predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
            
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1

    pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
    pred_tags_l=inverse_reg_enco(test_sentences,pred_tags)
    padded_pred_tags_l=pad_sequences([[tag2idx.get(l) for l in lab] for lab in pred_tags_l],
                     maxlen=MAX_LEN, value = tag2idx['0'], padding="post",
                     dtype="long", truncating="post")
    return padded_pred_tags_l,pred_tags


def get_spans_article(article_ids, dev_id, articles_path,padded_pred_tags_l):
    pad_style='Post'
    spangaps_to_merge = [1, 2]
    def get_truewordindex(wi, sl):
        if sl>=max_len:
            return wi
        else:
            return wi-(max_len-sl)
    ar=np.array(padded_pred_tags_l)
    #print(padded_pred_tags_l.shape)
    a,b=np.where(ar==1)
    
    df_pred=pd.DataFrame({'sent_id':a,'word_index':b})
    #print(len(df_pred))
    meta_dict, test_sentences= get_test_sentences(article_ids, dev_id, articles_path)
    org_sent_len = len(test_sentences)

    len_sentence={i:len(test_sentences[i].split()) for i in range(len(test_sentences))}
    #print(len_sentence)
    df_pred['sent_length']=df_pred['sent_id'].map(len_sentence)
    df_pred = df_pred[df_pred['sent_id']<org_sent_len]

    # if len(df_pred)==0:
    #     return df_pred, df_pred, df_pred
    # df_pred['true_word_index']=df_pred['word_index']-(max_len-df_pred['sent_length'])
    if pad_style == 'pre':
        df_pred['true_word_index'] = list(map(lambda x, y: get_truewordindex(x, y) , 
                                          df_pred['word_index'], df_pred['sent_length']))
    else:
        df_pred['true_word_index'] = df_pred['word_index'].values
    #print(df_pred)
    df_pred = df_pred[~(df_pred['true_word_index']>=df_pred['sent_length'])] 
    
    #print(len(df_pred))
    df_pred['diff_pred']=df_pred.groupby(['sent_id'])['true_word_index'].diff()
    df_pred['diff_pred'] = df_pred['diff_pred'].apply(
                        lambda x:-1 if x in spangaps_to_merge else np.nan)
    #print (len(df_pred))
    #print(df_pred)
    if len(df_pred)<1:
        print("Escaping empty dataframe :")
        print(dev_id)
        submsn_df=pd.DataFrame()
        sent_pred=pd.DataFrame()
        return df_pred, sent_pred, submsn_df
        pass
        
    else:
        df_pred.loc[df_pred['diff_pred'].isnull(), 'diff_pred_1'] = \
                    df_pred.groupby(['sent_id'])['diff_pred'].cumcount()
        df_pred['diff_pred_1'] = df_pred['diff_pred_1'].ffill(axis=0)
        df_pred['span_id'] = list(map(lambda x, y: str(int(x)) + '_' + str(int(y)), 
                                      df_pred['sent_id'], df_pred['diff_pred_1']))
        # req_cols = ['sent_id', 'word_index', 'sent_length', 'true_word_index', 'span_id']
        # df_pred = df_pred[req_cols]
        # df_pred.loc[df_pred['true_word_index']==-1, 'true_word_index'] = 0
        sent_pred = pd.DataFrame(df_pred.groupby(['sent_id', 'span_id'])['true_word_index'].min())
        sent_pred['span_max_word_index'] = df_pred.groupby(['sent_id', 'span_id'])['true_word_index'].max()
        sent_pred = sent_pred.rename(columns={'true_word_index':'span_min_word_index'}).reset_index()
        #print(sent_pred)
        submsn_df = pd.DataFrame()
        for i, _id in enumerate(sent_pred['span_id'].tolist()):
            #print(_id)
            submsn_df.loc[i, 'article_id'] = article_ids[dev_id]
            span_min_word = sent_pred.loc[sent_pred['span_id']==_id, 'span_min_word_index'].values[0]
            #print(span_min_word)
            span_max_word = sent_pred.loc[sent_pred['span_id']==_id, 'span_max_word_index'].values[0]
            #print(span_max_word)
            sentence_id = int(_id.split('_')[0])
            sent_start_index = meta_dict[sentence_id]['start_index']
            try:
                submsn_df.loc[i, 'span_start'] = sent_start_index + meta_dict[sentence_id]['word_st_index'][span_min_word]
                submsn_df.loc[i, 'span_end'] = sent_start_index + meta_dict[sentence_id]['word_en_index'][span_max_word] 
            except:
                print("Escaping submsn_df :" )
                print(dev_id,_id)

        return df_pred, sent_pred, submsn_df


import os

dev_article_ids = [int(file.replace('article', '').replace('.txt', '')) for file in os.listdir(dev_articles_path)]
thresh=0.8
list_article=[]
main_span_df = pd.DataFrame()
for dev_id in tqdm(range(len(dev_article_ids))):
    test_dataloader,test_sentences=prep_test_dataloader(dev_article_ids, dev_id, dev_articles_path)
    padded_pred_tags_l,_=predict_bert(model,test_dataloader,test_sentences,thresh,threshold=False)
    list_article.append(padded_pred_tags_l)
    a, s,submsn_df = get_spans_article(dev_article_ids, dev_id, dev_articles_path,padded_pred_tags_l)
    #print(dev_id)
    #print(len(submsn_df))
    main_span_df = pd.concat([main_span_df,submsn_df],sort=False,axis=0)
print(main_span_df.shape)

mdf=main_span_df[(main_span_df['span_end']>main_span_df['span_start'])]
print(mdf.shape)

_sub_ver='0'
np.savetxt('bert_submsn_dev'+str(_sub_ver)+'.txt',mdf.values, fmt='%d', delimiter='\t')

with open('predictions_dev.pkl', 'wb') as f:
    pickle.dump(list_article, f)
    

 95%|█████████▍| 71/75 [02:30<00:07,  1.95s/it]

Escaping empty dataframe :
70


100%|██████████| 75/75 [02:41<00:00,  2.15s/it]

(1330, 3)
(1330, 3)





In [6]:
!python3 /kaggle/input/scoring/scoring/tools/task-SI_scorer.py -s /kaggle/working/bert_submsn_dev0.txt -r /kaggle/input/scoring/scoring/dev-labels-task1-span-identification/ -m


2020-04-23 08:56:45,281 - INFO - Checking user submitted file
2020-04-23 08:56:45,435 - INFO - Scoring the submission with precision and recall method
2020-04-23 08:56:45,889 - INFO - Precision=430.793239/1330=0.323905	Recall=558.129344/940=0.593755
2020-04-23 08:56:45,889 - INFO - F1=0.419153


In [7]:
import os
test_article_ids = [int(file.replace('article', '').replace('.txt', '')) for file in os.listdir(test_articles_path)]
thresh=0.8
list_article=[]
main_span_df = pd.DataFrame()
for test_id in tqdm(range(len(test_article_ids))):
    test_dataloader,test_sentences=prep_test_dataloader(test_article_ids, test_id, test_articles_path)
    padded_pred_tags_l,_=predict_bert(model,test_dataloader,test_sentences,thresh,threshold=False)
    list_article.append(padded_pred_tags_l)
    a, s,submsn_df = get_spans_article(test_article_ids, test_id, test_articles_path,padded_pred_tags_l)
    #print(dev_id)
    #print(len(submsn_df))
    main_span_df = pd.concat([main_span_df,submsn_df],sort=False,axis=0)
print(main_span_df.shape)

mdf=main_span_df[(main_span_df['span_end']>main_span_df['span_start'])]
print(mdf.shape)

_sub_ver='0'
np.savetxt('bert_submsn_test'+str(_sub_ver)+'.txt',mdf.values, fmt='%d', delimiter='\t')

with open('predictions_test.pkl', 'wb') as f:
    pickle.dump(list_article, f)

 97%|█████████▋| 87/90 [02:45<00:04,  1.40s/it]

Escaping empty dataframe :
86


100%|██████████| 90/90 [02:51<00:00,  1.91s/it]

(1669, 3)
(1668, 3)





In [8]:
!python3 /kaggle/input/scoring/scoring/tools/task-SI_scorer.py -s /kaggle/working/bert_submsn_test0.txt -r /kaggle/input/scoring/scoring/test-labels-task1-span-identification/ -m

2020-04-23 08:59:38,506 - INFO - Checking user submitted file
2020-04-23 08:59:38,713 - INFO - Scoring the submission with precision and recall method
2020-04-23 08:59:39,303 - INFO - Precision=699.328697/1668=0.419262	Recall=728.083724/1379=0.527979
2020-04-23 08:59:39,304 - INFO - F1=0.467382
