In [1]:
import os
import pandas as pd
import numpy as np
import json
import re
from nltk.tokenize import sent_tokenize 
from transformers import BertTokenizer, AutoTokenizer
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import transformers
from tqdm import tqdm
import glob

import datetime 

import warnings
warnings.filterwarnings('ignore')

# Config

In [2]:
# Execution environment selector. Only 'Azure' and 'Kaggle' have dedicated
# entries; every other value (including the current one) uses the fallback paths.
platform = 'Sage'
model_name = 'bert_base_uncased.bin'

# (bert_path, test_path, model_path) for each explicitly supported platform.
_PLATFORM_PATHS = {
    'Azure': (
        '/home/thanish/transformer_models/bert_base_uncased',
        '../test/*',
        '../output/' + model_name,
    ),
    'Kaggle': (
        '../input/bertlargeuncasedpytorch',
        '/kaggle/input/coleridgeinitiative-show-us-the-data/test/*',
        '../input/coleridgemodels/' + model_name,
    ),
}

# Paths used when `platform` has no entry in the table above.
_FALLBACK_PATHS = (
    'C:/Users/thanisb/Documents/transformer_models/bert_base_uncased/',
    '../test/*',
    '../output/' + model_name,
)

bert_path, test_path, model_path = _PLATFORM_PATHS.get(platform, _FALLBACK_PATHS)

# Shared settings read by the cells below.
# Note: the tokenizer is loaded by its hub name; `bert_path` is not used here.
config = dict(
    MAX_LEN=128,
    tokenizer=AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True),
    batch_size=5,
    Epoch=10,
    test_path=test_path,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    model_path=model_path,
)

# Reading the train csv

In [23]:
# Load the training table from the CSV located one directory above the notebook.
train = pd.read_csv("../train.csv")
# Bare expression: the notebook renders the frame as this cell's output.
train

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
...,...,...,...,...,...
19656,b3498176-8832-4033-aea6-b5ea85ea04c4,RSNA International Trends: A Global Perspectiv...,RSNA International COVID-19 Open Radiology Dat...,RSNA International COVID Open Radiology Database,rsna international covid open radiology database
19657,f77eb51f-c3ac-420b-9586-cb187849c321,MCCS: a novel recognition pattern-based method...,CAS COVID-19 antiviral candidate compounds dat...,CAS COVID-19 antiviral candidate compounds dat...,cas covid 19 antiviral candidate compounds dat...
19658,ab59bcdd-7b7c-4107-93f5-0ccaf749236c,Quantitative Structure–Activity Relationship M...,CAS COVID-19 antiviral candidate compounds dat...,CAS COVID-19 antiviral candidate compounds dat...,cas covid 19 antiviral candidate compounds dat...
19659,fd23e7e0-a5d2-4f98-992d-9209c85153bb,A ligand-based computational drug repurposing ...,CAS COVID-19 antiviral candidate compounds dat...,CAS COVID-19 antiviral candidate compounds dat...,cas covid 19 antiviral candidate compounds dat...


# Getting the extra label

In [106]:
# Two additional lists of dataset titles, each stored in its own CSV file.
extra_label_800 = pd.read_csv("../extra_data/data_set_800.csv")
extra_label_26897 = pd.read_csv("../extra_data/data_set_26897.csv")

print(extra_label_26897.shape)
print(extra_label_800.shape)

# Stack both lists into a single frame with a fresh 0..n-1 index.
extra_label_final = pd.concat([extra_label_26897, extra_label_800]).reset_index(drop=True)
print(extra_label_final.shape)

# Normalise the titles exactly like clean_text() does (lower-case, every run of
# non-alphanumeric characters replaced by one space, surrounding spaces trimmed).
# The expression is inlined because clean_text() is only defined further down
# the notebook, so calling it here raises NameError when the notebook is run
# top to bottom on a fresh kernel.
extra_label_final['title'] = extra_label_final['title'].apply(
    lambda x: re.sub('[^A-Za-z0-9]+', ' ', str(x).lower()).strip()
)

# Removing the train label: keep only the titles that are not already present
# as a cleaned train label. np.setdiff1d returns the sorted unique values.
unique_extra_label_final = np.setdiff1d(extra_label_final['title'], train.cleaned_label)
print(unique_extra_label_final.shape)


(26896, 1)
(2339, 1)
(29235, 1)
(23500,)


# Combining the labels together

In [110]:
# Collapse the training table to one row per publication Id:
#   label_count - number of non-null cleaned labels for that Id
#   label       - those cleaned labels joined into one '|'-separated string
train_df = (
    train
    .groupby('Id')['cleaned_label']
    .agg(label_count='count', label='|'.join)
    .reset_index()
)
train_df

Unnamed: 0,Id,label_count,label
0,0007f880-0a9b-492d-9a58-76eb0b0e0bd7,1,program for the international assessment of ad...
1,0008656f-0ba2-4632-8602-3017b44c2e90,1,trends in international mathematics and scienc...
2,000e04d6-d6ef-442f-b070-4309493221ba,1,agricultural resources management survey
3,000efc17-13d8-433d-8f62-a3932fe4f3b8,2,adni|alzheimer s disease neuroimaging initiati...
4,0010357a-6365-4e5f-b982-582e6d32c3ee,1,genome sequence of covid 19
...,...,...,...
14311,ffd19b3c-f941-45e5-9382-934b5041ec96,1,census of agriculture
14312,ffd4d86a-0f26-44cc-baed-f0e209cc22af,1,alzheimer s disease neuroimaging initiative adni
14313,ffe7f334-245a-4de7-b600-d7ff4e28bfca,1,genome sequences of sars cov 2
14314,ffeb3568-7aed-4dbe-b177-cbd7f46f34af,1,trends in international mathematics and scienc...


# Reading all the json train files

In [111]:
def read_all_json(df, train_dir='../train'):
    '''
    Read the JSON file of every record listed in ``df.Id``.

    Parameters
    ----------
    df : object with an ``Id`` attribute
        ``df.Id`` must be a sized iterable of record ids (for example a
        DataFrame column). Each id is used as the file stem.
    train_dir : str, optional
        Directory that contains the ``<id>.json`` files. Defaults to
        ``'../train'``, the location the notebook has always read from.

    Returns
    -------
    dict
        Maps each id to the parsed content of its JSON file.

    Raises
    ------
    FileNotFoundError
        If the file for one of the ids does not exist.
    '''
    text_data = {}
    for rec_id in tqdm(df.Id, total=len(df.Id)):
        location = f'{train_dir}/{rec_id}.json'

        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default text encoding, which is not UTF-8 on every system.
        with open(location, 'r', encoding='utf-8') as f:
            text_data[rec_id] = json.load(f)

    print("All files read")

    return text_data

In [112]:
%time 
data_dict = read_all_json(df=train_df)

  4%|▍         | 576/14316 [00:00<00:02, 5758.73it/s]

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs


100%|██████████| 14316/14316 [00:02<00:00, 5434.17it/s]

All files read





In [89]:
def clean_text(txt):
    '''
    Normalise a value to lower-case alphanumeric words separated by single spaces.

    The input is converted with ``str()``, lower-cased, every run of characters
    outside ``A-Z``, ``a-z`` and ``0-9`` is replaced by one space, and leading
    and trailing whitespace is removed.
    '''
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

In [114]:
def data_joining(data_dict_id):
    '''
    Concatenate the 'text' field of every section of one document.

    ``data_dict_id`` is the sequence of section dictionaries of a single
    document; the section texts are joined, in order, with '. ' between them
    and returned as one string.
    '''
    return '. '.join(section['text'] for section in data_dict_id)

In [116]:
def make_shorter_sentence(sentence, max_length=None, overlap=20):
    '''
    Split a text into cleaned sentences of bounded word count.

    The text is split with ``sent_tokenize``; each sentence is normalised with
    ``clean_text`` and split on single spaces. A sentence with fewer than
    ``max_length`` words is kept whole. A longer one is cut into windows of up
    to ``max_length`` words, where consecutive windows share ``overlap`` words.

    Parameters
    ----------
    sentence : str
        Text to split.
    max_length : int, optional
        Maximum number of space-separated words per returned sentence.
        Defaults to ``config['MAX_LEN']``.
    overlap : int, optional
        Number of words shared by consecutive windows. Defaults to 20.

    Returns
    -------
    list of str
        Cleaned sentences / windows in their original order.

    Raises
    ------
    ValueError
        If ``overlap`` is not smaller than ``max_length`` (the window could
        never advance).
    '''
    if max_length is None:
        max_length = config['MAX_LEN']

    stride = max_length - overlap
    if stride <= 0:
        raise ValueError("overlap must be smaller than max_length")

    final_sentences = []

    for raw_sentence in sent_tokenize(sentence):
        # clean_text already removes every non-alphanumeric character and
        # trims the result, so no further '.' removal or stripping is needed.
        cleaned = clean_text(raw_sentence)
        words = cleaned.split(" ")

        if len(words) < max_length:
            final_sentences.append(cleaned)
            continue

        for start in range(0, len(words), stride):
            final_sentences.append(" ".join(words[start:start + max_length]))
            # Once a window reaches the last word, any further window would
            # only repeat words that this one already contains.
            if start + max_length >= len(words):
                break

    return final_sentences

In [118]:
# This is the original sentence

def form_labels(sentence, labels_list):
    '''
    Tag the occurrences of the given labels inside a document.

    The document is split with ``make_shorter_sentence`` and every resulting
    sentence, as well as every label, is tokenised with ``config['tokenizer']``.
    Each position where a label's token sequence appears in a sentence is
    tagged 'B' on all of its tokens; every other token is tagged 'O'.

    Parameters
    ----------
    sentence : str
        Full text of one document.
    labels_list : list of str
        Labels to search for.

    Returns
    -------
    matched_token : list of str
        Sentences that contain at least one label occurrence.
    matched_kwords : list of list of str
        For each matched sentence, the label of every occurrence found
        (a label is repeated when it occurs more than once).
    label : list of list of str
        For each matched sentence, one 'B'/'O' tag per tokenizer token.
    un_matched_kwords : list of str
        Sentences in which no label was found, one entry per sentence.
    '''
    tokenizer = config['tokenizer']

    # Tokenise each label once instead of once per sentence. A label that
    # yields no tokens is skipped: an empty token sequence compares equal to
    # the empty slice at every position and would tag the whole sentence.
    keyword_pieces = []
    for kword in labels_list:
        pieces = tokenizer.tokenize(kword)
        if pieces:
            keyword_pieces.append((kword, pieces))

    matched_token = []
    matched_kwords = []
    label = []
    un_matched_kwords = []

    for tok in make_shorter_sentence(sentence):
        tok_split = tokenizer.tokenize(tok)

        tags = ['O'] * len(tok_split)  # one tag per token of this sentence
        hits = []                      # label of every occurrence found

        for kword, pieces in keyword_pieces:
            width = len(pieces)
            # Only start positions that leave room for the whole label.
            for i in range(len(tok_split) - width + 1):
                if tok_split[i:i + width] == pieces:
                    tags[i:i + width] = ['B'] * width
                    hits.append(kword)

        if hits:
            matched_token.append(tok)
            matched_kwords.append(hits)
            label.append(tags)
        else:
            un_matched_kwords.append(tok)

    return matched_token, matched_kwords, label, un_matched_kwords

In [119]:
def labelling(dataset, data_dict):
    '''
    Build the labelled sentence data for every document in ``dataset``.

    Parameters
    ----------
    dataset : DataFrame
        Must provide an ``Id`` column and a ``label`` column holding the
        document's labels joined with '|'.
    data_dict : dict
        Maps each Id to the document content accepted by ``data_joining``.

    Returns
    -------
    sentences : list of str
        Every sentence in which at least one label was found.
    final_labels : list of list of str
        Tag sequence of each returned sentence.
    keywords : list of list of str
        Labels found in each returned sentence.
    Id_list : list
        Document Id of each returned sentence.
    '''
    sentences = []
    final_labels = []
    keywords = []
    Id_list = []
    un_matched_reviews = 0

    # Labels are read from the rows of `dataset` itself, so the function works
    # for any frame passed in and needs no per-document scan of another table.
    rows = zip(dataset['Id'], dataset['label'])
    for Id, joined_labels in tqdm(rows, total=len(dataset)):

        sentence = data_joining(data_dict[Id])
        labels = joined_labels.split("|")

        s, k, l, _ = form_labels(sentence=sentence, labels_list=labels)

        if len(s) == 0:
            # No label of this document was found in any of its sentences.
            un_matched_reviews += 1
            continue

        sentences.extend(s)
        keywords.extend(k)
        final_labels.extend(l)
        Id_list.extend([Id] * len(l))

    print("Total documents without any matched label:", un_matched_reviews)

    return sentences, final_labels, keywords, Id_list


In [1]:
# Label every training document.
train_sentences, train_labels, train_keywords, train_Id_list = labelling(dataset=train_df, data_dict=data_dict)

# labelling() requires data_dict; the call below previously omitted it and
# raised TypeError.
# NOTE(review): DF_valid is not defined in the visible part of this notebook,
# and data_dict is built from train_df's Ids -- confirm where DF_valid comes
# from and that all of its Ids are present in data_dict.
valid_sentences, valid_labels, valid_keywords, valid_Id_list = labelling(dataset=DF_valid, data_dict=data_dict)

print("")
print(f" train sentences: {len(train_sentences)}, train label: {len(train_labels)}, train keywords: {len(train_keywords)}, train_id list: {len(train_Id_list)}")

# Create DataFrame to remove the duplicates

In [217]:
# One row per labelled sentence. The list-valued 'label' and 'kword' columns
# are converted to their string form.
unique_df = (
    pd.DataFrame({
        'id': train_Id_list,
        'train_sentences': train_sentences,
        'kword': train_keywords,
        'label': train_labels,
    })
    .astype({'label': 'str', 'kword': 'str'})
)
# Number of space-separated words in each sentence.
unique_df['sent_len'] = unique_df.train_sentences.map(lambda text: len(text.split(" ")))
unique_df.head(60)

Unnamed: 0,id,train_sentences,kword,label,sent_len
0,0007f880-0a9b-492d-9a58-76eb0b0e0bd7,in fact organizations are now identifying digi...,['program for the international assessment of ...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",45
1,0008656f-0ba2-4632-8602-3017b44c2e90,besides not enough young people are entering s...,['trends in international mathematics and scie...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",94
2,000e04d6-d6ef-442f-b070-4309493221ba,1 manages access to results of the agricultura...,['agricultural resources management survey'],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'B', ...",26
3,000e04d6-d6ef-442f-b070-4309493221ba,the agricultural resources management survey a...,['agricultural resources management survey'],"['O', 'B', 'B', 'B', 'B', 'O', 'O', 'O', 'O', ...",29
4,000efc17-13d8-433d-8f62-a3932fe4f3b8,genetic and neuroimaging data on a sub sample ...,"['adni', 'alzheimer s disease neuroimaging ini...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",26
5,000efc17-13d8-433d-8f62-a3932fe4f3b8,this study used data from the nacc and adni da...,['adni'],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",11
6,000efc17-13d8-433d-8f62-a3932fe4f3b8,patient recruitment neuroimaging acquisition a...,['adni'],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",30
7,000efc17-13d8-433d-8f62-a3932fe4f3b8,the adni data set is from a multicenter longit...,['adni'],"['O', 'B', 'B', 'O', 'O', 'O', 'O', 'O', 'O', ...",35
8,000efc17-13d8-433d-8f62-a3932fe4f3b8,neither other scans nor snps were assessed fro...,['adni'],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",9
9,000efc17-13d8-433d-8f62-a3932fe4f3b8,54 neuroimaging and genetic parameters in adni...,"['adni', 'adni']","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",28


# Write the interim dataset

In [136]:
# Persist the labelled sentences as an interim CSV (row index not written).
unique_df.to_csv("../unique_train_df_5.csv", index=False)