In [1]:
################################################################################
# Verify to make sure we understand the json format properly
# Each json file contains one root list, each item in the list has two parts:
# A section_title and a text
################################################################################

import os
import json
# Walk the training directory, parse every file as JSON and report any file
# that holds a list entry whose length is not exactly 2.
total_training_files = 0
for dirname, _, filenames in os.walk('/kaggle/input/coleridgeinitiative-show-us-the-data/train'):
    for file_path in (os.path.join(dirname, name) for name in filenames):
        with open(file_path) as json_file:
            sections = json.load(json_file)
        total_training_files += 1

        # The path is printed once per offending entry, so a file with several
        # malformed entries shows up several times.
        for section in sections:
            if len(section) != 2:
                print(file_path)

print(f'Total training files: {total_training_files}')
print('All checked')

Total training files: 14316
All checked


# NER data preparation

In [None]:
import csv
import pandas as pd
import json
import nltk
import csv
from nltk.tokenize import word_tokenize
import re

# Training label table; later cells read its 'Id' and 'dataset_label' columns.
train_csv_path = '/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv'
df = pd.read_csv(train_csv_path, index_col=None)

In [None]:
# Compiled once instead of on every call.
# "(XYZ)": three or more upper-case letters inside parentheses.
_BRACKET_ACRONYM_PATTERN = re.compile(r'\([A-Z][A-Z][A-Z]+\)')
# "Xxxx Xxxx": two consecutive capitalised words separated by a single space.
# The first word needs the trailing '+' as well; without it only a
# two-character token directly before the space could match, so ordinary
# phrases such as "National Education" were never detected.
_CAPITALIZED_PAIR_PATTERN = re.compile(r'[A-Z][A-Za-z0-9]+ [A-Z][A-Za-z0-9]+')


def worthy_negative(sentence):
    """Decide whether an unlabelled sentence is worth keeping as a negative sample.

    A sentence qualifies when it looks like it could mention a named entity:
    it contains either a parenthesised acronym such as ``(XYZ)`` or two
    consecutive capitalised words such as ``Xxxx Xxxx``.

    Args:
        sentence: The sentence text to inspect.

    Returns:
        True if either pattern is found anywhere in the sentence, else False.
    """
    if _BRACKET_ACRONYM_PATTERN.search(sentence) is not None:
        return True

    return _CAPITALIZED_PAIR_PATTERN.search(sentence) is not None

# Group the dataset labels of the global `df` by publication id
def get_fid_labels_dict():
    """Collect, for every 'Id' in the global `df`, the set of its 'dataset_label' values.

    Prints a progress line after every 2000 rows.

    Returns:
        dict mapping each Id to a set of the dataset labels listed for it.
    """
    labels_by_fid = {}

    for row_number, (_, record) in enumerate(df.iterrows(), start=1):
        if row_number % 2000 == 0:
            print(f'Processed {row_number} documents')

        # setdefault creates the empty set the first time an Id is seen
        labels_by_fid.setdefault(record['Id'], set()).add(record['dataset_label'])

    return labels_by_fid

def generate_sentence_label_dataset(
    save_csv_file_path,
    train_dir='/kaggle/input/coleridgeinitiative-show-us-the-data/train',
):
    """Split every training document into sentences and write the useful ones to CSV.

    Each output row is ``fid, sid, sentence, labels`` where ``labels`` is the
    '||'-joined list of the document's dataset labels found verbatim in the
    sentence. Sentences without any label are written with an empty label
    field, but only if `worthy_negative` accepts them; all other sentences are
    skipped. Sentence ids (``S0``, ``S1``, ...) are assigned to every tokenized
    sentence, written or not.

    Args:
        save_csv_file_path: Destination CSV path (overwritten, no header row).
        train_dir: Directory holding one ``<fid>.json`` file per document.
            Defaults to the competition's training folder.
    """
    fid_labels_dict = get_fid_labels_dict()

    sid = 0
    with open(save_csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')

        # Iterate over unique document ids. The label table repeats an Id once
        # per label, so looping over its rows would tokenize and write the same
        # document several times and produce duplicate sentences.
        for processed, (fid, labels) in enumerate(fid_labels_dict.items(), start=1):
            if processed % 2000 == 0:
                print(f'Processed {processed} documents')

            # Sets have no stable iteration order for strings across runs;
            # sorting keeps the joined label column reproducible.
            ordered_labels = sorted(labels)

            with open(f'{train_dir}/{fid}.json', encoding='utf-8') as json_file:
                data = json.load(json_file)

            for section in data:
                for s in nltk.sent_tokenize(section['text']):
                    # All labels of this document that occur verbatim in the sentence
                    contained_labels = [label for label in ordered_labels if label in s]

                    if contained_labels:
                        writer.writerow([fid, f'S{sid}', s, '||'.join(contained_labels)])
                    elif worthy_negative(s):
                        # csv.writer emits None as an empty field
                        writer.writerow([fid, f'S{sid}', s, None])

                    sid += 1

    # sid was incremented once per tokenized sentence, so it already is the count
    print(f'Total training sentences: {sid}')

In [None]:
# Build the sentence-level dataset (rows of fid, sid, sentence, labels) that the
# following cells read back from ./ner_sentences.csv.
generate_sentence_label_dataset('./ner_sentences.csv')

In [None]:
def label_dedup(sentence, label_list):
    """Reduce the labels found in a sentence to a set without overlapping/nested ones.

    Every occurrence of every label is located in the sentence, then the
    occurrences are scanned left to right (longer label first when two start
    at the same offset). An occurrence that overlaps the current anchor is
    ignored. A non-overlapping occurrence replaces any kept label it contains,
    and is itself dropped when it is contained in a label that is already kept.

    Args:
        sentence: The sentence text.
        label_list: Candidate label strings.

    Returns:
        set of the labels that survive de-duplication (empty if none occur).
    """
    # (label, start, end) for each literal match of each label
    hits = [
        (label, *match.span())
        for label in label_list
        for match in re.finditer(re.escape(label), sentence)
    ]
    hits.sort(key=lambda hit: (hit[1], -len(hit[0])))

    kept = set()
    anchor = None
    for hit in hits:
        text, start, _ = hit

        if anchor is None:
            anchor = hit
            kept.add(text)
            continue

        # Overlaps the anchor's span: ignore this occurrence
        if start < anchor[2]:
            continue

        superseded = set()
        is_new = True
        for existing in kept:
            if text in existing:
                # Already covered by a kept label; stop looking
                is_new = False
                break
            if existing in text:
                superseded.add(existing)

        for stale in superseded:
            kept.remove(stale)

        if is_new:
            anchor = hit
            kept.add(text)

    # The anchor's own label may have been removed above without the anchor
    # moving on, so make sure it is present in the result.
    if anchor is not None:
        kept.add(anchor[0])

    return kept

In [None]:
import json
import nltk
import csv
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

# Load the sentence-level dataset; keep_default_na=False keeps empty label
# fields as '' instead of NaN.
sentence_df = pd.read_csv(
    './ner_sentences.csv',
    index_col=None,
    header=None,
    keep_default_na=False,
    names=['fid', 'sid', 'sentence', 'labels'],
    dtype={
        'fid': 'str',
        'sid': 'str', 
        'sentence': 'str',
        'labels': 'str',
    },
)

# Dedup overlapping labels for each sentence
dedup_labels = []
for processed, (sentence, labels) in enumerate(
    zip(sentence_df['sentence'], sentence_df['labels']), start=1
):
    if processed % 50000 == 0:
        print(f'Processed {processed} documents')

    if not labels:
        # Negative sample: no labels to deduplicate
        dedup_labels.append(None)
    else:
        dedup_label_set = label_dedup(sentence, labels.split('||'))
        # label_dedup returns a set, whose iteration order for strings varies
        # between interpreter runs; sort so the written CSVs are reproducible.
        dedup_labels.append('||'.join(sorted(dedup_label_set)))

sentence_df['dedup_labels'] = dedup_labels

# Split training and testing (fixed seed for a repeatable split)
x_train, x_test = train_test_split(sentence_df, test_size=0.10, random_state=2021)
print(len(x_train))
print(len(x_test))

x_train.to_csv('./sentence_training.csv', na_rep='', index=False)
x_test.to_csv('./sentence_test.csv', na_rep='', index=False)

In [None]:
# Reload the training split with every column read as a string; empty label
# fields stay '' because default NA parsing is switched off.
sentence_column_dtypes = dict.fromkeys(
    ['fid', 'sid', 'sentence', 'labels', 'dedup_labels'], 'str'
)
train_df = pd.read_csv(
    './sentence_training.csv',
    header='infer',
    index_col=None,
    keep_default_na=False,
    dtype=sentence_column_dtypes,
)

In [None]:
def get_occurances(word_tokens, s):
    """Return every index at which `s` occurs in `word_tokens`, in ascending order.

    Args:
        word_tokens: Sequence to search (must support ``.index(value, start)``).
        s: The item to look for.

    Returns:
        list of matching indices; empty when `s` does not occur.
    """
    positions = []
    cursor = 0
    while True:
        try:
            # Resume the search just past the previous hit
            cursor = word_tokens.index(s, cursor) + 1
        except ValueError:
            break
        positions.append(cursor - 1)
    return positions

# Given a sentence as a list of words and a list of words for dataset_label, update the given tag sequence
def update_tags(word_tokens, label_tokens, tags):
    """Mark every occurrence of a tokenized label inside a tokenized sentence.

    For each position where `label_tokens` appears as a contiguous run in
    `word_tokens`, the first token is tagged 'B-D' and the remaining tokens
    'I-D'. `tags` is modified in place and also returned.

    Args:
        word_tokens: Tokens of the sentence.
        label_tokens: Tokens of the dataset label. An empty list leaves `tags`
            untouched.
        tags: Current tag sequence, one entry per sentence token.

    Returns:
        The (same) `tags` list after tagging.

    Raises:
        AssertionError: If an occurrence covers a token that is already tagged
            with something other than 'O'. The overlapping occurrence is not
            written at all, so no half-tagged entity is left behind.
    """
    # A label that tokenizes to nothing cannot match anything
    if not label_tokens:
        return tags

    span = len(label_tokens)
    for start in get_occurances(word_tokens, label_tokens[0]):
        end = start + span

        # The first token matched; require the whole label to follow.
        # A slice running past the end is shorter and therefore unequal.
        if list(word_tokens[start:end]) != list(label_tokens):
            continue

        # Check the whole span before writing anything. Raised explicitly
        # (rather than via `assert`) so the check also runs under `python -O`.
        if any(tag != 'O' for tag in tags[start:end]):
            raise AssertionError('Label overlap!')

        tags[start] = 'B-D'
        for position in range(start + 1, end):
            tags[position] = 'I-D'

    return tags

def generate_word_tag_dataset(input_file_path, output_file_path):
    """Convert a sentence-level CSV into a token-level ``sid, word, tag`` CSV.

    Each sentence is tokenized with `word_tokenize`; all tokens start as 'O'
    and `update_tags` is applied once per entry of the '||'-separated
    'dedup_labels' column. When `update_tags` raises AssertionError the
    sentence id is printed and processing continues with the next label.

    Args:
        input_file_path: CSV with header row and the columns fid, sid,
            sentence, labels, dedup_labels.
        output_file_path: Destination CSV (overwritten, no header row), one
            row per token.
    """
    string_columns = ['fid', 'sid', 'sentence', 'labels', 'dedup_labels']
    sentences = pd.read_csv(
        input_file_path,
        header='infer',
        index_col=None,
        keep_default_na=False,
        dtype={column: 'str' for column in string_columns},
    )

    with open(output_file_path, 'w', newline='', encoding='utf-8') as out_file:
        tag_writer = csv.writer(out_file, delimiter=',')

        for _, record in sentences.iterrows():
            sid = record['sid']
            label_field = record['dedup_labels']

            # One tag per token, all outside any entity to begin with
            word_tokens = word_tokenize(record['sentence'])
            tags = ['O'] * len(word_tokens)

            has_labels = label_field is not None and label_field > ''
            for label in (label_field.split('||') if has_labels else []):
                label_tokens = word_tokenize(label)
                try:
                    tags = update_tags(word_tokens, label_tokens, tags)
                except AssertionError:
                    print(f'{sid} label overlap!')

            tag_writer.writerows(
                [f'{sid}', word, tag] for word, tag in zip(word_tokens, tags)
            )

# Produce the token-level (sid, word, tag) files for both sentence-level splits.
generate_word_tag_dataset('./sentence_training.csv', './ner_training.csv')
generate_word_tag_dataset('./sentence_test.csv', './ner_test.csv')

In [None]:
# Load the token-level training file (no header row); all three columns are
# read as strings and default NA parsing is off, so tokens such as "NA" or
# "null" stay literal text.
ner_columns = ['sid', 'word', 'tag']
ner_train_df = pd.read_csv(
    './ner_training.csv',
    header=None,
    names=ner_columns,
    index_col=None,
    keep_default_na=False,
    dtype=dict.fromkeys(ner_columns, 'str'),
)