In [1]:
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

Looking in links: file:///kaggle/input/coleridge-packages/packages/datasets
Processing /kaggle/input/coleridge-packages/packages/datasets/datasets-1.5.0-py3-none-any.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/tqdm-4.49.0-py2.py3-none-any.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/xxhash-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/huggingface_hub-0.0.7-py3-none-any.whl
Installing collected packages: tqdm, xxhash, huggingface-hub, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.55.1
    Uninstalling tqdm-4.55.1:
      Successfully uninstalled tqdm-4.55.1
Successfully installed datasets-1.5.0 huggingface-hub-0.0.7 tqdm-4.49.0 xxhash-2.0.0
Processing /kaggle/input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Processing /kaggle/input/coleridge-packages/tokenizers-

In [2]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

random.seed(123)
np.random.seed(456)

MAX_SAMPLE = None

In [3]:
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
train_df = train_df[:MAX_SAMPLE]

In [4]:
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
papers = {}
for paper_id in train_df['Id'].unique():
    with open(f'{paper_train_folder}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
        papers[paper_id] = paper

In [5]:
sub_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

test_files = glob.glob("../input/coleridgeinitiative-show-us-the-data/test/*.json")

# paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
# for paper_id in sub_df['Id']:
#     with open(f'{paper_test_folder}/{paper_id}.json', 'r') as f:
#         paper = json.load(f)
#         papers[paper_id] = paper

In [6]:
def clean_text(txt):
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()

def totally_clean_text(txt):
    txt = clean_text(txt)
    txt = re.sub(' +', ' ', txt)
    return txt

In [7]:
test_df = pd.DataFrame()

for test_file in test_files:
    text_data=pd.read_json(test_file)
    text_data.insert(0,'id', test_file.split('/')[-1].split('.')[0]) 
    test_df = pd.concat([test_df, text_data])

In [8]:
title1 = [temp_title.lower() for temp_title in train_df['dataset_label'].unique()]
label1 = [temp_label.lower() for temp_label in train_df['dataset_title'].unique()]
clabel1 = [temp_cleanLabel.lower() for temp_cleanLabel in train_df['cleaned_label'].unique()]

all_labels1 = list(set(list(map(clean_text, set(title1))) + list(map(clean_text, set(label1)))))

print(f'Total unique labels: {len(all_labels1)}')

Total unique labels: 133


In [9]:
test_df['cleaned_text'] = test_df.apply(lambda txt: clean_text(txt['text']).strip(), axis =1)

In [10]:
test_group = pd.DataFrame(test_df.groupby(['id'])['cleaned_text'].agg(' '.join))

In [11]:
result = test_group['cleaned_text'].apply(lambda txt: '|'.join({label for label in all_labels1 if label in txt})).reset_index()
result.columns =['Id', 'PredictionString']

In [12]:
literal_preds = []

for pred in result['PredictionString']:
    literal_preds.append(pred)

In [13]:
# literal_preds = []

# for paper_id in sample_submission['Id']:
#     paper = papers[paper_id]
#     text_1 = '. '.join(section['text'] for section in paper).lower()
#     text_2 = totally_clean_text(text_1)
    
#     labels = set()
#     for label in all_labels:
#         if label in text_1 or label in text_2:
#             labels.add(clean_text(label))
    
#     literal_preds.append('|'.join(labels))


In [14]:
literal_preds[:5]

['alzheimer s disease neuroimaging initiative adni|adni',
 'nces common core of data|common core of data|trends in international mathematics and science study',
 'slosh model|noaa storm surge inundation|sea lake and overland surges from hurricanes',
 'rural urban continuum codes']

In [15]:
MAX_LENGTH = 64 # max no. words for each sentence.
OVERLAP = 20 # if a sentence exceeds MAX_LENGTH, we split it to multiple sentences with overlapping

PREDICT_BATCH = 64000 

PRETRAINED_PATH = '../input/coleridge-bert-models/output'
TEST_INPUT_SAVE_PATH = './input_data'
TEST_NER_DATA_FILE = 'test_ner_input.json'
TRAIN_PATH = '../input/coleridge-bert-models/train_ner.json'
VAL_PATH = '../input/coleridge-bert-models/train_ner.json'

PREDICTION_SAVE_PATH = './pred'
PREDICTION_FILE = 'test_predictions.txt'

In [16]:
train = train_df.groupby('Id').agg({
    'pub_title': 'first',
    'dataset_title': '|'.join,
    'dataset_label': '|'.join,
    'cleaned_label': '|'.join
}).reset_index()

print(f'No. grouped training rows: {len(train_df)}')

No. grouped training rows: 19661


In [17]:
def clean_training_text(txt):
    """
    similar to the default clean_text function but without lowercasing.
    """
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt)).strip()

def shorten_sentences(sentences):
    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > MAX_LENGTH:
            for p in range(0, len(words), MAX_LENGTH - OVERLAP):
                short_sentences.append(' '.join(words[p:p+MAX_LENGTH]))
        else:
            short_sentences.append(sentence)
    return short_sentences

In [18]:
test_rows = [] # test data in NER format
paper_length = [] # store the number of sentences each paper has

for paper_id in result['Id']:
    # load paper
    paper = papers[paper_id]
    
    # extract sentences
    sentences = [clean_training_text(sentence) for section in paper 
                 for sentence in section['text'].split('.')
                ]
    sentences = shorten_sentences(sentences) # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars
    sentences = [sentence for sentence in sentences if any(word in sentence.lower() for word in ['data', 'study'])]
        
    # collect all sentences in json
    for sentence in sentences:
        sentence_words = sentence.split()
        dummy_tags = ['O']*len(sentence_words)
        test_rows.append({'tokens' : sentence_words, 'tags' : dummy_tags})
    
    # track which sentence belongs to which data point
    paper_length.append(len(sentences))
    
print(f'total number of sentences: {len(test_rows)}')

total number of sentences: 367


In [19]:
os.environ["MODEL_PATH"] = f"{PRETRAINED_PATH}"
os.environ["TRAIN_FILE"] = f"{TRAIN_PATH}"
os.environ["VALIDATION_FILE"] = f"{VAL_PATH}"
os.environ["TEST_FILE"] = f"{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}"
os.environ["OUTPUT_DIR"] = f"{PREDICTION_SAVE_PATH}"

In [20]:
# copy my_seqeval.py to the working directory because the input directory is non-writable
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# make necessart directories and files
os.makedirs(TEST_INPUT_SAVE_PATH, exist_ok=True)

In [21]:
def bert_predict():
    !python ../input/kaggle-ner-utils/kaggle_run_ner.py \
    --model_name_or_path "$MODEL_PATH" \
    --train_file "$TRAIN_FILE" \
    --validation_file "$VALIDATION_FILE" \
    --test_file "$TEST_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --report_to 'none' \
    --seed 123 \
    --do_predict

In [22]:
bert_outputs = []

for batch_begin in range(0, len(test_rows), PREDICT_BATCH):
    # write data rows to input file
    with open(f'{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}', 'w') as f:
        for row in test_rows[batch_begin:batch_begin+PREDICT_BATCH]:
            json.dump(row, f)
            f.write('\n')
    
    # remove output dir
    !rm -r "$OUTPUT_DIR"
    
    # do predict
    bert_predict()
    
    # read predictions
    with open(f'{PREDICTION_SAVE_PATH}/{PREDICTION_FILE}') as f:
        this_preds = f.read().split('\n')[:-1]
        bert_outputs += [pred.split() for pred in this_preds]

rm: cannot remove './pred': No such file or directory
2021-06-03 12:41:54.755225: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-642b227a9a1105c9/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02...
Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-642b227a9a1105c9/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02. Subsequent calls will reuse this data.
[INFO|configuration_utils.py:470] 2021-06-03 12:42:47,024 >> loading configuration file ../input/coleridge-bert-models/output/config.json
[INFO|configuration_utils.py:508] 2021-06-03 12:42:47,024 >> Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    

### Restore Dataset labels from predictions

In [23]:
# get test sentences
test_sentences = [row['tokens'] for row in test_rows]

del test_rows

In [24]:
bert_dataset_labels = [] # store all dataset labels for each publication

for length in paper_length:
    labels = set()
    for sentence, pred in zip(test_sentences[:length], bert_outputs[:length]):
        curr_phrase = ''
        for word, tag in zip(sentence, pred):
            if tag == 'B': # start a new phrase
                if curr_phrase:
                    labels.add(curr_phrase)
                    curr_phrase = ''
                curr_phrase = word
            elif tag == 'I' and curr_phrase: # continue the phrase
                curr_phrase += ' ' + word
            else: # end last phrase (if any)
                if curr_phrase:
                    labels.add(curr_phrase)
                    curr_phrase = ''
        # check if the label is the suffix of the sentence
        if curr_phrase:
            labels.add(curr_phrase)
            curr_phrase = ''
    
    # record dataset labels for this publication
    bert_dataset_labels.append(labels)
    
    del test_sentences[:length], bert_outputs[:length]

In [25]:
bert_dataset_labels[:5]

[{'Alzheimer s Disease Neuroimaging Initiative ADNI'},
 {'Trends in International Mathematics and Science Study'},
 set(),
 set()]

### Filter based on Jaccard score and clean

In [26]:
def jaccard_similarity(s1, s2):
    l1 = s1.split(" ")
    l2 = s2.split(" ")    
    intersection = len(list(set(l1).intersection(l2)))
    union = (len(l1) + len(l2)) - intersection
    return float(intersection) / union

filtered_bert_labels = []

for labels in bert_dataset_labels:
    filtered = []
    
    for label in sorted(labels, key=len):
        label = clean_text(label)
        if len(filtered) == 0 or all(jaccard_similarity(label, got_label) < 0.75 for got_label in filtered):
            filtered.append(label)
    
    filtered_bert_labels.append('|'.join(filtered))

In [27]:
filtered_bert_labels[:5]

['alzheimer s disease neuroimaging initiative adni',
 'trends in international mathematics and science study',
 '',
 '']

# Aggregate final predictions and write submission file

In [28]:
final_predictions = []
for literal_match, bert_pred in zip(literal_preds, filtered_bert_labels):
    if literal_match:
        final_predictions.append(literal_match)
    else:
        final_predictions.append(bert_pred)

In [29]:
result['PredictionString'] = final_predictions
result.head()

Unnamed: 0,Id,PredictionString
0,2100032a-7c33-4bff-97ef-690822c43466,alzheimer s disease neuroimaging initiative ad...
1,2f392438-e215-4169-bebf-21ac4ff253e1,nces common core of data|common core of data|t...
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,slosh model|noaa storm surge inundation|sea la...
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,rural urban continuum codes


In [30]:
result.to_csv(f'submission.csv', index=False)