* TRAIN JSON: 8000 Examples
* TEST JSON: 2717 Examples (ID 8001 to 10717)

In [4]:
import os

# Folder that holds the JSON splits listed in `all_filenames`.
# The original machine-specific location stays as the default so existing
# runs are unaffected; set the SEMEVAL_DATA_DIR environment variable to point
# the notebook at a different copy of the data without editing this cell.
data_dir = os.environ.get(
    "SEMEVAL_DATA_DIR",
    r"D:\59 Github Projects\0014 RelExtraction\RIFRE\datasets\data\new_semeval",
)
# Split files, in processing order: index 0 = train, index 1 = test.
all_filenames = ['train.json', 'test.json']

In [17]:
import json
import os
import csv
import pandas as pd
import numpy as np


def open_json(json_file_path, data_format=list):
    """
    Load a JSON / JSON-Lines file in the requested container format.

    Parameters
    ----------
    json_file_path : str or path-like
        File to read. Text is always decoded as UTF-8.
    data_format : type or str, default ``list``
        * ``dict`` / ``'dict'``: parse the whole file as a single JSON
          document with ``json.load``; the result is whatever the top-level
          JSON value is (object or array).
        * ``list`` / ``'list'``: treat the file as JSON-Lines and return a
          list with one parsed value per non-blank line.
        * ``pd.DataFrame`` / ``'pd.DataFrame'``: read JSON-Lines records with
          ``pd.read_json(..., orient="records", lines=True)``.

    Returns
    -------
    The parsed data in the requested format.

    Raises
    ------
    NotImplementedError
        If ``data_format`` is not one of the supported values.
    """
    if data_format==dict or data_format=='dict':
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(json_file_path, encoding='utf-8') as json_file:
            data = json.load(json_file)
    elif data_format==list or data_format=='list':
        # Context manager closes the handle; blank lines are skipped because
        # json.loads would raise on them.
        with open(json_file_path, encoding='utf-8') as json_file:
            data = [json.loads(line) for line in json_file if line.strip()]
    elif data_format==pd.DataFrame or data_format=='pd.DataFrame':
        data = pd.read_json(json_file_path, orient="records", lines=True)
    else:
        raise NotImplementedError
    return data

# Load the second split (all_filenames[1]) for a first look at the raw records.
file = all_filenames[1]
fn = os.path.join(data_dir, file)
# data_format=dict parses the file as one JSON document via json.load.
original_data = open_json(fn, data_format=dict)
# Preview only the first few records; echoing the whole dataset floods the
# notebook output without adding information.
original_data[:5]

[{'id': '8001',
  'tokens': ['The',
   'most',
   'common',
   'audits',
   'were',
   'about',
   'waste',
   'and',
   'recycling',
   '.'],
  'label': 'Message-Topic(e1,e2)',
  'entities': [[3, 4], [6, 7]]},
 {'id': '8002',
  'tokens': ['The', 'company', 'fabricates', 'plastic', 'chairs', '.'],
  'label': 'Product-Producer(e2,e1)',
  'entities': [[4, 5], [1, 2]]},
 {'id': '8003',
  'tokens': ['The',
   'school',
   'master',
   'teaches',
   'the',
   'lesson',
   'with',
   'a',
   'stick',
   '.'],
  'label': 'Instrument-Agency(e2,e1)',
  'entities': [[8, 9], [2, 3]]},
 {'id': '8004',
  'tokens': ['The',
   'suspect',
   'dumped',
   'the',
   'dead',
   'body',
   'into',
   'a',
   'local',
   'reservoir',
   '.'],
  'label': 'Entity-Destination(e1,e2)',
  'entities': [[5, 6], [9, 10]]},
 {'id': '8005',
  'tokens': ['Avian',
   'influenza',
   'is',
   'an',
   'infectious',
   'disease',
   'of',
   'birds',
   'caused',
   'by',
   'type',
   'A',
   'strains',
   'of',
   'th

In [12]:
# Rebuild each example's sentence by joining its tokens with single spaces.
sentences = [' '.join(record['tokens']) for record in original_data]
len(sentences)

2717

In [16]:
# Sanity check for the "exactly two entities per example" assumption that the
# two-way unpacking of v['entities'] in run() relies on. Records with fewer
# than two entities would break that unpacking just like records with more,
# so report anything that is not exactly two. Prints nothing when all is well.
for v in original_data:
    if len(v['entities']) != 2:
        print(v)

In [20]:
# Scratch check: nested sequence unpacking splits two [start, end] pairs
# into four separate names in a single assignment.
(a, b), (c, d) = [1, 2], [3, 4]
a

1

In [13]:
# Number of distinct sentence strings. A value below len(sentences) means some
# sentences occur in more than one example.
len(set(sentences))

2715

In [33]:
from collections import defaultdict
import re


def clean(t):
    """
    Attach each argument marker directly to the text it wraps.

    For both ``ARG0`` and ``ARG1``, drops the single space that follows an
    opening tag (``<ARGn> `` -> ``<ARGn>``) and the single space that precedes
    a closing tag (`` </ARGn>`` -> ``</ARGn>``). All other text is returned
    unchanged.
    """
    for arg_name in ("ARG0", "ARG1"):
        opening = "<" + arg_name + ">"
        closing = "</" + arg_name + ">"
        t = re.sub(opening + " ", opening, t)
        t = re.sub(" " + closing, closing, t)
    return t


def run(file):
    """
    Convert one JSON split into a flat DataFrame of tagged examples.

    The split is read from ``os.path.join(data_dir, file)`` with
    ``open_json(..., data_format=dict)``. Every record must provide
    ``'tokens'``, ``'label'`` and ``'entities'`` (two ``[start, end]`` pairs
    used as token-slice bounds).

    Output columns:
      * corpus       - constant ``'semeval2010t8'``
      * doc_id       - base name of the input file
      * sent_id      - running id of the distinct sentence text within this file
      * eg_id        - running id of the example among those sharing a sent_id
      * index        - ``corpus_docid_sentid_egid``
      * text         - tokens joined by single spaces
      * text_w_pairs - same text with the first entity wrapped in
                       ``<ARG0>...</ARG0>`` and the second in ``<ARG1>...</ARG1>``
      * pair_label   - 1 if ``'Cause-Effect'`` occurs in the label, else 0
      * seq_label    - maximum pair_label over all examples of the sentence
      * context      - always ``''``
      * num_sents    - always 1
    """
    # Load the split
    fn = os.path.join(data_dir, file)
    original_data = open_json(fn, data_format=dict)

    def get_s_e_t_list(s_head=[], e_head=[], s_tail=[], e_tail=[]):
        # Collect (start, end, opening_tag) triples for head (<ARG0>) and
        # tail (<ARG1>) spans, accepting scalars or list/array inputs, and
        # return them sorted so tags are inserted left to right.
        starts, ends, markers = [], [], []
        for begin, finish, marker in ((s_head, e_head, '<ARG0>'), (s_tail, e_tail, '<ARG1>')):
            if isinstance(begin, (np.ndarray, list)):
                starts.extend(begin)
                ends.extend(finish)
                markers.extend([marker] * len(begin))
            else:
                starts.append(begin)
                ends.append(finish)
                markers.append(marker)
        return sorted(zip(starts, ends, markers))

    # Build one row per example
    corpus = 'semeval2010t8'
    cols = ['corpus','doc_id','sent_id','eg_id','index','text','text_w_pairs','seq_label','pair_label','context','num_sents']
    doc_id = os.path.basename(fn)
    rows = []
    text_to_sentid = {}
    sentid_counter = defaultdict(int)
    max_sentid = 0
    for v in original_data:
        tokens = v['tokens']
        text = ' '.join(tokens)

        # First time we meet this sentence: hand out the next free id
        if text not in text_to_sentid:
            text_to_sentid[text] = max_sentid
            max_sentid += 1
        sentid = text_to_sentid[text]

        # Examples sharing a sentence are numbered 0, 1, 2, ...
        eg_id = sentid_counter[sentid]
        sentid_counter[sentid] = eg_id + 1

        identifiers = [corpus, doc_id, str(sentid), str(eg_id)]
        unique_index = '_'.join(identifiers)

        # Binary causal label; seq_label is aggregated per sentence below
        pair_label = int('Cause-Effect' in v['label'])
        seq_label = pair_label

        # Exactly two entities are expected per example
        [s_head, e_head], [s_tail, e_tail] = v['entities']
        marked = tokens
        spans = get_s_e_t_list(s_head, e_head, s_tail, e_tail)
        for position, (s, e, t) in enumerate(spans):
            # Every earlier span added an opening and a closing tag token
            shift = 2 * position
            s += shift
            e += shift
            end_t = f"{t[0]}/{t[1:]}"
            marked = [*marked[:s], t, *marked[s:e], end_t, *marked[e:]]
        text_w_pairs = clean(' '.join(marked))

        rows.append(identifiers + [
            unique_index,
            text.strip(),
            text_w_pairs.strip(),
            seq_label,
            pair_label,
            '',
            1,
        ])

    frame = pd.DataFrame(rows, columns=cols)
    frame['sent_id'] = frame['sent_id'].astype(str)
    # A sentence is causal if any of its examples is
    sentence_keys = ['corpus','doc_id','sent_id']
    frame['seq_label'] = frame.groupby(sentence_keys)['seq_label'].transform('max')

    return frame

# Try the conversion on a single split first (all_filenames[1]) and display
# the resulting frame.
file = all_filenames[1]
data = run(file)
data

Unnamed: 0,corpus,doc_id,sent_id,eg_id,index,text,text_w_pairs,seq_label,pair_label,context,num_sents
0,semeval2010t8,test.json,0,0,semeval2010t8_test.json_0_0,The most common audits were about waste and re...,The most common <ARG0>audits</ARG0> were about...,0,0,,1
1,semeval2010t8,test.json,1,0,semeval2010t8_test.json_1_0,The company fabricates plastic chairs .,The <ARG1>company</ARG1> fabricates plastic <A...,0,0,,1
2,semeval2010t8,test.json,2,0,semeval2010t8_test.json_2_0,The school master teaches the lesson with a st...,The school <ARG1>master</ARG1> teaches the les...,0,0,,1
3,semeval2010t8,test.json,3,0,semeval2010t8_test.json_3_0,The suspect dumped the dead body into a local ...,The suspect dumped the dead <ARG0>body</ARG0> ...,0,0,,1
4,semeval2010t8,test.json,4,0,semeval2010t8_test.json_4_0,Avian influenza is an infectious disease of bi...,Avian <ARG1>influenza</ARG1> is an infectious ...,1,1,,1
...,...,...,...,...,...,...,...,...,...,...,...
2712,semeval2010t8,test.json,2710,0,semeval2010t8_test.json_2710_0,"After seating all the idols , which itself tak...","After seating all the idols , which itself tak...",0,0,,1
2713,semeval2010t8,test.json,2711,0,semeval2010t8_test.json_2711_0,The minister attributed the slow production of...,The minister attributed the slow production of...,0,0,,1
2714,semeval2010t8,test.json,2712,0,semeval2010t8_test.json_2712_0,The umbrella frame is provided with a movable ...,The <ARG1>umbrella</ARG1> <ARG0>frame</ARG0> i...,0,0,,1
2715,semeval2010t8,test.json,2713,0,semeval2010t8_test.json_2713_0,Manos: The Hands of Fate is a low-budget horro...,Manos: The Hands of Fate is a low-budget horro...,0,0,,1


In [36]:
# Convert every split and stack the results into one frame.
# Frames are collected first and concatenated once: calling pd.concat inside
# the loop re-copies all previous rows on every pass and starts from an empty
# frame, which recent pandas versions warn about.
frames = []
for file in all_filenames:
    df = run(file)
    frames.append(df)

# Each split keeps its own 0-based row index, so index labels repeat across
# splits (unchanged from before). Fall back to an empty frame when there are
# no files, as pd.concat rejects an empty list.
data = pd.concat(frames) if frames else pd.DataFrame()

data

Unnamed: 0,corpus,doc_id,sent_id,eg_id,index,text,text_w_pairs,seq_label,pair_label,context,num_sents
0,semeval2010t8,train.json,0,0,semeval2010t8_train.json_0_0,The system as described above has its greatest...,The system as described above has its greatest...,0,0,,1
1,semeval2010t8,train.json,1,0,semeval2010t8_train.json_1_0,The child was carefully wrapped and bound into...,The <ARG1>child</ARG1> was carefully wrapped a...,0,0,,1
2,semeval2010t8,train.json,2,0,semeval2010t8_train.json_2_0,The author of a keygen uses a disassembler to ...,The <ARG1>author</ARG1> of a keygen uses a <AR...,0,0,,1
3,semeval2010t8,train.json,3,0,semeval2010t8_train.json_3_0,A misty ridge uprises from the surge .,A misty <ARG1>ridge</ARG1> uprises from the <A...,0,0,,1
4,semeval2010t8,train.json,4,0,semeval2010t8_train.json_4_0,The student association is the voice of the un...,The <ARG0>student</ARG0> <ARG1>association</AR...,0,0,,1
...,...,...,...,...,...,...,...,...,...,...,...
2712,semeval2010t8,test.json,2710,0,semeval2010t8_test.json_2710_0,"After seating all the idols , which itself tak...","After seating all the idols , which itself tak...",0,0,,1
2713,semeval2010t8,test.json,2711,0,semeval2010t8_test.json_2711_0,The minister attributed the slow production of...,The minister attributed the slow production of...,0,0,,1
2714,semeval2010t8,test.json,2712,0,semeval2010t8_test.json_2712_0,The umbrella frame is provided with a movable ...,The <ARG1>umbrella</ARG1> <ARG0>frame</ARG0> i...,0,0,,1
2715,semeval2010t8,test.json,2713,0,semeval2010t8_test.json_2713_0,Manos: The Hands of Fate is a low-budget horro...,Manos: The Hands of Fate is a low-budget horro...,0,0,,1


In [38]:
from collections import Counter

def _print_label_counts(title, frame):
    """Print the seq/pair label distribution of `frame` under a heading."""
    print(title)
    # Sentence-level causality, counted once per example
    print('Seq Level:', Counter(frame['seq_label']))
    # ARG0-ARG1 pair-level causality
    print('Pair Level:', Counter(frame['pair_label']))
    # Sentence-level causality, counted once per distinct sentence
    per_sentence = frame.drop_duplicates(subset=['corpus','doc_id','sent_id'])
    print('Seq Level (Unique):', Counter(per_sentence['seq_label']))


_print_label_counts('All Examples', data)
_print_label_counts('\nSingle Sentence Examples', data.loc[data['num_sents']==1])

All Examples
Seq Level: Counter({0: 9386, 1: 1331})
Pair Level: Counter({0: 9386, 1: 1331})
Seq Level (Unique): Counter({0: 9363, 1: 1327})

Single Sentence Examples
Seq Level: Counter({0: 9386, 1: 1331})
Pair Level: Counter({0: 9386, 1: 1331})
Seq Level (Unique): Counter({0: 9363, 1: 1327})


In [37]:
# Persist the combined examples. The output folder is created first because
# to_csv does not create missing directories. index=False leaves the row
# index out of the file; the 'utf-8-sig' codec writes a leading BOM.
os.makedirs('cleaned', exist_ok=True)
data.to_csv('cleaned/semeval2010t8.csv', index=False,encoding='utf-8-sig')