# 3. Collect ATOMIC-Extract

In [None]:
import pandas as pd
import json

import spacy
from tqdm import tqdm

## 3.1 Load ATOMIC Dataset

In [16]:
colnames = ['head', 'relation', 'tail']
atomic_train = pd.read_csv('./atomic2020/train.tsv', sep='\t', names=colnames, header=None)
display(atomic_train.head())

Unnamed: 0,head,relation,tail
0,PersonX abandons ___ altogether,oEffect,none
1,PersonX abandons ___ altogether,oEffect,none
2,PersonX abandons ___ altogether,oReact,dejected
3,PersonX abandons ___ altogether,oWant,none
4,PersonX abandons ___ altogether,oWant,none


In [17]:
atomic_dev = pd.read_csv('./atomic2020/dev.tsv', sep='\t', names=colnames, header=None)
display(atomic_dev.head())

Unnamed: 0,head,relation,tail
0,PersonX 'd better go,oEffect,none
1,PersonX 'd better go,oEffect,none
2,PersonX 'd better go,oReact,none
3,PersonX 'd better go,oReact,none
4,PersonX 'd better go,oWant,none


In [18]:
atomic_test = pd.read_csv('./atomic2020/test.tsv', sep='\t', names=colnames, header=None)
display(atomic_test.head())

Unnamed: 0,head,relation,tail
0,PersonX abuses PersonX's power,oEffect,are told what to do
1,PersonX abuses PersonX's power,oEffect,given unfair consequences or punishment
2,PersonX abuses PersonX's power,oEffect,reach out for help
3,PersonX abuses PersonX's power,oEffect,none
4,PersonX abuses PersonX's power,oReact,humiliated


## 3.2 Load PIQA-Verb

In [19]:
piqa_verbs = set(line.strip() for line in open('./outputs/PIQA-Verb/full.txt'))
print(len(piqa_verbs))

3100


## 3.3 Collect ATOMIC-Extract

### 3.3.1 Find ATOMIC Verbs by ```en_core_web_trf``` in spaCy

In [20]:
atomic_train_length = len(atomic_train)
print(atomic_train_length)

1076880


In [21]:
atomic_dev_length = len(atomic_dev)
print(atomic_dev_length)

102024


In [22]:
atomic_test_length = len(atomic_test)
print(atomic_test_length)

152209


In [23]:
# check if tail is float
display(atomic_train[atomic_train['tail'].apply(lambda x: isinstance(x, float))])

Unnamed: 0,head,relation,tail
6000,PersonX affects PersonY's health,oReact,
6018,PersonX affects PersonY's health,xReact,
6048,PersonX affects PersonY's life,xReact,
17115,PersonX asks PersonY to let,oReact,
36675,PersonX binds together the ___,xReact,
...,...,...,...
1063998,PersonX uses PersonX's internet,HinderedBy,
1064418,PersonX hears a knock,HinderedBy,
1064740,PersonX plays catch,HinderedBy,
1064868,PersonX shows it to PersonY's parents,HinderedBy,


In [24]:
nlp = spacy.load('en_core_web_trf')

atomic_train['verbs_head'] = None
atomic_train['verbs_tail'] = None
for row in tqdm(atomic_train.itertuples(), total=atomic_train_length):
    if (not isinstance(row.head, str)) or (not isinstance(row.tail, str)):
        continue

    col1 = nlp(row.head)
    col2 = nlp(row.tail)

    tmp1 = set([token.lemma_ for token in col1 if token.pos_ == 'VERB'])
    tmp2 = set([token.lemma_ for token in col2 if token.pos_ == 'VERB'])

    atomic_train.at[row.Index, 'verbs_head'] = tmp1
    atomic_train.at[row.Index, 'verbs_tail'] = tmp2

display(atomic_train.head())

100%|██████████| 1076880/1076880 [2:38:57<00:00, 112.91it/s]   


Unnamed: 0,head,relation,tail,verbs_head,verbs_tail
0,PersonX abandons ___ altogether,oEffect,none,{abandon},{}
1,PersonX abandons ___ altogether,oEffect,none,{abandon},{}
2,PersonX abandons ___ altogether,oReact,dejected,{abandon},{deject}
3,PersonX abandons ___ altogether,oWant,none,{abandon},{}
4,PersonX abandons ___ altogether,oWant,none,{abandon},{}


In [25]:
atomic_dev['verbs_head'] = None
atomic_dev['verbs_tail'] = None
for row in tqdm(atomic_dev.itertuples(), total=atomic_dev_length):
    if (not isinstance(row.head, str)) or (not isinstance(row.tail, str)):
        continue
    
    col1 = nlp(row.head)
    col2 = nlp(row.tail)

    tmp1 = set([token.lemma_ for token in col1 if token.pos_ == 'VERB'])
    tmp2 = set([token.lemma_ for token in col2 if token.pos_ == 'VERB'])

    atomic_dev.at[row.Index, 'verbs_head'] = tmp1
    atomic_dev.at[row.Index, 'verbs_tail'] = tmp2

100%|██████████| 102024/102024 [09:00<00:00, 188.62it/s]


In [26]:
atomic_test['verbs_head'] = None
atomic_test['verbs_tail'] = None
for row in tqdm(atomic_test.itertuples(), total=atomic_test_length):
    if (not isinstance(row.head, str)) or (not isinstance(row.tail, str)):
        continue
    
    col1 = nlp(row.head)
    col2 = nlp(row.tail)

    tmp1 = set([token.lemma_ for token in col1 if token.pos_ == 'VERB'])
    tmp2 = set([token.lemma_ for token in col2 if token.pos_ == 'VERB'])

    atomic_test.at[row.Index, 'verbs_head'] = tmp1
    atomic_test.at[row.Index, 'verbs_tail'] = tmp2

100%|██████████| 152209/152209 [13:56<00:00, 182.06it/s]


### 3.3.2 Find ATOMIC Data which including PIQA-Verb

In [27]:
atomic_train['match'] = None
for row in tqdm(atomic_train.itertuples(), total=atomic_train_length):
    if (not isinstance(row.verbs_head, set)) or (not isinstance(row.verbs_tail, set)) or (not isinstance(piqa_verbs, set)):
        continue

    if (row.verbs_head & piqa_verbs) or (row.verbs_tail & piqa_verbs):
        atomic_train.at[row.Index, 'match'] = True
    
    else:
        atomic_train.at[row.Index, 'match'] = False
        
display(atomic_train[atomic_train['match'] == True].head())
print(len(atomic_train[atomic_train['match'] == True]))
print(len(atomic_train[atomic_train['match'] == False]))

100%|██████████| 1076880/1076880 [00:04<00:00, 243739.69it/s]


Unnamed: 0,head,relation,tail,verbs_head,verbs_tail,match
0,PersonX abandons ___ altogether,oEffect,none,{abandon},{},True
1,PersonX abandons ___ altogether,oEffect,none,{abandon},{},True
2,PersonX abandons ___ altogether,oReact,dejected,{abandon},{deject},True
3,PersonX abandons ___ altogether,oWant,none,{abandon},{},True
4,PersonX abandons ___ altogether,oWant,none,{abandon},{},True


1005246
71489


In [28]:
atomic_dev['match'] = None
for row in tqdm(atomic_dev.itertuples(), total=atomic_dev_length):
    if (not isinstance(row.verbs_head, set)) or (not isinstance(row.verbs_tail, set)) or (not isinstance(piqa_verbs, set)):
        continue
    
    if (row.verbs_head & piqa_verbs) or (row.verbs_tail & piqa_verbs):
        atomic_dev.at[row.Index, 'match'] = True
    
    else:
        atomic_dev.at[row.Index, 'match'] = False

display(atomic_dev[atomic_dev['match'] == True].head())
print(len(atomic_dev[atomic_dev['match'] == True]))
print(len(atomic_dev[atomic_dev['match'] == False]))

100%|██████████| 102024/102024 [00:00<00:00, 245810.77it/s]


Unnamed: 0,head,relation,tail,verbs_head,verbs_tail,match
0,PersonX 'd better go,oEffect,none,{go},{},True
1,PersonX 'd better go,oEffect,none,{go},{},True
2,PersonX 'd better go,oReact,none,{go},{},True
3,PersonX 'd better go,oReact,none,{go},{},True
4,PersonX 'd better go,oWant,none,{go},{},True


98328
3677


In [29]:
atomic_test['match'] = None
for row in tqdm(atomic_test.itertuples(), total=atomic_test_length):
    if (not isinstance(row.verbs_head, set)) or (not isinstance(row.verbs_tail, set)) or (not isinstance(piqa_verbs, set)):
        continue
    
    if (row.verbs_head & piqa_verbs) or (row.verbs_tail & piqa_verbs):
        atomic_test.at[row.Index, 'match'] = True
    
    else:
        atomic_test.at[row.Index, 'match'] = False

display(atomic_test[atomic_test['match'] == True].head())
print(len(atomic_test[atomic_test['match'] == True]))
print(len(atomic_test[atomic_test['match'] == False]))

100%|██████████| 152209/152209 [00:00<00:00, 239936.87it/s]


Unnamed: 0,head,relation,tail,verbs_head,verbs_tail,match
0,PersonX abuses PersonX's power,oEffect,are told what to do,{abuse},"{tell, do}",True
1,PersonX abuses PersonX's power,oEffect,given unfair consequences or punishment,{abuse},{give},True
2,PersonX abuses PersonX's power,oEffect,reach out for help,{abuse},{reach},True
3,PersonX abuses PersonX's power,oEffect,none,{abuse},{},True
4,PersonX abuses PersonX's power,oReact,humiliated,{abuse},{humiliate},True


142916
9276


### 3.3.3 Save ATOMIC-Extract

In [30]:
atomic_train[atomic_train['match'] == True][['head', 'relation', 'tail']].to_csv(
    "./outputs/ATOMIC-Extract/train.tsv", sep='\t', index=False, header=False)

In [31]:
atomic_dev[atomic_dev['match'] == True][['head', 'relation', 'tail']].to_csv(
    "./outputs/ATOMIC-Extract/dev.tsv", sep='\t', index=False, header=False)

In [32]:
atomic_test[atomic_test['match'] == True][['head', 'relation', 'tail']].to_csv(
    "./outputs/ATOMIC-Extract/test.tsv", sep='\t', index=False, header=False)

### 3.3.4 Save ATOMIC-Extract-Full

In [33]:
atomic_train[atomic_train['match'] == True]['head', 'relation', 'tail', 'verbs_head', 'verbs_tail'].to_csv(
    "./outputs/ATOMIC-Extract/train_full.tsv", sep='\t', index=False, header=False)

In [34]:
atomic_dev[atomic_dev['match'] == True]['head', 'relation', 'tail', 'verbs_head', 'verbs_tail'].to_csv(
    "./outputs/ATOMIC-Extract/dev_full.tsv", sep='\t', index=False, header=False)

In [35]:
atomic_test[atomic_test['match'] == True].to_csv(
    "./outputs/ATOMIC-Extract/test_full.tsv", sep='\t', index=False, header=False)