# 1. Extract Verb in PIQA Dataset

In [8]:
import pandas as pd
import json

import spacy
from tqdm import tqdm

## 1.1 Open PIQA Dataset

In [9]:
tmp = []
with open('./PIQA/train.jsonl') as f:
    for obj in f:
        tmp.append(json.loads(obj))

piqa_train = pd.DataFrame(tmp)
display(piqa_train.head())

Unnamed: 0,goal,sol1,sol2
0,"When boiling butter, when it's ready, you can",Pour it onto a plate,Pour it into a jar
1,"To permanently attach metal legs to a chair, y...",Weld the metal together to get it to stay firm...,Nail the metal together to get it to stay firm...
2,how do you indent something?,leave a space before starting the writing,press the spacebar
3,how do you shake something?,move it up and down and side to side quickly.,stir it very quickly.
4,Clean tires,"Pour water, cape off caked on dirt. Use speed...","Pour water, scrape off caked on dirt. Use a st..."


In [10]:
tmp = []
with open('./PIQA/valid.jsonl') as f:
    for obj in f:
        tmp.append(json.loads(obj))

piqa_valid = pd.DataFrame(tmp)
display(piqa_valid.head())

Unnamed: 0,goal,sol1,sol2
0,How do I ready a guinea pig cage for it's new ...,Provide the guinea pig with a cage full of a f...,Provide the guinea pig with a cage full of a f...
1,dresser,replace drawer with bobby pin,"finish, woodgrain with bobby pin"
2,To fight Ivan Drago in Rocky for sega master s...,Drago isn't in this game because it was releas...,You have to defeat Apollo Creed and Clubber La...
3,Make outdoor pillow.,Blow into tin can and tie with rubber band.,Blow into trash bag and tie with rubber band.
4,ice box,will turn into a cooler if you add water to it,will turn into a cooler if you add soda to it


In [11]:
tmp = []
with open('./PIQA/tests.jsonl') as f:
    for obj in f:
        tmp.append(json.loads(obj))

piqa_tests = pd.DataFrame(tmp)
display(piqa_tests.head())

Unnamed: 0,goal,sol1,sol2
0,how do you puncture a vein?,hit it at the wrong angle and make it bleed.,pop it.
1,hands,is used to put on shoe,is used to put on milk jug
2,What ingredients do I need to make a shortcrus...,"To make pie crust, you will need flour, sugar,...","To make pie crust, you will need flour, sugar,..."
3,roast broccoli,Preheat oven to 450 degrees F. Toss the bro...,Preheat oven to 450 degrees F. Toss the bro...
4,To crimp the edges of the patsy crust.,Use a knife to crimp the edges.,Use a fork to crimp the edges


## 1.2 Extract PIQA Verbs

```$ python -m spacy download en_core_web_sm```

```$ python -m spacy download en_core_web_trf```

In [12]:
piqa_train_length = len(piqa_train)
print(piqa_train_length)

16113


In [13]:
piqa_valid_length = len(piqa_valid)
print(piqa_valid_length)

1838


In [14]:
piqa_tests_length = len(piqa_tests)
print(piqa_tests_length)

3084


### 1.2.1 Extract Verbs by ```en_core_web_sm```

In [15]:
nlp = spacy.load('en_core_web_sm')

piqa_train['verbs'] = None
piqa_train_verbs = set()

for row in tqdm(piqa_train.itertuples(), total=piqa_train_length):
    col1 = nlp(row.goal)
    col2 = nlp(row.sol1)
    col3 = nlp(row.sol2)
    
    tmp = set()
    tmp.update([token.lemma_ for token in col1 if token.pos_ == 'VERB'])
    tmp.update([token.lemma_ for token in col2 if token.pos_ == 'VERB'])
    tmp.update([token.lemma_ for token in col3 if token.pos_ == 'VERB'])

    piqa_train.at[row.Index, 'verbs'] = tmp
    piqa_train_verbs.update(tmp)

print(len(piqa_train_verbs))

100%|██████████| 16113/16113 [03:41<00:00, 72.77it/s]

2802





In [16]:
piqa_valid['verbs'] = None
piqa_valid_verbs = set()

for row in tqdm(piqa_valid.itertuples(), total=piqa_valid_length):
    col1 = nlp(row.goal)
    col2 = nlp(row.sol1)
    col3 = nlp(row.sol2)
    
    tmp = set()
    tmp.update([token.lemma_ for token in col1 if token.pos_ == 'VERB'])
    tmp.update([token.lemma_ for token in col2 if token.pos_ == 'VERB'])
    tmp.update([token.lemma_ for token in col3 if token.pos_ == 'VERB'])

    piqa_valid.at[row.Index, 'verbs'] = tmp
    piqa_valid_verbs.update(tmp)

print(len(piqa_valid_verbs))

100%|██████████| 1838/1838 [00:24<00:00, 75.89it/s]

1112





In [17]:
piqa_tests['verbs'] = None
piqa_tests_verbs = set()

for row in tqdm(piqa_tests.itertuples(), total=piqa_tests_length):
    col1 = nlp(row.goal)
    col2 = nlp(row.sol1)
    col3 = nlp(row.sol2)
    
    tmp = set()
    tmp.update([token.lemma_ for token in col1 if token.pos_ == 'VERB'])
    tmp.update([token.lemma_ for token in col2 if token.pos_ == 'VERB'])
    tmp.update([token.lemma_ for token in col3 if token.pos_ == 'VERB'])

    piqa_tests.at[row.Index, 'verbs'] = tmp
    piqa_tests_verbs.update(tmp)

print(len(piqa_tests_verbs))

100%|██████████| 3084/3084 [00:39<00:00, 77.84it/s]

1403





### 1.2.4 Save Extracted Verbs

In [18]:
piqa_train_verbs_df = pd.DataFrame(piqa_train_verbs)
piqa_train_verbs_df.to_csv("./output/piqa_train_verbs.txt", index=False, header=False)

In [19]:
piqa_valid_verbs_df = pd.DataFrame(piqa_valid_verbs)
piqa_valid_verbs_df.to_csv("./output/piqa_valid_verbs.txt", index=False, header=False)

In [20]:
piqa_tests_verbs_df = pd.DataFrame(piqa_tests_verbs)
piqa_tests_verbs_df.to_csv("./output/piqa_tests_verbs.txt", index=False, header=False)

In [48]:
piqa_verbs = set()
piqa_verbs.update(piqa_train_verbs)
piqa_verbs.update(piqa_valid_verbs)
piqa_verbs.update(piqa_tests_verbs)

print(len(piqa_verbs))

3100


In [49]:
piqa_verbs_df = pd.DataFrame(piqa_verbs)
piqa_verbs_df.to_csv("./output/piqa_verbs.txt", index=False, header=False)

# 2. Collect ATOMIC which including PIQA-Verb

## 2.1 Open ATOMIC Dataset

In [38]:
colnames = ['head', 'relation', 'tail']
atomic_train = pd.read_csv('./atomic2020/train.tsv', sep='\t', names=colnames, header=None)
display(atomic_train.head())

Unnamed: 0,head,relation,tail
0,PersonX abandons ___ altogether,oEffect,none
1,PersonX abandons ___ altogether,oEffect,none
2,PersonX abandons ___ altogether,oReact,dejected
3,PersonX abandons ___ altogether,oWant,none
4,PersonX abandons ___ altogether,oWant,none


In [39]:
atomic_dev = pd.read_csv('./atomic2020/dev.tsv', sep='\t', names=colnames, header=None)
display(atomic_dev.head())

Unnamed: 0,head,relation,tail
0,PersonX 'd better go,oEffect,none
1,PersonX 'd better go,oEffect,none
2,PersonX 'd better go,oReact,none
3,PersonX 'd better go,oReact,none
4,PersonX 'd better go,oWant,none


In [40]:
atomic_test = pd.read_csv('./atomic2020/test.tsv', sep='\t', names=colnames, header=None)
display(atomic_test.head())

Unnamed: 0,head,relation,tail
0,PersonX abuses PersonX's power,oEffect,are told what to do
1,PersonX abuses PersonX's power,oEffect,given unfair consequences or punishment
2,PersonX abuses PersonX's power,oEffect,reach out for help
3,PersonX abuses PersonX's power,oEffect,none
4,PersonX abuses PersonX's power,oReact,humiliated


## 2.2 Load PIQA Verbs

In [50]:
piqa_verbs = set(line.strip() for line in open('./output/piqa_verbs.txt'))
print(len(piqa_verbs))

3100


## 2.3 Extract ATOMIC Verbs

In [42]:
atomic_train_length = len(atomic_train)
print(atomic_train_length)

1076880


In [43]:
atomic_dev_length = len(atomic_dev)
print(atomic_dev_length)

102024


In [44]:
atomic_test_length = len(atomic_test)
print(atomic_test_length)

152209


In [54]:
# check if tail is float
display(atomic_train[atomic_train['tail'].apply(lambda x: isinstance(x, float))])

Unnamed: 0,head,relation,tail,verbs_head,verbs_tail
6000,PersonX affects PersonY's health,oReact,,,
6018,PersonX affects PersonY's health,xReact,,,
6048,PersonX affects PersonY's life,xReact,,,
17115,PersonX asks PersonY to let,oReact,,,
36675,PersonX binds together the ___,xReact,,,
...,...,...,...,...,...
1063998,PersonX uses PersonX's internet,HinderedBy,,,
1064418,PersonX hears a knock,HinderedBy,,,
1064740,PersonX plays catch,HinderedBy,,,
1064868,PersonX shows it to PersonY's parents,HinderedBy,,,


In [68]:
nlp = spacy.load('en_core_web_sm')

atomic_train['verbs_head'] = None
atomic_train['verbs_tail'] = None
for row in tqdm(atomic_train.itertuples(), total=atomic_train_length):
    if (not isinstance(row.head, str)) or (not isinstance(row.tail, str)):
        continue

    col1 = nlp(row.head)
    col2 = nlp(row.tail)

    tmp1 = set([token.lemma_ for token in col1 if token.pos_ == 'VERB'])
    tmp2 = set([token.lemma_ for token in col2 if token.pos_ == 'VERB'])

    atomic_train.at[row.Index, 'verbs_head'] = tmp1
    atomic_train.at[row.Index, 'verbs_tail'] = tmp2

display(atomic_train.head())

  1%|          | 8760/1076880 [00:46<1:34:11, 189.01it/s]


KeyboardInterrupt: 

In [59]:
nlp = spacy.load('en_core_web_sm')

atomic_dev['verbs_head'] = None
atomic_dev['verbs_tail'] = None
for row in tqdm(atomic_dev.itertuples(), total=atomic_dev_length):
    if (not isinstance(row.head, str)) or (not isinstance(row.tail, str)):
        continue
    
    col1 = nlp(row.head)
    col2 = nlp(row.tail)

    tmp1 = set([token.lemma_ for token in col1 if token.pos_ == 'VERB'])
    tmp2 = set([token.lemma_ for token in col2 if token.pos_ == 'VERB'])

    atomic_dev.at[row.Index, 'verbs_head'] = tmp1
    atomic_dev.at[row.Index, 'verbs_tail'] = tmp2

100%|██████████| 102024/102024 [00:00<00:00, 1803848.11it/s]


In [60]:
nlp = spacy.load('en_core_web_sm')

atomic_test['verbs_head'] = None
atomic_test['verbs_tail'] = None
for row in tqdm(atomic_test.itertuples(), total=atomic_test_length):
    if (not isinstance(row.head, str)) or (not isinstance(row.tail, str)):
        continue
    
    col1 = nlp(row.head)
    col2 = nlp(row.tail)

    tmp1 = set([token.lemma_ for token in col1 if token.pos_ == 'VERB'])
    tmp2 = set([token.lemma_ for token in col2 if token.pos_ == 'VERB'])

    atomic_test.at[row.Index, 'verbs_head'] = tmp1
    atomic_test.at[row.Index, 'verbs_tail'] = tmp2

100%|██████████| 152209/152209 [00:00<00:00, 1810914.71it/s]


In [62]:
atomic_train['match'] = None
for row in tqdm(atomic_train.itertuples(), total=atomic_train_length):
    if not isinstance(piqa_verbs, set):
        continue

    if (row.verbs_head & piqa_verbs) or (row.verbs_tail & piqa_verbs):
        atomic_train.at[row.Index, 'match'] = True
        
display(atomic_train[atomic_train['match'] == True].head())

  0%|          | 0/1076880 [00:00<?, ?it/s]


TypeError: unsupported operand type(s) for &: 'NoneType' and 'set'

In [None]:
atomic_dev['match'] = None
for row in tqdm(atomic_dev.itertuples(), total=atomic_length):
    if not isinstance(piqa_verbs, set):
        continue
    
    if (row.verbs_head & piqa_verbs) or (row.verbs_tail & piqa_verbs):
        atomic_dev.at[row.Index, 'match'] = True

display(atomic_dev[atomic_dev['match'] == True].head())

In [None]:
atomic_test['match'] = None
for row in tqdm(atomic_test.itertuples(), total=atomic_length):
    if not isinstance(piqa_verbs, set):
        continue
    
    if (row.verbs_head & piqa_verbs) or (row.verbs_tail & piqa_verbs):
        atomic_test.at[row.Index, 'match'] = True

display(atomic_test[atomic_test['match'] == True].head())

In [None]:
atomic_train[atomic_train['match'] == True][['head', 'relation', 'tail']].to_csv(
    "./output/atomic_train_match.tsv", sep='\t', index=False, header=False)

In [None]:
atomic_dev[atomic_dev['match'] == True][['head', 'relation', 'tail']].to_csv(
    "./output/atomic_dev_match.tsv", sep='\t', index=False, header=False)

In [None]:
atomic_test[atomic_test['match'] == True][['head', 'relation', 'tail']].to_csv(
    "./output/atomic_test_match.tsv", sep='\t', index=False, header=False)