# 1. Extract Verb in PIQA Dataset

In [None]:
import pandas as pd
import json

import spacy
from tqdm import tqdm

## 1.1 Open PIQA Dataset

In [None]:
tmp = []
with open('PIQA/train.jsonl') as f:
    for obj in f:
        tmp.append(json.loads(obj))

piqa_train = pd.DataFrame(tmp)
display(piqa_train.head())

Unnamed: 0,goal,sol1,sol2
0,"When boiling butter, when it's ready, you can",Pour it onto a plate,Pour it into a jar
1,"To permanently attach metal legs to a chair, y...",Weld the metal together to get it to stay firm...,Nail the metal together to get it to stay firm...
2,how do you indent something?,leave a space before starting the writing,press the spacebar
3,how do you shake something?,move it up and down and side to side quickly.,stir it very quickly.
4,Clean tires,"Pour water, cape off caked on dirt. Use speed...","Pour water, scrape off caked on dirt. Use a st..."


## 1.2 Extract PIQA Verbs

```$ python -m spacy download en_core_web_sm```

```$ python -m spacy download en_core_web_trf```

In [None]:
piqa_length = len(piqa_train)
print(piqa_length)

16113


In [40]:
nlp = spacy.load('en_core_web_sm')

piqa_verbs_sm = set()
for row in tqdm(piqa_train.itertuples(), total=piqa_length):
    col1 = nlp(row.goal)
    col2 = nlp(row.sol1)
    col3 = nlp(row.sol2)
    # col = nlp(row.goal + ' ' + row.sol1 + ' ' + row.sol2)
    
    piqa_verbs_sm.update([token.lemma_ for token in col1 if token.pos_ == 'VERB'])
    piqa_verbs_sm.update([token.lemma_ for token in col2 if token.pos_ == 'VERB'])
    piqa_verbs_sm.update([token.lemma_ for token in col3 if token.pos_ == 'VERB'])
    

100%|██████████| 16113/16113 [03:38<00:00, 73.86it/s]


In [41]:
print(len(piqa_verbs_sm))

2802


In [42]:
# import spacy
# from tqdm import tqdm

# nlp = spacy.load('en_core_web_trf')

# piqa_verbs_trf = set()
# for row in tqdm(piqa_train.itertuples(), total=piqa_length):
#     col1 = nlp(row.goal)
#     col2 = nlp(row.sol1)
#     col3 = nlp(row.sol2)

#     piqa_verbs_trf.update([token.lemma_ for token in col1 if token.pos_ == 'VERB'])
#     piqa_verbs_trf.update([token.lemma_ for token in col2 if token.pos_ == 'VERB'])
#     piqa_verbs_trf.update([token.lemma_ for token in col3 if token.pos_ == 'VERB'])

## 1.2.1 Compare ```en_core_web_sm``` and ```en_core_web_trf```

In [57]:
for i in tqdm(range(len(piqa_verbs_sm))):
    print(piqa_verbs_sm['verbs'][i] ^ piqa_verbs_trf['verbs'][i])

TypeError: 'set' object is not subscriptable

# 2. Collect ATOMIC which including PIQA-Verb

## 2.1 Open ATOMIC Dataset

In [43]:
colnames = ['head', 'relation', 'tail']
atomic_train = pd.read_csv('atomic2020/train.tsv', sep='\t', names=colnames, header=None)
display(atomic_train.head())

Unnamed: 0,head,relation,tail
0,PersonX abandons ___ altogether,oEffect,none
1,PersonX abandons ___ altogether,oEffect,none
2,PersonX abandons ___ altogether,oReact,dejected
3,PersonX abandons ___ altogether,oWant,none
4,PersonX abandons ___ altogether,oWant,none


## 2.2 Extract ATOMIC Verbs

In [44]:
atomic_length = len(atomic_train)
print(atomic_length)

1076880


In [49]:
nlp = spacy.load('en_core_web_sm')

atomic_train['verbs'] = None
for row in tqdm(atomic_train.itertuples(), total=atomic_length):
    col1 = nlp(row.head)
    col2 = nlp(row.tail)
    # col = nlp(row.head + ' ' + row.tail)

    tmp = set()
    tmp.update([token.lemma_ for token in col1 if token.pos_ == 'VERB'])
    tmp.update([token.lemma_ for token in col2 if token.pos_ == 'VERB'])

    atomic_train.at[row.Index, 'verbs'] = tmp

  0%|          | 1423/1076880 [00:07<1:35:55, 186.86it/s]


KeyboardInterrupt: 

In [50]:
atomic_train['match'] = None
for row in tqdm(atomic_train.itertuples(), total=atomic_length):
    if row.verbs & piqa_verbs_sm:
        atomic_train.at[row.Index, 'match'] = True

# query 사용 방법 알아내기

  0%|          | 1423/1076880 [00:00<00:04, 215282.59it/s]


TypeError: unsupported operand type(s) for &: 'NoneType' and 'set'

In [53]:
display(atomic_train[atomic_train['match'] == True].head())

Unnamed: 0,head,relation,tail,verbs,match
5,PersonX abandons ___ altogether,oWant,to find a new job for him,{find},True
6,PersonX abandons ___ altogether,oWant,to support him,{support},True
13,PersonX abandons ___ altogether,xEffect,gets a reputation as a quitter,{get},True
14,PersonX abandons ___ altogether,xEffect,hangs head in shame,{hang},True
15,PersonX abandons ___ altogether,xEffect,Begins the process of change,{begin},True
