# 1. Collect PIQA-Verb

In [None]:
import pandas as pd
import json

import spacy
from tqdm import tqdm

## 1.1 Load PIQA Dataset

In [None]:
# Read the PIQA training split: each non-empty line is parsed as one JSON object.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale, and blank lines are skipped instead of crashing json.loads.
with open('./PIQA/train.jsonl', encoding='utf-8') as f:
    tmp = [json.loads(line) for line in f if line.strip()]

piqa_train = pd.DataFrame(tmp)
display(piqa_train.head())

In [None]:
# Read the PIQA validation split: each non-empty line is parsed as one JSON object.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale, and blank lines are skipped instead of crashing json.loads.
with open('./PIQA/valid.jsonl', encoding='utf-8') as f:
    tmp = [json.loads(line) for line in f if line.strip()]

piqa_valid = pd.DataFrame(tmp)
display(piqa_valid.head())

In [None]:
# Read the PIQA test split: each non-empty line is parsed as one JSON object.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale, and blank lines are skipped instead of crashing json.loads.
with open('./PIQA/tests.jsonl', encoding='utf-8') as f:
    tmp = [json.loads(line) for line in f if line.strip()]

piqa_tests = pd.DataFrame(tmp)
display(piqa_tests.head())

## 1.2 Collect PIQA-Verb

Install the spaCy model used below before running this section: ```$ python -m spacy download en_core_web_trf```

In [None]:
# Number of rows in the training split.
piqa_train_length = piqa_train.shape[0]
print(piqa_train_length)

In [None]:
# Number of rows in the validation split.
piqa_valid_length = piqa_valid.shape[0]
print(piqa_valid_length)

In [None]:
# Number of rows in the test split.
piqa_tests_length = piqa_tests.shape[0]
print(piqa_tests_length)

### 1.2.1 Collect PIQA-Verb with spaCy's ```en_core_web_trf``` model

In [None]:
# Load spaCy's `en_core_web_trf` pipeline once and bind it to `nlp`.
nlp = spacy.load('en_core_web_trf')

# Per-row verb sets are stored in a new 'verbs' column; the split-wide
# vocabulary accumulates in `piqa_verb_train`.
piqa_train['verbs'] = None
piqa_verb_train = set()

for row in tqdm(piqa_train.itertuples(), total=piqa_train_length):
    # Run the pipeline on the `goal`, `sol1` and `sol2` texts, in that order.
    docs = (nlp(row.goal), nlp(row.sol1), nlp(row.sol2))

    # Lemmas of every token tagged VERB across the three texts.
    row_verbs = {
        token.lemma_
        for doc in docs
        for token in doc
        if token.pos_ == 'VERB'
    }

    piqa_train.at[row.Index, 'verbs'] = row_verbs
    piqa_verb_train |= row_verbs

print(len(piqa_verb_train))

In [None]:
# Per-row verb sets are stored in a new 'verbs' column; the split-wide
# vocabulary accumulates in `piqa_verb_valid`.
piqa_valid['verbs'] = None
piqa_verb_valid = set()

for row in tqdm(piqa_valid.itertuples(), total=piqa_valid_length):
    # Run the pipeline on the `goal`, `sol1` and `sol2` texts, in that order.
    docs = (nlp(row.goal), nlp(row.sol1), nlp(row.sol2))

    # Lemmas of every token tagged VERB across the three texts.
    row_verbs = {
        token.lemma_
        for doc in docs
        for token in doc
        if token.pos_ == 'VERB'
    }

    piqa_valid.at[row.Index, 'verbs'] = row_verbs
    piqa_verb_valid |= row_verbs

print(len(piqa_verb_valid))

In [None]:
# Per-row verb sets are stored in a new 'verbs' column; the split-wide
# vocabulary accumulates in `piqa_verb_tests`.
piqa_tests['verbs'] = None
piqa_verb_tests = set()

for row in tqdm(piqa_tests.itertuples(), total=piqa_tests_length):
    # Run the pipeline on the `goal`, `sol1` and `sol2` texts, in that order.
    docs = (nlp(row.goal), nlp(row.sol1), nlp(row.sol2))

    # Lemmas of every token tagged VERB across the three texts.
    row_verbs = {
        token.lemma_
        for doc in docs
        for token in doc
        if token.pos_ == 'VERB'
    }

    piqa_tests.at[row.Index, 'verbs'] = row_verbs
    piqa_verb_tests |= row_verbs

print(len(piqa_verb_tests))

### 1.2.2 Save PIQA-Verb per split (train / valid / tests)

In [None]:
# Persist the training-split verb vocabulary, one row per lemma.
# The set is sorted first: string hashing is randomized per interpreter run,
# so writing the set in iteration order would make the file differ between runs.
piqa_verb_train_df = pd.DataFrame(sorted(piqa_verb_train))
piqa_verb_train_df.to_csv("./outputs/PIQA-Verb/train.txt", index=False, header=False)

In [None]:
# Persist the validation-split verb vocabulary, one row per lemma.
# Uses `piqa_verb_valid` (the set built during extraction); the previous name
# `piqa_valid_verbs` was never defined and raised NameError.
# The set is sorted first: string hashing is randomized per interpreter run,
# so writing the set in iteration order would make the file differ between runs.
piqa_verb_valid_df = pd.DataFrame(sorted(piqa_verb_valid))
piqa_verb_valid_df.to_csv("./outputs/PIQA-Verb/valid.txt", index=False, header=False)

In [None]:
# Persist the test-split verb vocabulary, one row per lemma.
# The set is sorted first: string hashing is randomized per interpreter run,
# so writing the set in iteration order would make the file differ between runs.
piqa_verb_tests_df = pd.DataFrame(sorted(piqa_verb_tests))
piqa_verb_tests_df.to_csv("./outputs/PIQA-Verb/tests.txt", index=False, header=False)

### 1.2.3 Save PIQA-Verb Full Version

In [None]:
# Full vocabulary: the union of the verb lemmas from all three splits.
piqa_verb = piqa_verb_train | piqa_verb_valid | piqa_verb_tests

print(len(piqa_verb))

In [None]:
# Persist the combined verb vocabulary, one row per lemma.
# The set is sorted first: string hashing is randomized per interpreter run,
# so writing the set in iteration order would make the file differ between runs.
piqa_verbs_df = pd.DataFrame(sorted(piqa_verb))
piqa_verbs_df.to_csv("./outputs/PIQA-Verb/all.txt", index=False, header=False)