# 1. Collect PIQA-Verb

In [None]:
import json
import os

import pandas as pd
import spacy
from tqdm import tqdm

## 1.1 Load PIQA Dataset

### 1.1.1 Load PIQA - Goal and Solution

In [None]:
# Read the train and validation splits (one JSON object per line) into one
# list of records, train first, then build the DataFrame from it.
records = []
for split_path in ('./PIQA/train.jsonl', './PIQA/valid.jsonl'):
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # locale encoding.
    with open(split_path, encoding='utf-8') as f:
        # Blank lines are skipped; json.loads would raise on them.
        records.extend(json.loads(line) for line in f if line.strip())

piqa = pd.DataFrame(records)
display(piqa.head())


### 1.1.2 Load PIQA - Correct Solution

In [None]:
# Gather the gold labels of both splits, train labels first and validation
# labels second, one label string per line.
answer = []
for labels_path in ('./PIQA/train-labels.lst', './PIQA/valid-labels.lst'):
    with open(labels_path) as labels_file:
        answer.extend(labels_file.read().splitlines())


def _pick_correct_solution(row):
    """Return sol1 when the row's label is '0', otherwise sol2."""
    if row['answer'] == '0':
        return row['sol1']
    return row['sol2']


piqa['answer'] = answer
piqa['sol'] = piqa.apply(_pick_correct_solution, axis=1)

# Only the selected solution is kept; the candidates and the label go away.
piqa.drop(columns=['sol1', 'sol2', 'answer'], inplace=True)

display(piqa.head())

### 1.1.3 Merge Goal and Solution with Preprocessing

In [None]:
def preprocess(text):
    """Normalize a string into capitalized, '. '-separated sentences.

    The text is split on '.', every piece is stripped of surrounding
    whitespace, empty pieces are dropped, each remaining piece goes through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased)
    and the pieces are joined back with '. '. Text without any '.' is
    handled as a single sentence. No trailing period is emitted.

    NOTE(review): splitting on every '.' also cuts decimals and
    abbreviations apart ("1.5 cups" -> "1. 5 cups"); confirm that is
    acceptable for this dataset.
    """
    # Strip before capitalizing: every piece after the first starts with the
    # space that followed the period, which kept capitalize() from touching
    # the first letter and produced a double space after the '. ' join.
    pieces = (piece.strip() for piece in text.split('.'))
    return '. '.join(piece.capitalize() for piece in pieces if piece)
    
# Run the same sentence normalization over both text columns.
for column in ('goal', 'sol'):
    piqa[column] = piqa[column].apply(preprocess)

display(piqa.head())

In [None]:
# Merge goal and sol into a single "Q: <goal> / A: <sol>" string per row.
piqa['text'] = 'Q: ' + piqa['goal'] + ' / A: ' + piqa['sol']

piqa.drop(columns=['goal', 'sol'], inplace=True)

# Show full cell contents. None means "no limit"; the former -1 sentinel was
# deprecated in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)
display(piqa.head())

In [None]:
# Eyeball 30 randomly sampled rows.
display(piqa.sample(n=30))

## 1.2 Collect PIQA-Verb

```$ python -m spacy download en_core_web_trf```

In [None]:
# Number of rows currently held in the dataset.
piqa_length = piqa.shape[0]
print(piqa_length)

### 1.2.1 Collect PIQA-Verb by ```en_core_web_trf``` in spaCy

In [None]:
nlp = spacy.load('en_core_web_trf')

piqa['verbs'] = None
piqa_verb = set()

# nlp.pipe streams the texts through the pipeline in batches instead of
# invoking the model once per row; it yields the Docs in input order, so
# zipping with the index keeps every Doc paired with its own row.
docs = nlp.pipe(piqa['text'])
for idx, doc in tqdm(zip(piqa.index, docs), total=piqa_length):
    # Unique lemmas of the tokens tagged as verbs in this row.
    row_verbs = {token.lemma_ for token in doc if token.pos_ == 'VERB'}

    piqa.at[idx, 'verbs'] = row_verbs
    piqa_verb.update(row_verbs)

# Size of the verb vocabulary collected over all rows.
print(len(piqa_verb))

In [None]:
# Create the output folder first so the write does not fail when the
# directory does not exist yet.
os.makedirs('./outputs/PIQA-Verb', exist_ok=True)

# NOTE(review): the 'verbs' column holds sets, which end up in the CSV as
# their string form; element order inside that string is not guaranteed to
# be stable between runs — confirm downstream readers do not depend on it.
piqa_df = pd.DataFrame(piqa)
piqa_df.to_csv('./outputs/PIQA-Verb/PIQA.csv', index=False)

### 1.2.2 Save PIQA-Verb by version

In [None]:
# Create the output folder first so the write does not fail when the
# directory does not exist yet.
os.makedirs('./outputs/PIQA-Verb', exist_ok=True)

# Sort before writing: a set iterates in arbitrary order, which can differ
# from one interpreter run to the next, so the unsorted file was not
# reproducible. One verb lemma is written per line, without a header.
piqa_verb_df = pd.DataFrame(sorted(piqa_verb))
piqa_verb_df.to_csv("./outputs/PIQA-Verb/PIQA-Verb.txt", index=False, header=False)

print(len(piqa_verb_df))