# 1. Collect PIQA-Verb

In [2]:
import pandas as pd
import json

import spacy
from tqdm import tqdm

## 1.1 Load PIQA Dataset

### 1.1.1 Load PIQA - Goal and Solution

In [87]:
tmp = []
with open('./PIQA/train.jsonl') as f:
    for obj in f:
        tmp.append(json.loads(obj))

with open('./PIQA/valid.jsonl') as f:
    for obj in f:
        tmp.append(json.loads(obj))

piqa = pd.DataFrame(tmp)
display(piqa.head())


Unnamed: 0,goal,sol1,sol2
0,"When boiling butter, when it's ready, you can",Pour it onto a plate,Pour it into a jar
1,"To permanently attach metal legs to a chair, you can",Weld the metal together to get it to stay firmly in place,Nail the metal together to get it to stay firmly in place
2,how do you indent something?,leave a space before starting the writing,press the spacebar
3,how do you shake something?,move it up and down and side to side quickly.,stir it very quickly.
4,Clean tires,"Pour water, cape off caked on dirt. Use speed wool to clean out crevices and sparrow spaces.","Pour water, scrape off caked on dirt. Use a steel wool to clean out crevices and narrow spaces."


### 1.1.2 Load PIQA - Correct Solution

In [88]:
answer = []
with open('./PIQA/train-labels.lst') as f:
    answer += f.read().splitlines()

with open('./PIQA/valid-labels.lst') as f:
    answer += f.read().splitlines()

piqa['answer'] = answer # If answer is 0, sol1 is correct, else sol2 is correct
piqa['sol'] = piqa.apply(lambda x: x['sol1'] if x['answer'] == '0' else x['sol2'], axis=1)
piqa.drop(['sol1', 'sol2', 'answer'], axis=1, inplace=True)

display(piqa.head())

Unnamed: 0,goal,sol
0,"When boiling butter, when it's ready, you can",Pour it into a jar
1,"To permanently attach metal legs to a chair, you can",Weld the metal together to get it to stay firmly in place
2,how do you indent something?,leave a space before starting the writing
3,how do you shake something?,move it up and down and side to side quickly.
4,Clean tires,"Pour water, scrape off caked on dirt. Use a steel wool to clean out crevices and narrow spaces."


### 1.1.3 Merge Goal and Solution with Preprocessing

In [89]:
# Split the string by '.'.
# If not available, return the original string.
# Then capitalize the first letter of each sentences.
# Join the sentences by '. '.
def preprocess(text):
    return '. '.join([s.capitalize() for s in text.split('.') if s])
    
piqa['goal'] = piqa['goal'].apply(preprocess)
piqa['sol'] = piqa['sol'].apply(preprocess)

display(piqa.head())

Unnamed: 0,goal,sol
0,"When boiling butter, when it's ready, you can",Pour it into a jar
1,"To permanently attach metal legs to a chair, you can",Weld the metal together to get it to stay firmly in place
2,How do you indent something?,Leave a space before starting the writing
3,How do you shake something?,Move it up and down and side to side quickly
4,Clean tires,"Pour water, scrape off caked on dirt. use a steel wool to clean out crevices and narrow spaces"


In [90]:
# Merge goal and sol
piqa['text'] = piqa.apply(lambda x: 'Q: ' + x['goal'] + ' / A: ' + x['sol'], axis=1)

piqa.drop(['goal', 'sol'], axis=1, inplace=True)

pd.set_option('display.max_colwidth', -1)
display(piqa.head())

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,text
0,"Q: When boiling butter, when it's ready, you can / A: Pour it into a jar"
1,"Q: To permanently attach metal legs to a chair, you can / A: Weld the metal together to get it to stay firmly in place"
2,Q: How do you indent something? / A: Leave a space before starting the writing
3,Q: How do you shake something? / A: Move it up and down and side to side quickly
4,"Q: Clean tires / A: Pour water, scrape off caked on dirt. use a steel wool to clean out crevices and narrow spaces"


## 1.2 Collect PIQA-Verb

```$ python -m spacy download en_core_web_trf```

In [91]:
piqa_length = len(piqa)
print(piqa_length)

17951


### 1.2.1 Collect PIQA-Verb by ```en_core_web_trf``` in spaCy

In [92]:
nlp = spacy.load('en_core_web_trf')

piqa['verbs'] = None
piqa_verb = set()

for row in tqdm(piqa.itertuples(), total=piqa_length):
    col = nlp(row.text)
    
    tmp = set([token.lemma_ for token in col if token.pos_ == 'VERB'])
    
    piqa.at[row.Index, 'verbs'] = tmp
    piqa_verb.update(tmp)

print(len(piqa_verb))

100%|██████████| 17951/17951 [15:18<00:00, 19.54it/s]

2389





In [95]:
piqa_df = pd.DataFrame(piqa)
piqa_df.to_csv('./outputs/PIQA-Verb/PIQA.csv', index=False)

OSError: Cannot save file into a non-existent directory: 'PIQA-Verb'

### 1.2.2 Save PIQA-Verb by version

In [94]:
piqa_verb_df = pd.DataFrame(piqa_verb)
piqa_verb_df.to_csv("./outputs/PIQA-Verb/PIQA-Verb.txt", index=False, header=False)

print(len(piqa_verb_df))

2389
