# 1. Extract Verbs from the PIQA Dataset

In [None]:
import pandas as pd
import json

import spacy
from tqdm import tqdm

## 1.1 Open PIQA Dataset

In [None]:
# Load the PIQA training split: one JSON object per line -> one DataFrame row.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open('./PIQA/train.jsonl', encoding='utf-8') as f:
    # Skip whitespace-only lines, which json.loads would reject.
    tmp = [json.loads(obj) for obj in f if obj.strip()]

piqa_train = pd.DataFrame(tmp)
display(piqa_train.head())

In [None]:
# Load the PIQA validation split: one JSON object per line -> one DataFrame row.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open('./PIQA/valid.jsonl', encoding='utf-8') as f:
    # Skip whitespace-only lines, which json.loads would reject.
    tmp = [json.loads(obj) for obj in f if obj.strip()]

piqa_valid = pd.DataFrame(tmp)
display(piqa_valid.head())

In [None]:
# Load the PIQA test split: one JSON object per line -> one DataFrame row.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open('./PIQA/tests.jsonl', encoding='utf-8') as f:
    # Skip whitespace-only lines, which json.loads would reject.
    tmp = [json.loads(obj) for obj in f if obj.strip()]

piqa_tests = pd.DataFrame(tmp)
display(piqa_tests.head())

## 1.2 Extract PIQA Verbs

```$ python -m spacy download en_core_web_sm```

```$ python -m spacy download en_core_web_trf```

In [None]:
# Row count of the training split; reused as the progress-bar total below.
piqa_train_length = piqa_train.shape[0]
print(piqa_train_length)

In [None]:
# Row count of the validation split; reused as the progress-bar total below.
piqa_valid_length = piqa_valid.shape[0]
print(piqa_valid_length)

In [None]:
# Row count of the test split; reused as the progress-bar total below.
piqa_tests_length = piqa_tests.shape[0]
print(piqa_tests_length)

### 1.2.1 Extract Verbs by ```en_core_web_sm```

In [None]:
# Collect the lemmas of every token tagged VERB by the `en_core_web_sm`
# pipeline in each training example (goal plus both candidate solutions).
nlp = spacy.load('en_core_web_sm')

piqa_train_verbs = set()      # distinct verb lemmas across the whole split
piqa_train['verbs'] = None    # per-row verb sets are filled in by the loop

for example in tqdm(piqa_train.itertuples(), total=piqa_train_length):
    example_verbs = set()
    for text in (example.goal, example.sol1, example.sol2):
        example_verbs.update(
            token.lemma_ for token in nlp(text) if token.pos_ == 'VERB'
        )

    piqa_train.at[example.Index, 'verbs'] = example_verbs
    piqa_train_verbs |= example_verbs

print(len(piqa_train_verbs))

In [None]:
# Collect the lemmas of every token tagged VERB in each validation example
# (goal plus both candidate solutions). Reuses the `nlp` pipeline loaded in
# an earlier cell.
piqa_valid_verbs = set()      # distinct verb lemmas across the whole split
piqa_valid['verbs'] = None    # per-row verb sets are filled in by the loop

for example in tqdm(piqa_valid.itertuples(), total=piqa_valid_length):
    example_verbs = set()
    for text in (example.goal, example.sol1, example.sol2):
        example_verbs.update(
            token.lemma_ for token in nlp(text) if token.pos_ == 'VERB'
        )

    piqa_valid.at[example.Index, 'verbs'] = example_verbs
    piqa_valid_verbs |= example_verbs

print(len(piqa_valid_verbs))

In [None]:
# Collect the lemmas of every token tagged VERB in each test example
# (goal plus both candidate solutions). Reuses the `nlp` pipeline loaded in
# an earlier cell.
piqa_tests_verbs = set()      # distinct verb lemmas across the whole split
piqa_tests['verbs'] = None    # per-row verb sets are filled in by the loop

for example in tqdm(piqa_tests.itertuples(), total=piqa_tests_length):
    example_verbs = set()
    for text in (example.goal, example.sol1, example.sol2):
        example_verbs.update(
            token.lemma_ for token in nlp(text) if token.pos_ == 'VERB'
        )

    piqa_tests.at[example.Index, 'verbs'] = example_verbs
    piqa_tests_verbs |= example_verbs

print(len(piqa_tests_verbs))

### 1.2.2 Save Extracted Verbs

In [None]:
# Write the distinct training-split verb lemmas, one per line.
# A set iterates in arbitrary order (string hashes are randomized per
# process), so sort first to make the output file reproducible across runs.
piqa_train_verbs_df = pd.DataFrame(sorted(piqa_train_verbs))
piqa_train_verbs_df.to_csv("./output/piqa_train_verbs.txt", index=False, header=False)

In [None]:
# Write the distinct validation-split verb lemmas, one per line.
# A set iterates in arbitrary order (string hashes are randomized per
# process), so sort first to make the output file reproducible across runs.
piqa_valid_verbs_df = pd.DataFrame(sorted(piqa_valid_verbs))
piqa_valid_verbs_df.to_csv("./output/piqa_valid_verbs.txt", index=False, header=False)

In [None]:
# Write the distinct test-split verb lemmas, one per line.
# A set iterates in arbitrary order (string hashes are randomized per
# process), so sort first to make the output file reproducible across runs.
piqa_tests_verbs_df = pd.DataFrame(sorted(piqa_tests_verbs))
piqa_tests_verbs_df.to_csv("./output/piqa_tests_verbs.txt", index=False, header=False)

In [None]:
# Merge the per-split verb vocabularies into one set covering all of PIQA.
piqa_verbs = set().union(piqa_train_verbs, piqa_valid_verbs, piqa_tests_verbs)

print(len(piqa_verbs))

In [None]:
# Write the combined PIQA verb vocabulary, one lemma per line.
# A set iterates in arbitrary order (string hashes are randomized per
# process), so sort first to make the output file reproducible across runs.
piqa_verbs_df = pd.DataFrame(sorted(piqa_verbs))
piqa_verbs_df.to_csv("./output/piqa_verbs.txt", index=False, header=False)

# 2. Collect ATOMIC Triples That Include PIQA Verbs

## 2.1 Open ATOMIC Dataset

In [None]:
# Read the training triples from a headerless, tab-separated file and label
# the three columns explicitly.
colnames = ['head', 'relation', 'tail']
atomic_train = pd.read_csv(
    './atomic2020/train.tsv',
    sep='\t',
    header=None,
    names=colnames,
)
display(atomic_train.head())

In [None]:
# Read the dev triples from a headerless, tab-separated file, using the
# column names defined in an earlier cell (`colnames`).
atomic_dev = pd.read_csv(
    './atomic2020/dev.tsv',
    sep='\t',
    header=None,
    names=colnames,
)
display(atomic_dev.head())

In [None]:
# Read the test triples from a headerless, tab-separated file, using the
# column names defined in an earlier cell (`colnames`).
atomic_test = pd.read_csv(
    './atomic2020/test.tsv',
    sep='\t',
    header=None,
    names=colnames,
)
display(atomic_test.head())

## 2.2 Load PIQA Verbs

In [None]:
# Reload the combined PIQA verb vocabulary (one lemma per line) from disk.
# The file is opened in a `with` block so the handle is closed promptly, and
# decoded as UTF-8, which is what pandas' to_csv writes by default.
with open('./output/piqa_verbs.txt', encoding='utf-8') as verbs_file:
    # Drop blank lines so an empty string is never treated as a verb.
    piqa_verbs = {line.strip() for line in verbs_file if line.strip()}
print(len(piqa_verbs))

## 2.3 Extract ATOMIC Verbs

In [None]:
# Row count of the ATOMIC training split; reused as the progress-bar total.
atomic_train_length = atomic_train.shape[0]
print(atomic_train_length)

In [None]:
# Row count of the ATOMIC dev split; reused as the progress-bar total.
atomic_dev_length = atomic_dev.shape[0]
print(atomic_dev_length)

In [None]:
# Row count of the ATOMIC test split; reused as the progress-bar total.
atomic_test_length = atomic_test.shape[0]
print(atomic_test_length)

In [None]:
# Show the training rows whose tail is a float rather than text
# (presumably NaN parsed from a missing field -- confirm against the data).
tail_is_float = atomic_train['tail'].map(lambda value: isinstance(value, float))
display(atomic_train[tail_is_float])

In [None]:
# Tag each training triple with the verb lemmas found in its head and tail.
nlp = spacy.load('en_core_web_sm')

atomic_train['verbs_head'] = None
atomic_train['verbs_tail'] = None
for triple in tqdm(atomic_train.itertuples(), total=atomic_train_length):
    # Rows whose head or tail is not a string are skipped and keep None in
    # both verb columns.
    if not (isinstance(triple.head, str) and isinstance(triple.tail, str)):
        continue

    # Collapse the PersonX / PersonY placeholders into the plain word
    # 'Person' before parsing the head.
    head_text = triple.head.replace('PersonX', 'Person').replace('PersonY', 'Person')
    head_verbs = {token.lemma_ for token in nlp(head_text) if token.pos_ == 'VERB'}
    tail_verbs = {token.lemma_ for token in nlp(triple.tail) if token.pos_ == 'VERB'}

    atomic_train.at[triple.Index, 'verbs_head'] = head_verbs
    atomic_train.at[triple.Index, 'verbs_tail'] = tail_verbs

display(atomic_train.head())

In [None]:
# Tag each dev triple with the verb lemmas found in its head and tail.
nlp = spacy.load('en_core_web_sm')

atomic_dev['verbs_head'] = None
atomic_dev['verbs_tail'] = None
for triple in tqdm(atomic_dev.itertuples(), total=atomic_dev_length):
    # Rows whose head or tail is not a string are skipped and keep None in
    # both verb columns.
    if not (isinstance(triple.head, str) and isinstance(triple.tail, str)):
        continue

    # Collapse the PersonX / PersonY placeholders into the plain word
    # 'Person' before parsing the head.
    head_text = triple.head.replace('PersonX', 'Person').replace('PersonY', 'Person')
    head_verbs = {token.lemma_ for token in nlp(head_text) if token.pos_ == 'VERB'}
    tail_verbs = {token.lemma_ for token in nlp(triple.tail) if token.pos_ == 'VERB'}

    atomic_dev.at[triple.Index, 'verbs_head'] = head_verbs
    atomic_dev.at[triple.Index, 'verbs_tail'] = tail_verbs

In [None]:
# Tag each test triple with the verb lemmas found in its head and tail.
nlp = spacy.load('en_core_web_sm')

atomic_test['verbs_head'] = None
atomic_test['verbs_tail'] = None
for triple in tqdm(atomic_test.itertuples(), total=atomic_test_length):
    # Rows whose head or tail is not a string are skipped and keep None in
    # both verb columns.
    if not (isinstance(triple.head, str) and isinstance(triple.tail, str)):
        continue

    # Collapse the PersonX / PersonY placeholders into the plain word
    # 'Person' before parsing the head.
    head_text = triple.head.replace('PersonX', 'Person').replace('PersonY', 'Person')
    head_verbs = {token.lemma_ for token in nlp(head_text) if token.pos_ == 'VERB'}
    tail_verbs = {token.lemma_ for token in nlp(triple.tail) if token.pos_ == 'VERB'}

    atomic_test.at[triple.Index, 'verbs_head'] = head_verbs
    atomic_test.at[triple.Index, 'verbs_tail'] = tail_verbs

In [None]:
# Flag each training triple whose head or tail shares at least one verb
# lemma with the PIQA vocabulary.
atomic_train['match'] = None
for triple in tqdm(atomic_train.itertuples(), total=atomic_train_length):
    operands_are_sets = (
        isinstance(triple.verbs_head, set)
        and isinstance(triple.verbs_tail, set)
        and isinstance(piqa_verbs, set)
    )
    # Rows without extracted verb sets keep `match` as None.
    if not operands_are_sets:
        continue

    shares_verb = bool(triple.verbs_head & piqa_verbs) or bool(triple.verbs_tail & piqa_verbs)
    atomic_train.at[triple.Index, 'match'] = shares_verb

# `match` holds None for skipped rows, so compare against the literal
# booleans rather than relying on truthiness.
train_matched = atomic_train[atomic_train['match'] == True]
train_unmatched = atomic_train[atomic_train['match'] == False]
display(train_matched.head())
print(len(train_matched))
print(len(train_unmatched))

In [None]:
# Flag each dev triple whose head or tail shares at least one verb lemma
# with the PIQA vocabulary.
atomic_dev['match'] = None
for triple in tqdm(atomic_dev.itertuples(), total=atomic_dev_length):
    operands_are_sets = (
        isinstance(triple.verbs_head, set)
        and isinstance(triple.verbs_tail, set)
        and isinstance(piqa_verbs, set)
    )
    # Rows without extracted verb sets keep `match` as None.
    if not operands_are_sets:
        continue

    shares_verb = bool(triple.verbs_head & piqa_verbs) or bool(triple.verbs_tail & piqa_verbs)
    atomic_dev.at[triple.Index, 'match'] = shares_verb

# `match` holds None for skipped rows, so compare against the literal
# booleans rather than relying on truthiness.
dev_matched = atomic_dev[atomic_dev['match'] == True]
dev_unmatched = atomic_dev[atomic_dev['match'] == False]
display(dev_matched.head())
print(len(dev_matched))
print(len(dev_unmatched))

In [None]:
# Flag each test triple whose head or tail shares at least one verb lemma
# with the PIQA vocabulary.
atomic_test['match'] = None
for triple in tqdm(atomic_test.itertuples(), total=atomic_test_length):
    operands_are_sets = (
        isinstance(triple.verbs_head, set)
        and isinstance(triple.verbs_tail, set)
        and isinstance(piqa_verbs, set)
    )
    # Rows without extracted verb sets keep `match` as None.
    if not operands_are_sets:
        continue

    shares_verb = bool(triple.verbs_head & piqa_verbs) or bool(triple.verbs_tail & piqa_verbs)
    atomic_test.at[triple.Index, 'match'] = shares_verb

# `match` holds None for skipped rows, so compare against the literal
# booleans rather than relying on truthiness.
test_matched = atomic_test[atomic_test['match'] == True]
test_unmatched = atomic_test[atomic_test['match'] == False]
display(test_matched.head())
print(len(test_matched))
print(len(test_unmatched))

In [None]:
# Export the matched training triples (head, relation, tail only) as a
# headerless TSV.
train_is_match = atomic_train['match'] == True
atomic_train.loc[train_is_match, ['head', 'relation', 'tail']].to_csv(
    "./output/atomic_train_match.tsv",
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Export the matched dev triples (head, relation, tail only) as a
# headerless TSV.
dev_is_match = atomic_dev['match'] == True
atomic_dev.loc[dev_is_match, ['head', 'relation', 'tail']].to_csv(
    "./output/atomic_dev_match.tsv",
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Export the matched test triples (head, relation, tail only) as a
# headerless TSV.
test_is_match = atomic_test['match'] == True
atomic_test.loc[test_is_match, ['head', 'relation', 'tail']].to_csv(
    "./output/atomic_test_match.tsv",
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Export the matched training rows with every column (including the verb
# sets and the match flag) as a headerless TSV.
atomic_train.loc[atomic_train['match'] == True].to_csv(
    "./output/atomic_train_match_full.tsv",
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Export the matched dev rows with every column (including the verb sets
# and the match flag) as a headerless TSV.
atomic_dev.loc[atomic_dev['match'] == True].to_csv(
    "./output/atomic_dev_match_full.tsv",
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Export the matched test rows with every column (including the verb sets
# and the match flag) as a headerless TSV.
atomic_test.loc[atomic_test['match'] == True].to_csv(
    "./output/atomic_test_match_full.tsv",
    sep='\t',
    header=False,
    index=False,
)