# 3. Collect ATOMIC-Extract

In [None]:
import pandas as pd
import json

import spacy
from tqdm import tqdm

## 3.1 Load ATOMIC Dataset

In [None]:
# The file is read as headerless (header=None), so the three column names
# are supplied explicitly.
colnames = ['head', 'relation', 'tail']
atomic_train = pd.read_csv(
    './atomic2020/train.tsv',
    sep='\t',
    header=None,
    names=colnames,
)
# Preview the first rows of the training split.
display(atomic_train.head())

In [None]:
# Dev split: tab-separated, read as headerless with the column names held in `colnames`.
atomic_dev = pd.read_csv(
    './atomic2020/dev.tsv',
    sep='\t',
    header=None,
    names=colnames,
)
# Preview the first rows of the dev split.
display(atomic_dev.head())

In [None]:
# Test split: tab-separated, read as headerless with the column names held in `colnames`.
atomic_test = pd.read_csv(
    './atomic2020/test.tsv',
    sep='\t',
    header=None,
    names=colnames,
)
# Preview the first rows of the test split.
display(atomic_test.head())

## 3.2 Load PIQA-Verb

In [None]:
# Load the PIQA verb list (one entry per line) into a set for fast membership tests.
# The file is opened in a `with` block so the handle is closed as soon as the
# set has been built, instead of being left open for the life of the kernel.
with open('./outputs/PIQA-Verb/all.txt') as piqa_verb_file:
    piqa_verbs = {line.strip() for line in piqa_verb_file}
print(len(piqa_verbs))

## 3.3 Collect ATOMIC-Extract

### 3.3.1 Find ATOMIC Verbs with spaCy's `en_core_web_trf`

In [None]:
# Number of rows in the training split; reused later as the progress-bar total.
atomic_train_length = atomic_train.shape[0]
print(atomic_train_length)

In [None]:
# Number of rows in the dev split; reused later as the progress-bar total.
atomic_dev_length = atomic_dev.shape[0]
print(atomic_dev_length)

In [None]:
# Number of rows in the test split; reused later as the progress-bar total.
atomic_test_length = atomic_test.shape[0]
print(atomic_test_length)

In [None]:
# Show the training rows whose `tail` value is a float rather than a string
# (presumably missing values that pandas parsed as NaN -- confirm against the data).
tail_is_float = atomic_train['tail'].apply(lambda value: isinstance(value, float))
display(atomic_train[tail_is_float])

In [None]:
# Load the spaCy pipeline once; the dev and test cells reuse this `nlp` object.
nlp = spacy.load('en_core_web_trf')

# Both verb columns start out as None; rows skipped in the loop keep that value.
atomic_train['verbs_head'] = None
atomic_train['verbs_tail'] = None

for record in tqdm(atomic_train.itertuples(), total=atomic_train_length):
    head_text, tail_text = record.head, record.tail

    # Only rows where both head and tail are strings are parsed.
    if not (isinstance(head_text, str) and isinstance(tail_text, str)):
        continue

    head_doc = nlp(head_text)
    tail_doc = nlp(tail_text)

    # Unique lemmas of the tokens that spaCy tagged as VERB.
    head_verbs = {tok.lemma_ for tok in head_doc if tok.pos_ == 'VERB'}
    tail_verbs = {tok.lemma_ for tok in tail_doc if tok.pos_ == 'VERB'}

    atomic_train.at[record.Index, 'verbs_head'] = head_verbs
    atomic_train.at[record.Index, 'verbs_tail'] = tail_verbs

display(atomic_train.head())

In [None]:
# Both verb columns start out as None; rows skipped in the loop keep that value.
# `nlp` is the spaCy pipeline loaded in the training-split cell.
atomic_dev['verbs_head'] = None
atomic_dev['verbs_tail'] = None

for record in tqdm(atomic_dev.itertuples(), total=atomic_dev_length):
    head_text, tail_text = record.head, record.tail

    # Only rows where both head and tail are strings are parsed.
    if not (isinstance(head_text, str) and isinstance(tail_text, str)):
        continue

    head_doc = nlp(head_text)
    tail_doc = nlp(tail_text)

    # Unique lemmas of the tokens that spaCy tagged as VERB.
    head_verbs = {tok.lemma_ for tok in head_doc if tok.pos_ == 'VERB'}
    tail_verbs = {tok.lemma_ for tok in tail_doc if tok.pos_ == 'VERB'}

    atomic_dev.at[record.Index, 'verbs_head'] = head_verbs
    atomic_dev.at[record.Index, 'verbs_tail'] = tail_verbs

In [None]:
# Both verb columns start out as None; rows skipped in the loop keep that value.
# `nlp` is the spaCy pipeline loaded in the training-split cell.
atomic_test['verbs_head'] = None
atomic_test['verbs_tail'] = None

for record in tqdm(atomic_test.itertuples(), total=atomic_test_length):
    head_text, tail_text = record.head, record.tail

    # Only rows where both head and tail are strings are parsed.
    if not (isinstance(head_text, str) and isinstance(tail_text, str)):
        continue

    head_doc = nlp(head_text)
    tail_doc = nlp(tail_text)

    # Unique lemmas of the tokens that spaCy tagged as VERB.
    head_verbs = {tok.lemma_ for tok in head_doc if tok.pos_ == 'VERB'}
    tail_verbs = {tok.lemma_ for tok in tail_doc if tok.pos_ == 'VERB'}

    atomic_test.at[record.Index, 'verbs_head'] = head_verbs
    atomic_test.at[record.Index, 'verbs_tail'] = tail_verbs

### 3.3.2 Find ATOMIC Data That Includes PIQA-Verb

In [None]:
# Flag each training row: True when its head or tail verbs share at least one
# lemma with `piqa_verbs`, False otherwise. Rows skipped below keep None.
atomic_train['match'] = None
for record in tqdm(atomic_train.itertuples(), total=atomic_train_length):
    head_verbs = record.verbs_head
    tail_verbs = record.verbs_tail

    # Skip the row unless all three operands are sets.
    if not (isinstance(head_verbs, set) and isinstance(tail_verbs, set) and isinstance(piqa_verbs, set)):
        continue

    # "Not disjoint" is the same condition as a non-empty intersection.
    shares_verb = not (head_verbs.isdisjoint(piqa_verbs) and tail_verbs.isdisjoint(piqa_verbs))
    atomic_train.at[record.Index, 'match'] = shares_verb

# Preview the matching rows, then print the match / non-match counts.
matched_mask = atomic_train['match'] == True
display(atomic_train[matched_mask].head())
print(len(atomic_train[matched_mask]))
print(len(atomic_train[atomic_train['match'] == False]))

In [None]:
# Flag each dev row: True when its head or tail verbs share at least one
# lemma with `piqa_verbs`, False otherwise. Rows skipped below keep None.
atomic_dev['match'] = None
for record in tqdm(atomic_dev.itertuples(), total=atomic_dev_length):
    head_verbs = record.verbs_head
    tail_verbs = record.verbs_tail

    # Skip the row unless all three operands are sets.
    if not (isinstance(head_verbs, set) and isinstance(tail_verbs, set) and isinstance(piqa_verbs, set)):
        continue

    # "Not disjoint" is the same condition as a non-empty intersection.
    shares_verb = not (head_verbs.isdisjoint(piqa_verbs) and tail_verbs.isdisjoint(piqa_verbs))
    atomic_dev.at[record.Index, 'match'] = shares_verb

# Preview the matching rows, then print the match / non-match counts.
matched_mask = atomic_dev['match'] == True
display(atomic_dev[matched_mask].head())
print(len(atomic_dev[matched_mask]))
print(len(atomic_dev[atomic_dev['match'] == False]))

In [None]:
# Flag each test row: True when its head or tail verbs share at least one
# lemma with `piqa_verbs`, False otherwise. Rows skipped below keep None.
atomic_test['match'] = None
for record in tqdm(atomic_test.itertuples(), total=atomic_test_length):
    head_verbs = record.verbs_head
    tail_verbs = record.verbs_tail

    # Skip the row unless all three operands are sets.
    if not (isinstance(head_verbs, set) and isinstance(tail_verbs, set) and isinstance(piqa_verbs, set)):
        continue

    # "Not disjoint" is the same condition as a non-empty intersection.
    shares_verb = not (head_verbs.isdisjoint(piqa_verbs) and tail_verbs.isdisjoint(piqa_verbs))
    atomic_test.at[record.Index, 'match'] = shares_verb

# Preview the matching rows, then print the match / non-match counts.
matched_mask = atomic_test['match'] == True
display(atomic_test[matched_mask].head())
print(len(atomic_test[matched_mask]))
print(len(atomic_test[atomic_test['match'] == False]))

### 3.3.3 Save ATOMIC-Extract

In [None]:
# Write the matching training rows (original three columns only) as a
# tab-separated file with no header row and no index column.
train_extract = atomic_train.loc[atomic_train['match'] == True, ['head', 'relation', 'tail']]
train_extract.to_csv(
    "./outputs/ATOMIC-Extract/train.tsv",
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Write the matching dev rows (original three columns only) as a
# tab-separated file with no header row and no index column.
dev_extract = atomic_dev.loc[atomic_dev['match'] == True, ['head', 'relation', 'tail']]
dev_extract.to_csv(
    "./outputs/ATOMIC-Extract/dev.tsv",
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Write the matching test rows (original three columns only) as a
# tab-separated file with no header row and no index column.
test_extract = atomic_test.loc[atomic_test['match'] == True, ['head', 'relation', 'tail']]
test_extract.to_csv(
    "./outputs/ATOMIC-Extract/test.tsv",
    sep='\t',
    header=False,
    index=False,
)

### 3.3.4 Save ATOMIC-Extract-Full

In [None]:
def _stable_set_repr(value):
    """Render a set in the same text form as ``repr`` but with sorted elements.

    Set iteration order for strings depends on per-process hash randomization,
    so writing the raw sets would make the saved file differ between runs.
    Non-set values are returned unchanged.
    """
    if not isinstance(value, set):
        return value
    if not value:
        return 'set()'  # same text that repr() gives for an empty set
    return '{' + ', '.join(repr(item) for item in sorted(value)) + '}'


# Matching training rows with the extracted verb sets, written as a
# tab-separated file with no header row and no index column.
train_full = atomic_train.loc[
    atomic_train['match'] == True,
    ['head', 'relation', 'tail', 'verbs_head', 'verbs_tail'],
].copy()
for verbs_column in ('verbs_head', 'verbs_tail'):
    # Serialize each set deterministically before it reaches to_csv.
    train_full[verbs_column] = train_full[verbs_column].map(_stable_set_repr)
train_full.to_csv(
    "./outputs/ATOMIC-Extract/train_full.tsv", sep='\t', index=False, header=False)

In [None]:
def _stable_set_repr(value):
    """Render a set in the same text form as ``repr`` but with sorted elements.

    Set iteration order for strings depends on per-process hash randomization,
    so writing the raw sets would make the saved file differ between runs.
    Non-set values are returned unchanged.
    """
    if not isinstance(value, set):
        return value
    if not value:
        return 'set()'  # same text that repr() gives for an empty set
    return '{' + ', '.join(repr(item) for item in sorted(value)) + '}'


# Matching dev rows with the extracted verb sets, written as a
# tab-separated file with no header row and no index column.
dev_full = atomic_dev.loc[
    atomic_dev['match'] == True,
    ['head', 'relation', 'tail', 'verbs_head', 'verbs_tail'],
].copy()
for verbs_column in ('verbs_head', 'verbs_tail'):
    # Serialize each set deterministically before it reaches to_csv.
    dev_full[verbs_column] = dev_full[verbs_column].map(_stable_set_repr)
dev_full.to_csv(
    "./outputs/ATOMIC-Extract/dev_full.tsv", sep='\t', index=False, header=False)

In [None]:
def _stable_set_repr(value):
    """Render a set in the same text form as ``repr`` but with sorted elements.

    Set iteration order for strings depends on per-process hash randomization,
    so writing the raw sets would make the saved file differ between runs.
    Non-set values are returned unchanged.
    """
    if not isinstance(value, set):
        return value
    if not value:
        return 'set()'  # same text that repr() gives for an empty set
    return '{' + ', '.join(repr(item) for item in sorted(value)) + '}'


# Matching test rows with the extracted verb sets, written as a
# tab-separated file with no header row and no index column.
test_full = atomic_test.loc[
    atomic_test['match'] == True,
    ['head', 'relation', 'tail', 'verbs_head', 'verbs_tail'],
].copy()
for verbs_column in ('verbs_head', 'verbs_tail'):
    # Serialize each set deterministically before it reaches to_csv.
    test_full[verbs_column] = test_full[verbs_column].map(_stable_set_repr)
test_full.to_csv(
    "./outputs/ATOMIC-Extract/test_full.tsv", sep='\t', index=False, header=False)