In [48]:
import json
import string
from collections import Counter

import pandas as pd
import nltk
from nltk.corpus import stopwords
# Filter vocabulary: NLTK's English stopword list plus every character in
# string.punctuation. NOTE: this rebinds the name `stopwords`, shadowing the
# `nltk.corpus.stopwords` object imported on the previous line, so the corpus
# reader is no longer reachable under that name once this line has run.
stopwords = set(stopwords.words('english') + list(string.punctuation))

# `tokenization` is a project-local module that is not shown in this notebook
# (presumably the BERT reference tokenizer -- TODO confirm).
import tokenization
tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
# Looks like a list of POS-tag prefixes; it is not referenced by any cell
# visible in this notebook.
allowed_pos = ['JJ', 'NN', 'VB', 'RB']

In [92]:
# Load the training split from a JSON Lines file (one record per line) and
# preview the first rows; the records carry `goal`, `sol1` and `sol2` fields.
train_data = pd.read_json('../piqa_data/train.jsonl', orient='records', lines=True)
train_data.head()

Unnamed: 0,goal,sol1,sol2
0,"When boiling butter, when it's ready, you can",Pour it onto a plate,Pour it into a jar
1,"To permanently attach metal legs to a chair, y...",Weld the metal together to get it to stay firm...,Nail the metal together to get it to stay firm...
2,how do you indent something?,leave a space before starting the writing,press the spacebar
3,how do you shake something?,move it up and down and side to side quickly.,stir it very quickly.
4,Clean tires,"Pour water, cape off caked on dirt. Use speed...","Pour water, scrape off caked on dirt. Use a st..."


In [30]:
# load wikihow
# Rows with any missing field are dropped up front so the string
# concatenation below never encounters NaN.
wikihow = pd.read_csv("../../data/wikihow/wikihowNoText.csv", index_col=0).dropna()
# Searchable text = title immediately followed by headline. The column-wise
# `+` produces the same strings as a row-wise apply(axis=1) but avoids one
# Python call per row and still yields a column when the frame is empty.
wikihow['content'] = wikihow['title'] + wikihow['headline']
wikihow.head()

Unnamed: 0_level_0,title,headline,content
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,How to Be an Organized Artist1,"\nKeep related supplies in the same area.,\nMa...",How to Be an Organized Artist1\nKeep related s...
1,How to Create a Neopoprealist Art Work,\nCreate a sketch in the NeoPopRealist manner ...,How to Create a Neopoprealist Art Work\nCreate...
2,How to Be a Visual Effects Artist1,"\nGet a bachelor’s degree.,\nEnroll in a studi...",How to Be a Visual Effects Artist1\nGet a bach...
3,How to Become an Art Investor,\nStart with some experience or interest in ar...,How to Become an Art Investor\nStart with some...
4,How to Be an Organized Artist2,"\nKeep your reference materials, sketches, art...",How to Be an Organized Artist2\nKeep your refe...


In [85]:
def sent2tokens(s, tokenizer, stopwords):
    """Return the set of distinct non-stopword tokens found in `s`.

    Args:
        s: text to tokenise.
        tokenizer: object exposing ``tokenize(text)`` that returns an
            iterable of tokens.
        stopwords: container of tokens to discard (membership-tested).

    Returns:
        A ``set`` of the tokens produced by ``tokenizer.tokenize(s)`` that
        are not in `stopwords`.
    """
    return {tok for tok in tokenizer.tokenize(s) if tok not in stopwords}

In [95]:
# Tokenise every goal once, then build one token set per candidate answer by
# merging the goal tokens with that solution's tokens.
train_data['goal_tokens'] = train_data['goal'].map(
    lambda text: sent2tokens(text, tokenizer, stopwords))
for sol_col, out_col in (('sol1', 'goal_sol1_tokens'), ('sol2', 'goal_sol2_tokens')):
    train_data[out_col] = train_data.apply(
        lambda row: row['goal_tokens'] | sent2tokens(row[sol_col], tokenizer, stopwords),
        axis=1)
train_data.head()

Unnamed: 0,goal,sol1,sol2,goal_tokens,goal_sol1_tokens,goal_sol2_tokens
0,"When boiling butter, when it's ready, you can",Pour it onto a plate,Pour it into a jar,"{ready, boiling, butter}","{onto, butter, pour, plate, boiling, ready}","{jar, butter, pour, boiling, ready}"
1,"To permanently attach metal legs to a chair, y...",Weld the metal together to get it to stay firm...,Nail the metal together to get it to stay firm...,"{metal, permanently, legs, attach, chair}","{metal, weld, place, stay, firmly, permanently...","{metal, nail, place, stay, firmly, permanently..."
2,how do you indent something?,leave a space before starting the writing,press the spacebar,"{indent, something}","{starting, indent, leave, space, writing, some...","{spacebar, press, indent, something}"
3,how do you shake something?,move it up and down and side to side quickly.,stir it very quickly.,"{shake, something}","{quickly, shake, move, side, something}","{shake, quickly, stir, something}"
4,Clean tires,"Pour water, cape off caked on dirt. Use speed...","Pour water, scrape off caked on dirt. Use a st...","{clean, tires}","{caked, pour, use, water, speed, wool, crevice...","{caked, pour, use, water, wool, crevices, stee..."


In [57]:
# Pre-compute the token set of every WikiHow entry once so the overlap search
# does not have to re-tokenise the articles for each query.
wikihow['content_tokens'] = wikihow['content'].map(
    lambda text: sent2tokens(text, tokenizer, stopwords))
wikihow.head()

Unnamed: 0_level_0,title,headline,content,content_tokens
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,How to Be an Organized Artist1,"\nKeep related supplies in the same area.,\nMa...",How to Be an Organized Artist1\nKeep related s...,"{ideas, use, inch, place, supplies, permanent,..."
1,How to Create a Neopoprealist Art Work,\nCreate a sketch in the NeoPopRealist manner ...,How to Create a Neopoprealist Art Work\nCreate...,"{images, section, coat, primed, background, ve..."
2,How to Be a Visual Effects Artist1,"\nGet a bachelor’s degree.,\nEnroll in a studi...",How to Be a Visual Effects Artist1\nGet a bach...,"{bachelor, attention, degree, ’, movies, onlin..."
3,How to Become an Art Investor,\nStart with some experience or interest in ar...,How to Become an Art Investor\nStart with some...,"{fine, rather, houses, become, price, interest..."
4,How to Be an Organized Artist2,"\nKeep your reference materials, sketches, art...",How to Be an Organized Artist2\nKeep your refe...,"{necessary, place, supplies, organize, list, e..."


In [107]:
def get_top_overlaps(tokens, threshold=0.2, top_k=5, token_sets=None):
    """Return the labels of the knowledge entries that best overlap `tokens`.

    Each entry is scored as ``len(tokens & entry) / len(entry)``: the share of
    the entry's own tokens that also occur in the query. Because the score is
    normalised by the entry's size, short entries can reach a high score with
    only a few shared tokens.

    Args:
        tokens: set of query tokens.
        threshold: an entry is kept only if its score is strictly greater
            than this value.
        top_k: maximum number of labels to return (default 5).
        token_sets: pandas Series of token sets to search. Defaults to the
            notebook-level ``wikihow['content_tokens']``.

    Returns:
        List of index labels of `token_sets`, highest score first. Empty when
        no entry clears the threshold.
    """
    if token_sets is None:
        token_sets = wikihow['content_tokens']
    # An entry whose token set is empty (e.g. text made only of stopwords or
    # punctuation) scores 0.0 instead of raising ZeroDivisionError.
    overlaps = token_sets.apply(
        lambda x: len(tokens.intersection(x)) / len(x) if x else 0.0)
    over_threshold = overlaps[overlaps > threshold]
    ranked = over_threshold.sort_values(ascending=False)
    return ranked.iloc[:top_k].index.tolist()

In [109]:
# For every (goal + sol1) token set, store the WikiHow row labels returned by
# get_top_overlaps. Each call scans all WikiHow entries, so this cell is slow;
# the equivalent sol2 pass in this notebook recorded about 3008 seconds.
train_data['sol1_knowledge_idx'] = train_data['goal_sol1_tokens'].apply(get_top_overlaps)

In [108]:
import time
# perf_counter() is a monotonic clock meant for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system clock is
# adjusted while this long-running cell executes.
a = time.perf_counter()
# Same retrieval as for sol1, applied to the (goal + sol2) token sets.
train_data['sol2_knowledge_idx'] = train_data['goal_sol2_tokens'].apply(get_top_overlaps)
b = time.perf_counter()
print(round(b-a, 2), 'sec')

3008.37 sec


In [112]:
# Attach the text of the top-ranked WikiHow entry (the first label in the
# list), or an empty string when no entry cleared the overlap threshold.
# `.loc[label, 'content']` reads the single cell directly instead of first
# materialising the whole row and then indexing into it.
train_data['sol1_best_knowledge'] = train_data['sol1_knowledge_idx'].apply(
    lambda l: wikihow.loc[l[0], 'content'] if l else "")

In [113]:
# Attach the text of the top-ranked WikiHow entry for the sol2 candidate (the
# first label in the list), or an empty string when the list is empty.
# `.loc[label, 'content']` reads the single cell directly instead of first
# materialising the whole row and then indexing into it.
train_data['sol2_best_knowledge'] = train_data['sol2_knowledge_idx'].apply(
    lambda l: wikihow.loc[l[0], 'content'] if l else "")
train_data.head()

Unnamed: 0,goal,sol1,sol2,goal_tokens,goal_sol1_tokens,goal_sol2_tokens,sol1_knowledge_idx,sol2_knowledge_idx,sol1_best_knowledge,sol2_best_knowledge
0,"When boiling butter, when it's ready, you can",Pour it onto a plate,Pour it into a jar,"{ready, boiling, butter}","{onto, butter, pour, plate, boiling, ready}","{jar, butter, pour, boiling, ready}","[196943, 168565, 158230, 169090, 131919]","[131755, 125889, 196943, 188145]",How to Unclog a Drain with Salt and Vinegar3\n...,How to Make Irish Butter\nPour the Irish cream...
1,"To permanently attach metal legs to a chair, y...",Weld the metal together to get it to stay firm...,Nail the metal together to get it to stay firm...,"{metal, permanently, legs, attach, chair}","{metal, weld, place, stay, firmly, permanently...","{metal, nail, place, stay, firmly, permanently...","[49690, 176875, 185137, 53962, 205920]","[49690, 53962, 185137, 176875, 29058]",How to Get Exp. Share5\nTo get an Exp.,How to Get Exp. Share5\nTo get an Exp.
2,how do you indent something?,leave a space before starting the writing,press the spacebar,"{indent, something}","{starting, indent, leave, space, writing, some...","{spacebar, press, indent, something}",[208399],[59145],"How to Dance2\nGet lined up.,\nPosition the ar...",How to Falcon Kick1\nPress B + down.
3,how do you shake something?,move it up and down and side to side quickly.,stir it very quickly.,"{shake, something}","{quickly, shake, move, side, something}","{shake, quickly, stir, something}",[],[],,
4,Clean tires,"Pour water, cape off caked on dirt. Use speed...","Pour water, scrape off caked on dirt. Use a st...","{clean, tires}","{caked, pour, use, water, speed, wool, crevice...","{caked, pour, use, water, wool, crevices, stee...","[203356, 21043, 33813, 187551, 199629]","[203356, 21043, 33813, 197807, 199629]","How to Clean Rubber Stamps1\nUse soapy water.,...","How to Clean Rubber Stamps1\nUse soapy water.,..."


In [114]:
# Keep the original fields plus the retrieved knowledge; the intermediate
# token-set columns are left out of the export.
train_with_knowledge = train_data.loc[:, [
    'goal', 'sol1', 'sol2',
    'sol1_knowledge_idx', 'sol2_knowledge_idx',
    'sol1_best_knowledge', 'sol2_best_knowledge',
]]
# One JSON record per line.
train_with_knowledge.to_json("../piqa_data/train-overlap.jsonl", orient='records', lines=True)

#### Repeat with Validation Set

In [115]:
# Load the validation split from a JSON Lines file (one record per line) and
# preview the first rows.
valid = pd.read_json('../piqa_data/valid.jsonl', orient='records', lines=True)
valid.head()

Unnamed: 0,goal,sol1,sol2
0,How do I ready a guinea pig cage for it's new ...,Provide the guinea pig with a cage full of a f...,Provide the guinea pig with a cage full of a f...
1,dresser,replace drawer with bobby pin,"finish, woodgrain with bobby pin"
2,To fight Ivan Drago in Rocky for sega master s...,Drago isn't in this game because it was releas...,You have to defeat Apollo Creed and Clubber La...
3,Make outdoor pillow.,Blow into tin can and tie with rubber band.,Blow into trash bag and tie with rubber band.
4,ice box,will turn into a cooler if you add water to it,will turn into a cooler if you add soda to it


In [116]:
# Tokenise every goal once, then build one token set per candidate answer by
# merging the goal tokens with that solution's tokens.
valid['goal_tokens'] = valid['goal'].map(
    lambda text: sent2tokens(text, tokenizer, stopwords))
for sol_col, out_col in (('sol1', 'goal_sol1_tokens'), ('sol2', 'goal_sol2_tokens')):
    valid[out_col] = valid.apply(
        lambda row: row['goal_tokens'] | sent2tokens(row[sol_col], tokenizer, stopwords),
        axis=1)
valid.head()

Unnamed: 0,goal,sol1,sol2,goal_tokens,goal_sol1_tokens,goal_sol2_tokens
0,How do I ready a guinea pig cage for it's new ...,Provide the guinea pig with a cage full of a f...,Provide the guinea pig with a cage full of a f...,"{ready, cage, new, guinea, occupants, pig}","{water, bedding, dish, ready, new, also, food,...","{water, bedding, dish, ready, new, jeans, mate..."
1,dresser,replace drawer with bobby pin,"finish, woodgrain with bobby pin",{dresser},"{drawer, bobby, replace, pin, dresser}","{bobby, finish, pin, dresser, woodgrain}"
2,To fight Ivan Drago in Rocky for sega master s...,Drago isn't in this game because it was releas...,You have to defeat Apollo Creed and Clubber La...,"{ivan, drago, fight, rocky, master, system, sega}","{released, ivan, drago, fight, iv, rocky, mast...","{ivan, lang, drago, fight, defeat, rocky, cree..."
3,Make outdoor pillow.,Blow into tin can and tie with rubber band.,Blow into trash bag and tie with rubber band.,"{pillow, outdoor, make}","{pillow, tin, make, blow, outdoor, band, tie, ...","{pillow, bag, make, blow, outdoor, trash, band..."
4,ice box,will turn into a cooler if you add water to it,will turn into a cooler if you add soda to it,"{box, ice}","{cooler, water, box, ice, add, turn}","{soda, cooler, box, ice, add, turn}"


In [117]:
import time
# perf_counter() is a monotonic clock meant for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system clock is
# adjusted while this long-running cell executes.
a = time.perf_counter()
# Retrieve the best-overlapping WikiHow row labels for both candidates.
valid['sol1_knowledge_idx'] = valid['goal_sol1_tokens'].apply(get_top_overlaps)
valid['sol2_knowledge_idx'] = valid['goal_sol2_tokens'].apply(get_top_overlaps)
b = time.perf_counter()
print(round(b-a, 2), 'sec')

635.45 sec


In [118]:
# Preview the validation rows with the retrieved knowledge index lists.
valid.head()

Unnamed: 0,goal,sol1,sol2,goal_tokens,goal_sol1_tokens,goal_sol2_tokens,sol1_knowledge_idx,sol2_knowledge_idx
0,How do I ready a guinea pig cage for it's new ...,Provide the guinea pig with a cage full of a f...,Provide the guinea pig with a cage full of a f...,"{ready, cage, new, guinea, occupants, pig}","{water, bedding, dish, ready, new, also, food,...","{water, bedding, dish, ready, new, jeans, mate...","[12402, 169134, 65609, 185220, 1348]","[12402, 65609, 1348, 185220, 1329]"
1,dresser,replace drawer with bobby pin,"finish, woodgrain with bobby pin",{dresser},"{drawer, bobby, replace, pin, dresser}","{bobby, finish, pin, dresser, woodgrain}",[],[]
2,To fight Ivan Drago in Rocky for sega master s...,Drago isn't in this game because it was releas...,You have to defeat Apollo Creed and Clubber La...,"{ivan, drago, fight, rocky, master, system, sega}","{released, ivan, drago, fight, iv, rocky, mast...","{ivan, lang, drago, fight, defeat, rocky, cree...",[165894],"[165894, 55026, 55025, 55024]"
3,Make outdoor pillow.,Blow into tin can and tie with rubber band.,Blow into trash bag and tie with rubber band.,"{pillow, outdoor, make}","{pillow, tin, make, blow, outdoor, band, tie, ...","{pillow, bag, make, blow, outdoor, trash, band...","[204891, 201779, 183496, 202389, 206281]","[91673, 79263, 204891, 91672, 201779]"
4,ice box,will turn into a cooler if you add water to it,will turn into a cooler if you add soda to it,"{box, ice}","{cooler, water, box, ice, add, turn}","{soda, cooler, box, ice, add, turn}","[185220, 157466, 118688, 159645, 134496]","[118688, 157476, 188953, 104011, 104012]"


In [119]:
# Attach the text of the top-ranked WikiHow entry (the first label in each
# list), or an empty string when no entry cleared the overlap threshold.
# `.loc[label, 'content']` reads the single cell directly instead of first
# materialising the whole row and then indexing into it.
valid['sol1_best_knowledge'] = valid['sol1_knowledge_idx'].apply(
    lambda l: wikihow.loc[l[0], 'content'] if l else "")
valid['sol2_best_knowledge'] = valid['sol2_knowledge_idx'].apply(
    lambda l: wikihow.loc[l[0], 'content'] if l else "")

In [126]:
# Keep the original fields plus the retrieved knowledge; the intermediate
# token-set columns are left out of the export.
valid_with_knowledge = valid.loc[:, [
    'goal', 'sol1', 'sol2',
    'sol1_knowledge_idx', 'sol2_knowledge_idx',
    'sol1_best_knowledge', 'sol2_best_knowledge',
]]
# One JSON record per line.
valid_with_knowledge.to_json("../piqa_data/valid-overlap.jsonl", orient='records', lines=True)

In [127]:
# Preview the exported validation frame.
valid_with_knowledge.head()

Unnamed: 0,goal,sol1,sol2,sol1_knowledge_idx,sol2_knowledge_idx,sol1_best_knowledge,sol2_best_knowledge
0,How do I ready a guinea pig cage for it's new ...,Provide the guinea pig with a cage full of a f...,Provide the guinea pig with a cage full of a f...,"[12402, 169134, 65609, 185220, 1348]","[12402, 65609, 1348, 185220, 1329]",How to Care for Snails2\nGive the snail calciu...,How to Care for Snails2\nGive the snail calciu...
1,dresser,replace drawer with bobby pin,"finish, woodgrain with bobby pin",[],[],,
2,To fight Ivan Drago in Rocky for sega master s...,Drago isn't in this game because it was releas...,You have to defeat Apollo Creed and Clubber La...,[165894],"[165894, 55026, 55025, 55024]",How to Rename a Drive2\nRename a system drive.,How to Rename a Drive2\nRename a system drive.
3,Make outdoor pillow.,Blow into tin can and tie with rubber band.,Blow into trash bag and tie with rubber band.,"[204891, 201779, 183496, 202389, 206281]","[91673, 79263, 204891, 91672, 201779]",How to Make a Pillow Stand for iPad1\nDownload...,How to Be Green2\nPick up trash.
4,ice box,will turn into a cooler if you add water to it,will turn into a cooler if you add soda to it,"[185220, 157466, 118688, 159645, 134496]","[118688, 157476, 188953, 104011, 104012]",How to Install a New Dishwasher1\nTurn off the...,How to Stretch Your Back While Sitting2\nDo a ...


In [123]:
# Spot-check one retrieved entry: label 131755 is the first sol2 match listed
# for training row 0 in the table above.
wikihow.loc[131755]

title                                      How to Make Irish Butter
headline          \nPour the Irish cream into a large clean jar....
content           How to Make Irish Butter\nPour the Irish cream...
content_tokens    {jar, pour, make, refrigerator, store, large, ...
Name: 131755, dtype: object

### Archive: scratch cells comparing overlap normalisations (by query size vs. by article size)

In [96]:
# `question_option_tokens` is not defined by any surviving cell, so this cell
# raised NameError on a fresh kernel. The recorded scores (0.8333 = 5/6) fit
# the six-token goal+sol1 set of the first training row, so it is rebuilt
# from there -- confirm this is the query that was originally used.
question_option_tokens = train_data['goal_sol1_tokens'].iloc[0]
# Score = share of the QUERY tokens that appear in each WikiHow entry.
overlaps = wikihow['content_tokens'].apply(
    lambda x: len(question_option_tokens.intersection(x)) / len(question_option_tokens))
overlaps[overlaps > 0.75].sort_values()

index
87311     0.833333
125924    0.833333
126389    0.833333
129723    0.833333
130992    0.833333
132308    0.833333
Name: content_tokens, dtype: float64

In [98]:
# `question_option_tokens` is not defined by any surviving cell, so this cell
# raised NameError on a fresh kernel. The recorded output lists the same five
# labels as the first training row's sol1_knowledge_idx, so the query is
# rebuilt from that row's goal+sol1 token set -- confirm this matches the
# original experiment.
question_option_tokens = train_data['goal_sol1_tokens'].iloc[0]
# Score = share of each WikiHow ENTRY's tokens that appear in the query.
overlaps2 = wikihow['content_tokens'].apply(
    lambda x: len(question_option_tokens.intersection(x)) / len(x))
overlaps2[overlaps2 > 0.2].sort_values(ascending=False)

index
196943    0.222222
168565    0.222222
158230    0.222222
169090    0.214286
131919    0.214286
Name: content_tokens, dtype: float64