In [3]:
import sys
sys.path.append("../src")

In [4]:
%load_ext autoreload
%autoreload 2

import datasets
from tqdm import tqdm
import pandas as pd

from process_examples import extract_terms
from utils import load_conceptnet, normalize_conceptnet, normalize_input
from find_shortest_path import find_word_path, search_shortest_path

[nltk_data] Downloading package wordnet to /home/felix/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/felix/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Load the "commonsense_qa" dataset through the `datasets` library; the
# resulting object is indexed by split name below (e.g. cqa["train"]).
cqa = datasets.load_dataset("commonsense_qa")

Downloading: 4.16kB [00:00, 1.60MB/s]                   
Downloading: 2.31kB [00:00, 1.26MB/s]                   
Using custom data configuration default


Downloading and preparing dataset commonsense_qa/default (download: 4.46 MiB, generated: 2.08 MiB, post-processed: Unknown size, total: 6.54 MiB) to /home/felix/.cache/huggingface/datasets/commonsense_qa/default/0.1.0/1ca2d7b680c5bd93c0dc85f9cb65c0c8817e759ff82e405b28de54e83efa80f7...


Downloading: 100%|██████████| 3.79M/3.79M [00:03<00:00, 1.14MB/s]
Downloading: 100%|██████████| 472k/472k [00:00<00:00, 931kB/s]
Downloading: 100%|██████████| 423k/423k [00:00<00:00, 916kB/s]
100%|██████████| 3/3 [00:07<00:00,  2.52s/it]
100%|██████████| 3/3 [00:00<00:00, 763.57it/s]
                                          

Dataset commonsense_qa downloaded and prepared to /home/felix/.cache/huggingface/datasets/commonsense_qa/default/0.1.0/1ca2d7b680c5bd93c0dc85f9cb65c0c8817e759ff82e405b28de54e83efa80f7. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 285.82it/s]


In [12]:
# Load ConceptNet with the project helper imported from `utils`.
# NOTE(review): the returned object's structure is defined in utils, not here.
conceptnet = load_conceptnet()

In [46]:
cqa["train"][4]

{'answerKey': 'C',
 'question': 'The fox walked from the city into the forest, what was it looking for?',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['pretty flowers.',
   'hen house',
   'natural habitat',
   'storybook',
   'dense forest']}}

In [16]:
from collections import Counter

In [17]:
terms = Counter()

In [19]:
# Start from an empty counter so that re-running this cell does not add every
# question's terms a second time on top of the counts from a previous run.
terms = Counter()

# Tally the terms that extract_terms yields for each training question.
for q in tqdm(cqa["train"]["question"]):
    terms.update(extract_terms(q))

100%|██████████| 9741/9741 [01:43<00:00, 94.19it/s] 


In [24]:
len(terms)*0.2

2697.4

In [35]:
# Rank every term by descending frequency, then keep ranks 200..1199:
# the 200 most frequent terms are skipped and the next 1000 are retained.
ranked_terms = terms.most_common()
relevant_terms = ranked_terms[200:1200]
len(relevant_terms)

1000

In [37]:
# Flat list of {"from", "to", "label", "weight"} records, one per edge
# descriptor leaving a relevant term's ConceptNet node.
triples = []

for term, _count in tqdm(relevant_terms):
    # Terms without a ConceptNet node contribute nothing.
    if term in conceptnet.nodes_name2idx:
        source_idx = conceptnet.nodes_name2idx[term]

        for target_idx in conceptnet.adjacency_lists[source_idx]:
            edge_key = (source_idx, target_idx)

            # Only node pairs that have stored edge descriptors are emitted.
            if edge_key in conceptnet.edge_descriptors:
                for descriptor in conceptnet.edge_descriptors[edge_key]:
                    relation_name = conceptnet.labels_idx2name[descriptor.label_idx]

                    record = {
                        "from": term,
                        "to": conceptnet.nodes_idx2name[target_idx],
                        "label": relation_name,
                        "weight": descriptor.weight
                    }
                    triples.append(record)

100%|██████████| 1000/1000 [00:00<00:00, 2571.91it/s]


In [38]:
len(triples)

120579

In [63]:
# Turn the list of triple records into a DataFrame; each record carries the
# keys "from", "to", "label" and "weight", which become the columns.
triples_df = pd.DataFrame(triples)

In [62]:
# Question template per supported ConceptNet relation label. The "{0}"
# placeholder is filled with a term via str.format when a question is built.
questions_map = dict([
    ("/r/AtLocation", "Where are you likely to find {0}?"),
    ("/r/UsedFor", "What is {0} used for?"),
    ("/r/PartOf", "What can {0} be a part of?"),
    ("/r/CapableOf", "What can {0} do?"),
    ("/r/SimilarTo", "What is {0} similar to?"),
])

In [68]:
# Two filters applied in sequence:
#   1. keep only triples whose relation label has a question template;
#   2. drop triples whose source and target term are the same string.
triples_df = (
    triples_df
    .query("label in @questions_map")
    .query("`from` != `to`")
)

In [69]:
triples_df["from"].nunique()

741

In [70]:
# Order triples from heaviest to lightest, then keep the first row seen for
# each source term, i.e. one highest-weight triple per "from" value.
triples_by_weight = triples_df.sort_values(by="weight", ascending=False)
selected_triples = triples_by_weight.drop_duplicates(subset="from")

In [71]:
selected_triples

Unnamed: 0,from,to,label,weight
14810,fish,water,/r/AtLocation,11.489
22791,shark,ocean,/r/AtLocation,11.314
3066,alcohol,cloud judgement,/r/CapableOf,10.000
28115,clothing,closet,/r/AtLocation,10.000
53397,telephone,desk,/r/AtLocation,9.381
...,...,...,...,...
43338,effect,include vomiting,/r/CapableOf,1.000
41153,hope,bring fortune,/r/CapableOf,1.000
108972,york,england,/r/PartOf,0.500
94687,today,manhattan,/r/AtLocation,0.500


In [72]:
import random

In [78]:
triples_df.query("`from` == 'fish' and `to` == 'water' and label == '/r/AtLocation'")

Unnamed: 0,from,to,label,weight
14810,fish,water,/r/AtLocation,11.489
14811,fish,water,/r/AtLocation,1.0


In [85]:
selected_triples["to"].sample(1).values[0]

'breadbox'

In [86]:
# Build one five-way multiple-choice question per selected triple.
#
# A private, seeded generator makes the sampled distractors and the position of
# the correct answer identical on every fresh run, without touching the global
# `random` state.
rng = random.Random(42)

choice_labels = ["A", "B", "C", "D", "E"]
n_wrong = len(choice_labels) - 1

# Every (from, to, label) combination present in triples_df. A set lookup
# replaces running one DataFrame.query per sampled candidate.
known_facts = set(zip(triples_df["from"], triples_df["to"], triples_df["label"]))

# Distractors are drawn from the answers of all selected triples. Duplicates
# are kept so each row is equally likely, as when sampling rows of the column.
candidate_answers = selected_triples["to"].tolist()

questions = []

for _, triple in tqdm(selected_triples.iterrows(), total=len(selected_triples)):
    keyword = triple["from"]
    relation = triple["label"]
    correct_answer = triple["to"]

    # A usable distractor differs from the correct answer and is not itself a
    # known answer for this keyword under the same relation.
    distractor_pool = [
        answer
        for answer in candidate_answers
        if answer != correct_answer
        and (keyword, answer, relation) not in known_facts
    ]

    # Fail loudly instead of looping forever when too few distinct
    # distractors exist for this triple.
    if len(set(distractor_pool)) < n_wrong:
        raise ValueError(
            f"Not enough distinct wrong answers for {keyword!r} ({relation})"
        )

    # A list (not a set) keeps the draw order, so the final option order does
    # not depend on string hash randomization.
    wrong_answers = []
    while len(wrong_answers) < n_wrong:
        answer = rng.choice(distractor_pool)
        if answer not in wrong_answers:
            wrong_answers.append(answer)

    # Place the correct answer at a random position among the options.
    answer_idx = rng.randrange(len(choice_labels))
    choices_text = list(wrong_answers)
    choices_text.insert(answer_idx, correct_answer)

    questions.append({
        "answerKey": choice_labels[answer_idx],
        "question": questions_map[relation].format(keyword),
        "choices": {
            # Fresh list per question so records never share a mutable object.
            "label": list(choice_labels),
            "text": choices_text
        }
    })

741it [00:20, 36.68it/s]


In [89]:
# Shuffle in place with a fixed seed so that the train/validation split taken
# from this order is the same on every fresh run. Re-running only this cell
# shuffles the already-shuffled list again, so re-run the generation cell first.
random.Random(42).shuffle(questions)

In [91]:
len(questions)

741

In [94]:
741*0.9

666.9

In [95]:
# 90/10 train/validation split. The cut-off is derived from the number of
# questions instead of being hard-coded, so it stays correct if the question
# count changes (for 741 questions it is round(666.9) == 667).
train_fraction = 0.9
n_train = round(len(questions) * train_fraction)

train_questions = questions[:n_train]
val_questions = questions[n_train:]

In [96]:
# One row per training question dict (keys: answerKey, question, choices).
train_questions_df = pd.DataFrame(train_questions)

In [98]:
# Convert the training frame into a datasets.Dataset labelled as split "train".
train_ds = datasets.Dataset.from_pandas(train_questions_df, split="train")

In [100]:
# Same conversion for the held-out questions, labelled as split "validation".
val_ds = datasets.Dataset.from_pandas(pd.DataFrame(val_questions), split="validation")

In [107]:
val_ds[:5]

{'answerKey': ['D', 'E', 'B', 'B', 'D'],
 'question': ['What can leaf be a part of?',
  'What is noise used for?',
  'What is poor similar to?',
  'What is standing similar to?',
  'What can idea do?'],
 'choices': [{'label': ['A', 'B', 'C', 'D', 'E'],
   'text': ['bring fortune',
    'holding',
    'wet diaper',
    'tree',
    'getting on top']},
  {'label': ['A', 'B', 'C', 'D', 'E'],
   'text': ['prosperous',
    'gregarious',
    'scene',
    'bottom out',
    'get attention']},
  {'label': ['A', 'B', 'C', 'D', 'E'],
   'text': ['cool drink', 'stone broke', 'whole', 'at peace', 'house']},
  {'label': ['A', 'B', 'C', 'D', 'E'],
   'text': ['regular exercise', 'erect', 'deal blow', 'beautiful', 'musical']},
  {'label': ['A', 'B', 'C', 'D', 'E'],
   'text': ['league',
    'repair shop',
    'perplexed',
    'divide countries',
    'cupboard']}]}

In [101]:
# Bundle both splits into one DatasetDict keyed by split name.
questions_ds = datasets.DatasetDict({"train": train_ds, "validation": val_ds})

In [102]:
questions_ds

DatasetDict({
    train: Dataset({
        features: ['answerKey', 'question', 'choices'],
        num_rows: 667
    })
    validation: Dataset({
        features: ['answerKey', 'question', 'choices'],
        num_rows: 74
    })
})

In [103]:
# Persist both splits under the relative path "conceptnet_generated_questions".
questions_ds.save_to_disk("conceptnet_generated_questions")

In [5]:
# Reload the DatasetDict from the path it was saved to with save_to_disk.
ds = datasets.DatasetDict.load_from_disk("conceptnet_generated_questions")

In [8]:
import json

In [None]:
# NOTE: this cell used to contain only the unfinished expression
# `datasets.DatasetDict.` (a trailing attribute access with no attribute name),
# which is a SyntaxError and stops "Restart Kernel -> Run All". It assigned
# nothing, so it is reduced to this comment.

In [7]:
datasets.__version__

'1.18.3'

In [6]:
ds

DatasetDict({
    train: Dataset({
        features: ['answerKey', 'question', 'choices'],
        num_rows: 667
    })
    validation: Dataset({
        features: ['answerKey', 'question', 'choices'],
        num_rows: 74
    })
})

In [12]:
# Export each split to its own JSON file in the working directory. The cell
# displays the return value of the last call (the validation export).
ds["train"].to_json("conceptnet_ds_train.json")
ds["validation"].to_json("conceptnet_ds_validation.json")

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 35.41ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 325.77ba/s]


12534

In [13]:
# Rebuild the DatasetDict from the two exported JSON files, reading the
# training file first and the validation file second.
train_from_json = datasets.Dataset.from_json("conceptnet_ds_train.json")
validation_from_json = datasets.Dataset.from_json("conceptnet_ds_validation.json")

ds = datasets.DatasetDict({
    "train": train_from_json,
    "validation": validation_from_json,
})

Using custom data configuration default-34e778201200c26f


Downloading and preparing dataset json/default to /home/felix/.cache/huggingface/datasets/json/default-34e778201200c26f/0.0.0...


100%|██████████| 1/1 [00:00<00:00, 2910.69it/s]
100%|██████████| 1/1 [00:00<00:00, 270.58it/s]
Using custom data configuration default-586bff775a291fe8


Dataset json downloaded and prepared to /home/felix/.cache/huggingface/datasets/json/default-34e778201200c26f/0.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset json/default to /home/felix/.cache/huggingface/datasets/json/default-586bff775a291fe8/0.0.0...


100%|██████████| 1/1 [00:00<00:00, 1777.25it/s]
100%|██████████| 1/1 [00:00<00:00, 662.71it/s]

Dataset json downloaded and prepared to /home/felix/.cache/huggingface/datasets/json/default-586bff775a291fe8/0.0.0. Subsequent calls will reuse this data.





In [14]:
ds

DatasetDict({
    train: Dataset({
        features: ['answerKey', 'question', 'choices'],
        num_rows: 667
    })
    validation: Dataset({
        features: ['answerKey', 'question', 'choices'],
        num_rows: 74
    })
})

In [99]:
cqa

DatasetDict({
    train: Dataset({
        features: ['answerKey', 'question', 'choices'],
        num_rows: 9741
    })
    validation: Dataset({
        features: ['answerKey', 'question', 'choices'],
        num_rows: 1221
    })
    test: Dataset({
        features: ['answerKey', 'question', 'choices'],
        num_rows: 1140
    })
})

In [97]:
train_questions_df

Unnamed: 0,answerKey,question,choices
0,B,What is dish used for?,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [..."
1,B,Where are you likely to find furniture?,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [..."
2,D,What is loud similar to?,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [..."
3,A,What is million similar to?,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [..."
4,E,Where are you likely to find fish?,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [..."
...,...,...,...
662,D,What is unlikely similar to?,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [..."
663,B,What is expected similar to?,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [..."
664,E,Where are you likely to find singing?,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [..."
665,C,What is stuck similar to?,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [..."


In [90]:
questions[5:10]

[{'answerKey': 'E',
  'question': 'What is cool similar to?',
  'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
   'text': ['stairway', 'hole', 'cooking food', 'matter to voters', 'cold']}},
 {'answerKey': 'E',
  'question': 'Where are you likely to find bedroom?',
  'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
   'text': ['nonprofessional',
    'locker room',
    'finger',
    'make love',
    'house']}},
 {'answerKey': 'C',
  'question': 'What is fishing used for?',
  'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
   'text': ['closely held', 'productive', 'catching fish', 'up', 'living']}},
 {'answerKey': 'A',
  'question': 'What is yard used for?',
  'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
   'text': ['playing', 'page employee', 'decade', 'life', 'house']}},
 {'answerKey': 'C',
  'question': 'What can general do?',
  'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
   'text': ['sporting event',
    'light room',
    'lead army',
    'market',
    'absorbent']}}]

In [None]:
{'answerKey': 'C',
 'question': 'The fox walked from the city into the forest, what was it looking for?',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['pretty flowers.',
   'hen house',
   'natural habitat',
   'storybook',
   'dense forest']}}

In [None]:
# NOTE(review): scratch cell that was never executed (no execution count).
# `.sample()` is called without `n`/`frac`, so it presumably returns a single
# row; assigning that as a column would then populate only the matching index
# and leave the rest missing. It also adds a column to `selected_triples`, which
# other cells read. Confirm the intent or remove this cell.
selected_triples["wrong_answers"] = selected_triples["to"].sample()

In [51]:
triples_df.label.value_counts()

/r/AtLocation    6327
/r/UsedFor       5736
/r/Synonym       5662
/r/CapableOf     3865
/r/SimilarTo     1593
Name: label, dtype: int64

In [34]:
relevant_terms[:20]

[('product', 40),
 ('many people', 40),
 ('committing', 40),
 ('standing', 40),
 ('box', 39),
 ('parent', 39),
 ('glass', 39),
 ('running', 39),
 ('common', 39),
 ('different', 39),
 ('horse', 39),
 ('year', 38),
 ('meeting', 38),
 ('drunk', 38),
 ('hour', 38),
 ('alcohol', 38),
 ('potato', 38),
 ('plant', 38),
 ('seeing', 38),
 ('sleep', 38)]

In [21]:
terms.most_common(20)

[('wa', 1661),
 (' ', 942),
 ('people', 752),
 ('likely', 698),
 ('find', 575),
 ('person', 549),
 ('doe', 510),
 ('want', 368),
 ('what', 350),
 ('place', 303),
 ('wanted', 270),
 ('need', 263),
 ('john', 242),
 ('time', 238),
 ('ha', 219),
 ('like', 219),
 ('found', 212),
 ('work', 212),
 ('james', 208),
 ('good', 208)]

In [22]:
terms.most_common()[:-21:-1]

[('my ground pump', 1),
 ('electricity bill', 1),
 ('pump', 1),
 ('indoor merchants', 1),
 ('major city', 1),
 ('public transit', 1),
 ('transit', 1),
 ('cooled', 1),
 ('likely encounter', 1),
 ('insight', 1),
 ("director's commentary", 1),
 ('ones', 1),
 ('all varieties', 1),
 ('two year college degrees', 1),
 ('associates', 1),
 ('crossword puzzle', 1),
 ('ceo', 1),
 ("ceo's curiosity", 1),
 ("billy's disability", 1),
 ('alter', 1)]

In [15]:
cqa["train"]["question"]

['The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?',
 'Sammy wanted to go to where the people were.  Where might he go?',
 'To locate a choker not located in a jewelry box or boutique where would you go?',
 'Google Maps and other highway and street GPS services have replaced what?',
 'The fox walked from the city into the forest, what was it looking for?',
 'What home entertainment equipment requires cable?',
 'The only baggage the woman checked was a drawstring bag, where was she heading with it?',
 'The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?',
 'What do people use to absorb extra ink from a fountain pen?',
 'Where is a business restaurant likely to be located?',
 'Where do you put your grapes just before checking out?',
 'Before getting a divorce, what did the wife feel who was doing all the work?',
 'Johnny sat on a bench and relaxed after doing a lot of 

In [3]:
cqa

DatasetDict({
    train: Dataset({
        features: ['answerKey', 'question', 'choices'],
        num_rows: 9741
    })
    validation: Dataset({
        features: ['answerKey', 'question', 'choices'],
        num_rows: 1221
    })
    test: Dataset({
        features: ['answerKey', 'question', 'choices'],
        num_rows: 1140
    })
})