# NLP Commonsense - Assignment 1

This file contains the code to generate the evaluations presented and explained in the report.

In [12]:
%load_ext autoreload
%autoreload 2

import itertools

import pandas as pd
from tqdm import tqdm

from process_examples import load_examples, extract_terms_from_example
from utils import load_conceptnet, normalize_conceptnet, normalize_input
from find_shortest_path import find_word_path, render_path_verbose, search_shortest_path

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
examples = load_examples() # load examples.txt in machine-readable form
conceptnet = load_conceptnet() # load preprocessed ConceptNet pickle 

## Extract Terms for Different Examples

In [3]:
extract_terms_from_example(examples[0])

({'bag',
  'baggage',
  'checked',
  'drawstring',
  'drawstring bag',
  'heading',
  'only baggage',
  'wa',
  'woman'},
 {'airport',
  'garbage',
  'jewelry',
  'jewelry store',
  'military',
  'safe',
  'store'})

In [4]:
extract_terms_from_example(examples[2])

({'enemy',
  'evidence',
  'happened',
  'his enemy',
  'incriminating',
  'man',
  'result',
  'uncovered'},
 {'avoided', 'blackmailed', 'enemy', 'his enemy', 'man'})

## Investige Path Search With Different Parametrs

In [10]:
find_word_path("safe", "baggage", conceptnet)

[]

In [6]:
# example for a path with length 4, but runs too long and is not really meaningful
find_word_path("safe", "baggage", conceptnet, max_path_len=4)

'safe --RelatedTo--> heavy <--RelatedTo-- carry --Antonym--> baggage'

In [7]:
find_word_path("airport", "baggage", conceptnet)

'airport <--AtLocation-- baggage'

In [8]:
find_word_path("airport", "baggage", conceptnet, renderer=render_path_verbose)

['airport (35496)',
 '/r/AtLocation (idx 1, weight 3.464, reversed),/r/AtLocation (idx 1, weight 2.828, reversed)',
 'baggage (121612)']

In [58]:
find_word_path("checked", "garbage", conceptnet, max_path_len=4)

'checked --HasContext--> north america --HasContext--> canada <--HasContext-- garbage'

In [59]:
find_word_path("safe", "baggage", conceptnet, max_path_len=4)

'safe --RelatedTo--> heavy <--RelatedTo-- carry --Antonym--> baggage'

In [63]:
find_word_path("punish", "away", conceptnet, max_path_len=4)

'punish --MotivatedByGoal--> mean <--RelatedTo-- away'

In [64]:
find_word_path("excited", "apply", conceptnet, max_path_len=4)

'excited <--HasSubevent-- score home run --HasPrerequisite--> play baseball --HasPrerequisite--> apply'

In [65]:
find_word_path("harvard", "letter", conceptnet, max_path_len=4)

'harvard --IsA--> college --HasContext--> canada <--HasContext-- letter'

## Extract Paths for All Examples

In [54]:
results = []

for example_idx, example in enumerate(tqdm(examples), start=1):
    question_context, choices = extract_terms_from_example(example)

    for tq in question_context:
        for tc in choices:
            p = find_word_path(tq, tc, conceptnet)

            results.append(
                {
                    "example": example_idx,
                    "question_context": tq,
                    "choices": tc,
                    "path": p,
                }
            )


100%|██████████| 10/10 [07:08<00:00, 42.82s/it]


In [55]:
path_df = pd.DataFrame(results)

In [56]:
path_df

Unnamed: 0,example,question_context,choices,path
0,1,drawstring,military,drawstring --PartOf--> drawstring bag --AtLoca...
1,1,drawstring,jewelry,[]
2,1,drawstring,jewelry store,drawstring --PartOf--> drawstring bag --AtLoca...
3,1,drawstring,safe,drawstring --PartOf--> drawstring bag --AtLoca...
4,1,drawstring,store,drawstring --PartOf--> drawstring bag --AtLoca...
...,...,...,...,...
804,10,water,lift,water --HasContext--> dialectal <--HasContext-...
805,10,water,bottle,water --AtLocation--> bottle
806,10,water,suction,[]
807,10,water,press,water <--RelatedTo-- cast <--MannerOf-- press


In [57]:
path_df.index.name = "index"
path_df.to_csv("../data/processed/paths_for_examples_after_article_fix.csv")

## Additional Evaluations

The following evaluations were not directly parts of the assignments tasks, but provided useful 
additional information.

### Investigate Suffixes of Node Labels

In [None]:
df = pd.read_csv("../data/processed/en_edges.csv")

In [None]:
"/c/en/0/n".split("/")

['', 'c', 'en', '0', 'n']

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,uri,label,start,end,info
0,0,"/a/[/r/Antonym/,/c/en/0/n/,/c/en/1/]",/r/Antonym,/c/en/0/n,/c/en/1,"{""dataset"": ""/d/wiktionary/fr"", ""license"": ""cc..."
1,1,"/a/[/r/Antonym/,/c/en/12_hour_clock/n/,/c/en/2...",/r/Antonym,/c/en/12_hour_clock/n,/c/en/24_hour_clock,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc..."
2,2,"/a/[/r/Antonym/,/c/en/24_hour_clock/n/,/c/en/1...",/r/Antonym,/c/en/24_hour_clock/n,/c/en/12_hour_clock,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc..."
3,3,"/a/[/r/Antonym/,/c/en/5/n/,/c/en/3/]",/r/Antonym,/c/en/5/n,/c/en/3,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc..."
4,4,"/a/[/r/Antonym/,/c/en/a.c/n/,/c/en/d.c/]",/r/Antonym,/c/en/a.c/n,/c/en/d.c,"{""dataset"": ""/d/wiktionary/fr"", ""license"": ""cc..."


In [None]:
set(parts[-1] for name in df.start.unique() if len(parts := name.split("/")) == 5)

{'a', 'n', 'r', 'v'}

### Create a List of All Unmatched Terms

In [50]:
oov_list = []

for example_idx, example in enumerate(examples, start=1):
    question_context, choices = extract_terms_from_example(example)

    oov = [t for t in itertools.chain(question_context, choices) if normalize_input(t) not in conceptnet.nodes_name2idx]

    oov_list.append({"example": example_idx, "out_of_vocab": ",".join(oov)})




In [51]:
oov_df = pd.DataFrame(oov_list)

In [53]:
oov_df.to_csv("../data/processed/oov_after_article_fix.csv")

In [52]:
oov_df

Unnamed: 0,example,out_of_vocab
0,1,only baggage
1,2,
2,3,"his enemy,his enemy"
3,4,"his house,his parents,alex's parents"
4,5,skylar
5,6,giant pile
6,7,"not lawrence,interested kevin"
7,8,"so much money,$10,000 debt,$,next sentence,his..."
8,9,"l.,st.,l. mark bailey,st. paul,45-acre horse f..."
9,10,


### Create a List of the Most Common Relations

In [5]:
df = pd.read_csv("../data/processed/en_edges.csv")

In [7]:
df.label.value_counts()

/r/RelatedTo                    1703582
/r/FormOf                        378859
/r/DerivedFrom                   325374
/r/HasContext                    232935
/r/IsA                           230137
/r/Synonym                       222156
/r/UsedFor                        39790
/r/EtymologicallyRelatedTo        32075
/r/SimilarTo                      30280
/r/AtLocation                     27797
/r/HasSubevent                    25238
/r/HasPrerequisite                22710
/r/CapableOf                      22677
/r/Antonym                        19066
/r/Causes                         16801
/r/PartOf                         13077
/r/MannerOf                       12715
/r/MotivatedByGoal                 9489
/r/HasProperty                     8433
/r/ReceivesAction                  6037
/r/HasA                            5545
/r/CausesDesire                    4688
/r/dbpedia/genre                   3824
/r/HasFirstSubevent                3347
/r/DistinctFrom                    3315
