# Playing with CSKG grounding

# Setup for grounding

```
conda create -n mowgli python=3.6 
conda activate mowgli

git clone https://github.com/ucinlp/mowgli-uci

mv mowgli-uci grounding

cd grounding

pip install -r requirements.txt
conda install --yes faiss-cpu -c pytorch -n mowgli
python -m spacy download en_core_web_lg

cd ..
```

In [None]:
# IMPORTS
import pygraphviz as pgv
from IPython.display import Image

## I. Parsing questions and answers

In [None]:
import sys
# Add the parent directory to the module search path; presumably that is where
# the `groundcn` package lives relative to this notebook — TODO confirm.
sys.path.append('..')
from groundcn.graphify import graphify

In [None]:
def draw(G):
    """Render graph ``G`` as PNG with the ``dot`` layout program.

    The rendered output of ``G.draw`` is wrapped in an IPython ``Image`` so the
    notebook shows it inline when it is the last expression of a cell.
    """
    rendered = G.draw(format='png', prog='dot')
    return Image(rendered)

In [None]:
# Sentences to parse and ground. The two disabled entries differ only in
# "lotion" vs. "towel"; uncomment them to run the pipeline on that pair as well.
sentences = [
    'Max looked for the onions so that he could make a stew.',
    # 'To get the bathroom counters dry after washing your face, take a small hand lotion and wipe away the extra water around the sink.',
    # 'To get the bathroom counters dry after washing your face, take a small hand towel and wipe away the extra water around the sink.',
]

In [None]:
# Parse all sentences. Each entry of the result is read further down as a dict
# with 'nodes' (node id -> data holding a 'phrase' token sequence) and 'edges'
# (edge id -> data holding 'head_node_id', 'tail_node_id' and 'edge_name').
parse_trees=graphify.graphify_dataset(sentences)

In [None]:
# Display the raw parse structures for inspection.
parse_trees

### Nicer visualization

In [None]:
# Build one pygraphviz graph per parsed sentence and collect every edge as a
# (head_node_id, edge_name, tail_node_id) triple in `rels` for the linking cells.
rels=[]
G=None  # graph of the most recent sentence; stays None when parse_trees is empty
for sent_data in parse_trees:
    G=pgv.AGraph(strict=False, directed=True)

    # Display label of every parse node: its phrase tokens joined by spaces.
    nodes={n_id: ' '.join(n_data['phrase'])
           for n_id, n_data in sent_data['nodes'].items()}

    for e_data in sent_data['edges'].values():
        head_id=e_data['head_node_id']
        tail_id=e_data['tail_node_id']
        rel=e_data['edge_name']
        rels.append((head_id, rel, tail_id))

        # Key vertices by node id and show the phrase as the label. Keying by
        # the phrase text itself would merge distinct parse nodes that happen
        # to share the same phrase into a single vertex.
        G.add_node(head_id, label=nodes[head_id])
        G.add_node(tail_id, label=nodes[tail_id])
        G.add_edge(head_id, tail_id, label=rel)

In [None]:
# Render the parse graph. `G` is rebound on every iteration of the loop that
# builds it, so this shows the graph of the last sentence only.
draw(G)

## II. Grounding questions and answers to ConceptNet

In [None]:
from groundcn.graphify import link

**Note:** The English-only Numberbatch file should be downloaded from [here](https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz), placed into the directory `../output/embeddings`, and gunzip-ed, so that it ends up as `../output/embeddings/numberbatch-en-19.08.txt`.

In [None]:
# Path of the gunzip-ed Numberbatch embeddings passed to the ConceptNet linker.
numberbatch_file = '../output/embeddings/numberbatch-en-19.08.txt'

In [None]:
# Link the parsed sentences using the Numberbatch embeddings. Each returned
# entry is read below through its 'sentence' and 'nodes' keys, where every node
# carries a 'phrase' and a list of 'candidates'.
linked_data=link.link(parse_trees, embedding_file=numberbatch_file)

In [None]:
# Inspect the (head_node_id, edge_name, tail_node_id) triples collected while
# building the parse graph.
rels

In [None]:
# Ground every sentence: print each node's candidates, keep one candidate URI
# per node in `links`, and rebuild the parse edges over the linked URIs.
links={}
linkedG=None  # graph of the most recent sentence; stays None when linked_data is empty
for sent_data in linked_data:
    print('Sentence:', sent_data['sentence'])
    linkedG=pgv.AGraph(strict=False, directed=True)
    for n_id, n_data in sent_data['nodes'].items():
        print('Node phrase:', n_data['phrase'])
        candidates=n_data['candidates']
        for c in reversed(candidates):
            print(c)

        if candidates:
            # The last list element is the candidate printed first above.
            # NOTE(review): assumes candidates are ordered best-last — confirm.
            links[n_id]=candidates[-1]['uri']
        else:
            # An empty candidate list used to raise IndexError; leave the node
            # unlinked and drop any stale link stored under the same id.
            links.pop(n_id, None)
            print('No candidates; node left unlinked.')
        print()

    # NOTE(review): `rels` holds the edges of every parsed sentence, so with
    # more than one sentence edges of other sentences are considered here too —
    # TODO confirm whether node ids are unique across sentences.
    for head_id, rel, tail_id in rels:
        # Skip edges with an unlinked endpoint instead of raising KeyError.
        if head_id in links and tail_id in links:
            linkedG.add_edge(links[head_id], links[tail_id], label=rel)
    print()

In [None]:
# Render the linked graph. `linkedG` is rebound for every sentence in the loop
# that builds it, so this shows the graph of the last sentence only.
draw(linkedG)

## III. Grounding to CSKG

In [None]:
# NOTE: this rebinds the name `link`, replacing the `groundcn.graphify.link`
# module imported in section II. Re-run that import before re-running any
# section II cell, otherwise those cells silently use the CSKG linker.
from groundcskg.graphify import link

**Note:** The BERT embeddings file should be downloaded from [here](https://drive.google.com/file/d/1o2mSa_71X6hXZETPSn6-dotltKoI2QF_/view?usp=sharing), placed into the directory `../output/embeddings`, and gunzip-ed.

In [None]:
# Path of the BERT embeddings file passed to the CSKG linker.
# Disabled alternative input:
# graph_emb_file='../output/embeddings/graph_embedding.tsv'
bert_file = '../output/embeddings/bert_nli_large_w2v_format.txt'

In [None]:
# Reload the `link` module that is currently bound. Presumably a development
# aid so edits to the module's source are picked up without restarting the
# kernel; on a fresh run it simply re-executes the module just imported.
import importlib
importlib.reload(link)

In [None]:
# Link the same parse trees again, this time with the BERT embeddings file.
# This overwrites `linked_data`, so the ConceptNet result from section II is no
# longer available afterwards.
linked_data=link.link(parse_trees, embedding_file=bert_file)

In [None]:
# Ground every sentence: print each node's candidates, keep one candidate URI
# per node in `links`, and rebuild the parse edges over the linked URIs.
links={}
linkedG=None  # graph of the most recent sentence; stays None when linked_data is empty
for sent_data in linked_data:
    print('Sentence:', sent_data['sentence'])
    linkedG=pgv.AGraph(strict=False, directed=True)
    for n_id, n_data in sent_data['nodes'].items():
        print('Node phrase:', n_data['phrase'])
        candidates=n_data['candidates']
        for c in reversed(candidates):
            print(c)

        if candidates:
            # The last list element is the candidate printed first above.
            # NOTE(review): assumes candidates are ordered best-last — confirm.
            links[n_id]=candidates[-1]['uri']
        else:
            # An empty candidate list used to raise IndexError; leave the node
            # unlinked and drop any stale link stored under the same id.
            links.pop(n_id, None)
            print('No candidates; node left unlinked.')
        print()

    # NOTE(review): `rels` holds the edges of every parsed sentence, so with
    # more than one sentence edges of other sentences are considered here too —
    # TODO confirm whether node ids are unique across sentences.
    for head_id, rel, tail_id in rels:
        # Skip edges with an unlinked endpoint instead of raising KeyError.
        if head_id in links and tail_id in links:
            linkedG.add_edge(links[head_id], links[tail_id], label=rel)
    print()

In [None]:
# Render the linked graph. `linkedG` is rebound for every sentence in the loop
# that builds it, so this shows the graph of the last sentence only.
draw(linkedG)