# Generate Knowledge Graph With Trained Model

## Retrieve Triplets
Load trained model

In [None]:
import os
import json

import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm

In [None]:
# Show the installed PyTorch version as the cell output (useful when
# checking that the saved checkpoints are compatible with this environment).
torch.__version__

In [None]:
def load_model(model_dir, epoch, device):
    """Load the two checkpoints saved for ``epoch`` and wrap them in DataParallel.

    Reads ``s_<epoch>.pkl`` and ``po_<epoch>.pkl`` from ``model_dir``, maps
    their tensors onto ``device`` and re-wraps each model in a fresh
    ``nn.DataParallel`` (helpful when the number of GPUs differs from the
    machine the checkpoint was saved on).

    Args:
        model_dir: Directory containing the checkpoint files.
        epoch: Epoch number embedded in the checkpoint file names.
        device: Value passed to ``torch.load`` as ``map_location``.

    Returns:
        Tuple ``(s_m, po_m)`` of ``nn.DataParallel``-wrapped models.

    Note:
        The checkpoints are whole pickled modules, so loading one executes
        arbitrary pickle code -- only load files from a trusted source.
    """
    import inspect

    load_kwargs = {'map_location': device}
    # PyTorch 2.6+ defaults ``weights_only`` to True, which refuses to unpickle
    # a full module; ask for the legacy behaviour when the keyword exists
    # (older releases do not accept it at all).
    if 'weights_only' in inspect.signature(torch.load).parameters:
        load_kwargs['weights_only'] = False

    def _load(prefix):
        path = os.path.join(model_dir, "{}_{}.pkl".format(prefix, epoch))
        model = torch.load(path, **load_kwargs)
        # Unwrap a saved DataParallel before re-wrapping; a checkpoint saved
        # without the wrapper has no ``.module`` and is used as is.
        return nn.DataParallel(getattr(model, 'module', model))

    return _load('s'), _load('po')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Checkpoint to use for triplet extraction.
model_epoch = 210
model_dir = 'models_real'
s_m, po_m = load_model(model_dir, model_epoch, device)

# This notebook only runs predictions: switch both models to evaluation mode
# so that train-time-only layers (dropout / batch norm, if the models contain
# any) behave deterministically during extraction.
for _model in (s_m, po_m):
    _model.eval()

Extract triplets from the dev data (currently one text at a time).

In [None]:
# This is a bad sequential implementation
def extract_items(text_in, s_m, po_m):
    """Extract (subject, predicate, object) triplets from one piece of text.

    The subject model ``s_m`` scores every position as a possible subject
    start / end. Each start scoring above 0.5 is paired with the nearest end
    (at or after it) scoring above 0.5. For every such subject span, ``po_m``
    is queried and its per-position argmax labels are decoded the same way:
    a start label > 0 is paired with the nearest end carrying the same label,
    and that label is looked up in ``id2predicate``.

    Relies on the notebook globals ``char2id``, ``id2predicate`` and ``device``.

    Args:
        text_in: Input text; characters missing from ``char2id`` map to id 1.
        s_m: Subject model, called with a ``(1, len(text_in))`` LongTensor.
        po_m: Predicate/object model, called with the two encoder outputs of
            ``s_m`` plus the subject start and end indices.

    Returns:
        De-duplicated list of ``(subject, predicate, object)`` tuples in
        unspecified order; an empty list for empty input.
    """
    if not text_in:
        # Nothing to encode; avoid feeding a zero-length sequence to the model.
        return []

    triplets = set()
    char_ids = [char2id.get(c, 1) for c in text_in]
    token_ids = torch.LongTensor(np.array([char_ids])).to(device)

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        sub_starts, sub_ends, t, t_max, _mask = s_m(token_ids)
        # Bring the per-position scores to host memory once instead of
        # comparing device tensors element by element.
        sub_starts = sub_starts[0, :, 0].detach().cpu().numpy()
        sub_ends = sub_ends[0, :, 0].detach().cpu().numpy()

        for s_start, start_score in enumerate(sub_starts):
            if not start_score > 0.5:
                continue

            # Nearest end position (at or after the start) above the threshold.
            s_end = None
            for offset, end_score in enumerate(sub_ends[s_start:]):
                if end_score > 0.5:
                    s_end = s_start + offset
                    break
            if s_end is None:
                continue
            subject = text_in[s_start:s_end + 1]
            if not subject:
                continue

            # The index tensors get their own names so that the score vectors
            # being scanned above are never overwritten; reusing those names
            # would make every subject after the first one undetectable.
            k1 = torch.LongTensor([[s_start]]).to(device)
            k2 = torch.LongTensor([[s_end]]).to(device)
            o1, o2 = po_m(t.to(device), t_max.to(device), k1, k2)
            obj_starts = np.argmax(o1.detach().cpu().numpy()[0], 1)
            obj_ends = np.argmax(o2.detach().cpu().numpy()[0], 1)

            for o_start, start_label in enumerate(obj_starts):
                if start_label > 0:
                    for offset, end_label in enumerate(obj_ends[o_start:]):
                        if end_label == start_label:
                            obj = text_in[o_start:o_start + offset + 1]
                            triplets.add((subject, id2predicate[start_label], obj))
                            break
    return list(triplets)

Load dev data, and corresponding schemas

In [None]:
# Locations of the generated dev set, character vocabulary and schema files.
dev_path = 'generated/dev_data_me.json'
generated_char_path = 'generated/all_chars_me.json'
generated_schema_path =  'generated/schemas_me.json'

# Open with an explicit UTF-8 encoding (the JSON standard encoding) so the
# result does not depend on the platform locale, and use ``with`` so every
# file handle is closed again.
with open(dev_path, encoding='utf-8') as f:
    dev_data = json.load(f)
with open(generated_char_path, encoding='utf-8') as f:
    id2char, char2id = json.load(f)
with open(generated_schema_path, encoding='utf-8') as f:
    id2predicate, predicate2id = json.load(f)

# JSON object keys are always strings; restore the integer predicate ids.
id2predicate = {int(i): j for i, j in id2predicate.items()}

Write the triplets to a `pandas` DataFrame first, then save it as a CSV file.

In [None]:
import pandas as pd
import csv

In [None]:
# Collect every extracted triplet first and build the frame once: appending
# with ``df.loc[len(df)] = ...`` re-allocates the frame on every row.
triplet_rows = []
# Passing the list itself (not ``iter(...)``) lets tqdm show the total.
for d in tqdm(dev_data):
    triplet_rows.extend(extract_items(d['text'], s_m, po_m))

df = pd.DataFrame(triplet_rows, columns=['subject', 'predicate', 'object'])

print("num of extracted relations from dev set is:", len(df))

In [None]:
# Persist the extracted triplets as a CSV without index or header row
# (column order: subject, predicate, object).
df.to_csv('generated/triplets.csv', index=False, header=False)

Create knowledge graph with saved triplets

In [None]:
# Number of distinct predicates that occur among the extracted triplets.
len(set(df['predicate']))

### Create relation dictionary

In [None]:
# Map every predicate to the entity types of its subject and object, read
# from the schema file (one JSON object per line).
rel_dict = {}
schema_path = 'data/schema.json'
# Explicit UTF-8 so that parsing does not depend on the platform locale.
with open(schema_path, encoding='utf-8') as f:
    for line in tqdm(f):
        if not line.strip():
            # Tolerate blank lines, e.g. a trailing newline at the end of file.
            continue
        rel = json.loads(line)
        predicate = rel['predicate']
        sub_type = rel['subject_type']
        # The object type is nested: only its '@value' entry is used here.
        obj_type = rel['object_type']['@value']
        rel_dict[predicate] = {'subject_type': sub_type, 'object_type': obj_type}

In [None]:
# Alias (not a copy) of the triplet frame; the graph below is built from it.
rel_df = df

IDs are currently constructed in a very simple way:
```python
node_id = 'node_' + node_type + '_' + node_name
edge_id = 'edge_' + predicate + '_' + from_node_id + '_' + to_node_id
```

In [None]:
# Column layout of the two output tables.
node_columns = ['~id', '~label', 'name']
edge_columns = ['~id', '~from', '~to', '~label']

# currently id is constructed naively.
def node_name2id(entity_type, entity_name):
    """Return the node id ``'node_<entity_type>_<entity_name>'``."""
    return 'node_' + entity_type + '_' + entity_name

# node id -> [label, name]; keyed by id so that an entity appearing in
# several triplets produces a single node row.
node_dict = {}
# One [~id, ~from, ~to, ~label] record per triplet row.
edge_rows = []

for sub, rel, obj in zip(rel_df['subject'], rel_df['predicate'], rel_df['object']):
    sub_type = rel_dict[rel]['subject_type']
    obj_type = rel_dict[rel]['object_type']
    sub_id = node_name2id(sub_type, sub)
    obj_id = node_name2id(obj_type, obj)
    node_dict[sub_id] = [sub_type, sub]
    node_dict[obj_id] = [obj_type, obj]
    edge_id = 'edge_' + rel + '_' + sub_id + '_' + obj_id
    edge_rows.append([edge_id, sub_id, obj_id, rel])

# Build each frame in one go instead of growing it row by row.
node_df = pd.DataFrame(
    [[node_id, label, name] for node_id, (label, name) in node_dict.items()],
    columns=node_columns,
)
edge_df = pd.DataFrame(edge_rows, columns=edge_columns)

print("We have scanned {} nodes and {} relations".format(len(node_df), len(edge_df)))

Save the nodes and relations to CSV files and upload them to Amazon S3.

In [None]:
# Write the node and edge tables as CSV files, keeping the header row
# (the `~id` / `~label` / ... column names) and dropping the index column.
node_df.to_csv('generated/nodes.csv', index=False)
edge_df.to_csv('generated/edges.csv', index=False)

In [None]:
%%bash

export S3_SAVE_BUCKET="sm-nlp-data"
export SAVE_PATH="ie-baseline/outputs"

# Copy both graph files to the same S3 prefix, edges first.
for csv_name in edges.csv nodes.csv; do
    aws s3 cp "./generated/${csv_name}" "s3://${S3_SAVE_BUCKET}/${SAVE_PATH}/${csv_name}"
done

echo "The path for the Property Graph bulk loading step is 's3://$S3_SAVE_BUCKET/$SAVE_PATH/'"

## Load Graph Data into Neptune
- Neptune endpoint & port: database-1-instance-1.c2ycbhkszo5s.us-east-1.neptune.amazonaws.com:8182 [info](https://console.aws.amazon.com/neptune/home?region=us-east-1#database:id=database-1-instance-1;is-cluster=false;tab=connectivity)
- Source:
    - s3://sm-nlp-data/ie-baseline/outputs/nodes.csv
    - s3://sm-nlp-data/ie-baseline/outputs/edges.csv
- IAM role ARN: arn:aws:iam::093729152554:role/service-role/AWSNeptuneNotebookRole-NepTestRole [link](https://console.aws.amazon.com/iam/home?region=us-east-1#/roles/AWSNeptuneNotebookRole-NepTestRole)



In [None]:
%%bash

# POST a bulk-load request to the Neptune /loader endpoint so that the CSV
# files under the S3 prefix given in "source" are ingested.
# NOTE(review): the iamRoleArn below (AWSServiceRoleForRDS) is not the role
# listed in this notebook's "Load Graph Data into Neptune" notes
# (AWSNeptuneNotebookRole-NepTestRole) -- confirm which role is attached to
# the cluster and has read access to the bucket.
# NOTE(review): endpoint, bucket and account id are hard-coded here.
curl -X POST \
    -H 'Content-Type: application/json' \
    https://database-1-instance-1.c2ycbhkszo5s.us-east-1.neptune.amazonaws.com:8182/loader -d '
    {
      "source" : "s3://sm-nlp-data/ie-baseline/outputs/",
      "format" : "csv",
      "iamRoleArn" : "arn:aws:iam::093729152554:role/aws-service-role/rds.amazonaws.com/AWSServiceRoleForRDS",
      "region" : "us-east-1",
      "failOnError" : "FALSE",
      "parallelism" : "MEDIUM",
      "updateSingleCardinalityProperties" : "FALSE",
      "queueRequest" : "TRUE",
      "dependencies" : []
    }'

In [None]:
# Shell out to the AWS CLI to display the configuration it resolves in this
# environment (handy when the S3 upload or the loader request is rejected).
!aws configure list

In [None]:
# graph-notebook line magic; presumably reports the status of the configured
# Neptune endpoint -- TODO confirm against the installed graph-notebook version.
%status

In [None]:
%%gremlin
g.V().limit(10)

In [None]:
# graph-notebook line magic; presumably prints the connection settings
# (host, port, ...) this notebook uses -- TODO confirm.
%graph_notebook_config