<a href="https://colab.research.google.com/github/vrublevskiyvitaliy/paraphrase_identification/blob/master/Dependancy_graph_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# All imports and installs should be here
import os
import numpy as np
from nltk import wordpunct_tokenize
import operator
import re, string
import math
import spacy
from nltk import Tree


In [0]:
# All constants

# Labels for the two environments this notebook distinguishes; one of them is
# returned by get_running_env().
COLAB_ENV = "colab"
LOCAL_ENV = "local"

# Marker tokens placed before/after a sentence by add_start_end_sentence_tokens().
SENTENCE_START_TOKEN = "sentence_start"
SENTENCE_END_TOKEN = "sentence_end"
# NOTE(review): presumably the placeholder for out-of-vocabulary words; it is
# not referenced in the visible part of the notebook - confirm before relying on it.
UNKNOWN_TOKEN = "unknown_token"


In [0]:
# Detect the environment so this notebook can run both on Google Colab and locally

def get_running_env():
  """Report which environment the notebook is executing in.

  Returns COLAB_ENV when the current working directory is exactly "/content",
  and LOCAL_ENV for any other working directory.
  """
  in_colab = os.getcwd() == "/content"
  return COLAB_ENV if in_colab else LOCAL_ENV

# Resolve the environment once, when this cell runs; later cells
# (download_corpus, get_data_location) branch on RUNNING_ENV.
RUNNING_ENV = get_running_env()

In [0]:
# Supress output of the cell
%%capture
def download_corpus():
    """
      Downloading corpus files for colab research.
    """ 
    if RUNNING_ENV == LOCAL_ENV:
      return
    files = [
      'vrublevskiyvitaliy/paraphrase_identification/contents/dataset/msr-paraphrase-corpus/msr_paraphrase_train.txt',
      'vrublevskiyvitaliy/paraphrase_identification/contents/dataset/msr-paraphrase-corpus/msr_paraphrase_test.txt',
    ]
    for f in files:
       !curl --remote-name \
          -H 'Accept: application/vnd.github.v3.raw' \
          --location https://api.github.com/repos/{f}

download_corpus()

In [0]:
def get_data_location():
  """Return the directory prefix under which the corpus files are found.

  On Colab (RUNNING_ENV == COLAB_ENV) the prefix is the empty string, i.e. the
  files are looked up in the current working directory; in every other case
  the prefix is the relative "./dataset/msr-paraphrase-corpus/" directory.
  """
  if RUNNING_ENV == COLAB_ENV:
    return ""
  return "./dataset/msr-paraphrase-corpus/"

def add_start_end_sentence_tokens(s):
  """Wrap `s` in the sentence boundary markers.

  Returns SENTENCE_START_TOKEN, the string form of `s` and SENTENCE_END_TOKEN
  joined by single spaces.
  """
  pieces = (SENTENCE_START_TOKEN, str(s), SENTENCE_END_TOKEN)
  return " ".join(pieces)

def _read_msrp_file(path, preprocess):
    """Read one MSRP tab-separated file into three parallel lists.

    The first line is treated as a header and skipped, and blank lines are
    ignored. Every other line is split on tabs: field 0 is parsed as the
    integer label, fields 3 and 4 are the two sentences, each passed through
    `preprocess`.

    Returns:
        (first_sentences, second_sentences, labels)
    """
    first_sentences, second_sentences, labels = [], [], []
    with open(path, 'r', encoding='utf8') as f:
        f.readline()  # skipping the header of the file
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. at the end of the file) holds no record;
                # indexing its fields would raise IndexError.
                continue
            text = stripped.split('\t')
            first_sentences.append(preprocess(text[3]))
            second_sentences.append(preprocess(text[4]))
            labels.append(int(text[0]))
    return first_sentences, second_sentences, labels


def load_data(_preprocess_sentence=None, _train=False, _test=False):
    """Load the MSRP dataset.

    Args:
        _preprocess_sentence: optional callable applied to every sentence
            string; when None the sentences are returned unchanged.
        _train: read 'msr_paraphrase_train.txt' from get_data_location().
        _test: read 'msr_paraphrase_test.txt' from get_data_location().

    Returns:
        * both flags set:
          ([sent1_train, sent2_train], [sent1_test, sent2_test],
           [label_train, label_test])
        * only `_train`: ([sent1_train, sent2_train], label_train)
        * only `_test`:  ([sent1_test, sent2_test], label_test)
        * neither flag:  None (no file is opened)
    """
    loc = get_data_location()

    if _preprocess_sentence is None:
        _preprocess_sentence = lambda x: x

    # Only the requested split(s) are opened, so a missing file for the other
    # split is not an error.
    if _train:
        sent1_train, sent2_train, label_train = _read_msrp_file(
            loc + 'msr_paraphrase_train.txt', _preprocess_sentence)
    if _test:
        sent1_test, sent2_test, label_test = _read_msrp_file(
            loc + 'msr_paraphrase_test.txt', _preprocess_sentence)

    if _train and _test:
        return [sent1_train, sent2_train], [sent1_test, sent2_test], [label_train, label_test]
    if _train:
        return [sent1_train, sent2_train], label_train
    if _test:
        return [sent1_test, sent2_test], label_test
    return None

In [0]:
# Load only the training split; `data` is ([first_sentences, second_sentences], labels).
# The preprocessing hook and `_test` are left at their defaults (None / False).
data = load_data(_train=True)

In [0]:
def get_sample_data():
  """Return the first training example as (sentence1, sentence2, label).

  The training split is loaded through load_data without preprocessing and
  the first entry of each of the three parallel lists is returned.
  """
  (first_sentences, second_sentences), labels = load_data(
    _preprocess_sentence=None, _train=True, _test=False)
  return first_sentences[0], second_sentences[0], labels[0]

In [0]:
# Keep one (sentence1, sentence2, label) example around for the cells below;
# the bare name on the last line makes the notebook display it as the cell output.
sample = get_sample_data()
sample

('Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.',
 'Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.',
 1)

In [69]:


# Load the English spaCy pipeline once; get_dependancy_graph() reuses this
# object on every call.
# NOTE(review): the 'en' shortcut name is only available in spaCy 2.x - newer
# releases need the full package name (e.g. 'en_core_web_sm'); confirm against
# the installed spaCy version.
en_nlp = spacy.load('en')

def get_dependancy_graph(s, display=False):
  """Parse `s` with en_nlp and describe its dependency tree as nodes and edges.

  Args:
    s: the sentence text to parse.
    display: when true, the parse is also rendered inline with displacy.

  Returns:
    (nodes, edges) where `nodes` is "ROOT" followed by the text of every
    token in order, and `edges` holds one dict per token with the keys
    "start" (head text, or "ROOT" for the root token), "end" (token text)
    and "type" (dependency label).

  Note: nodes and edge endpoints are token texts, so a word or punctuation
  mark that occurs more than once in the sentence appears under the same label.
  """
  parsed = en_nlp(s)
  if display:
    spacy.displacy.render(parsed, style="dep", jupyter=True)

  nodes = ["ROOT"] + [tok.text for tok in parsed]
  edges = [
    {
      # The root token is attached to the artificial "ROOT" node.
      "start": "ROOT" if tok.dep_ == "ROOT" else tok.head.text,
      "end": tok.text,
      "type": tok.dep_,
    }
    for tok in parsed
  ]
  return nodes, edges

# Build the dependency graph of the first sample sentence, rendering the parse
# inline, then print the resulting (nodes, edges) tuple.
graph1 = get_dependancy_graph(sample[0], display=True)
print(graph1)



(['ROOT', 'Amrozi', 'accused', 'his', 'brother', ',', 'whom', 'he', 'called', '"', 'the', 'witness', '"', ',', 'of', 'deliberately', 'distorting', 'his', 'evidence', '.'], [{'start': 'accused', 'end': 'Amrozi', 'type': 'nsubj'}, {'start': 'ROOT', 'end': 'accused', 'type': 'ROOT'}, {'start': 'brother', 'end': 'his', 'type': 'poss'}, {'start': 'accused', 'end': 'brother', 'type': 'dobj'}, {'start': 'brother', 'end': ',', 'type': 'punct'}, {'start': 'called', 'end': 'whom', 'type': 'dobj'}, {'start': 'called', 'end': 'he', 'type': 'nsubj'}, {'start': 'brother', 'end': 'called', 'type': 'relcl'}, {'start': 'witness', 'end': '"', 'type': 'punct'}, {'start': 'witness', 'end': 'the', 'type': 'det'}, {'start': 'called', 'end': 'witness', 'type': 'oprd'}, {'start': 'witness', 'end': '"', 'type': 'punct'}, {'start': 'accused', 'end': ',', 'type': 'punct'}, {'start': 'accused', 'end': 'of', 'type': 'prep'}, {'start': 'distorting', 'end': 'deliberately', 'type': 'advmod'}, {'start': 'of', 'end': '