<a href="https://colab.research.google.com/github/zehavitc/NLP/blob/master/nlp_hw4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from collections import deque

In [0]:
def conll_to_transitions(sentence):
    '''
    Derive the sequence of parser transitions for one sentence.

    `sentence` is a list of token rows; column 0 of a row is read as the
    token id and column 6 as the id of its head.

    Every returned transition is one training instance made of 4 items:
    - first word in stack (the top)
    - second word in stack (None if the stack holds a single item)
    - first word in buffer (None if the buffer is empty)
    - the transition label (SHIFT, LEFT, RIGHT)

    Returns None when the buffer is exhausted and neither LEFT nor RIGHT
    applies (the sentence cannot be derived with these transitions).
    '''
    stack = [['0', 'ROOT', '_', '_', '_', '_', '0', '_', '_', '_']]
    buffer = deque(sentence)
    transitions = []

    while buffer or len(stack) > 1:
        top = stack[-1]

        # ROOT on top of the stack: the only possible move is SHIFT.
        if top[0] == '0':
            add_shift(stack, buffer, transitions)
            continue

        below = stack[-2]

        # LEFT: the second item hangs off the top item and has no pending
        # dependents left in the buffer.
        if below[6] == top[0] and check_rightest_arc(below, buffer):
            add_left(stack, buffer, transitions)
            continue

        # RIGHT: the top item hangs off the second item. Attaching to ROOT is
        # postponed until the buffer is empty.
        if (top[6] == below[0]
                and (not buffer or below[0] != '0')
                and check_rightest_arc(top, buffer)):
            add_right(stack, buffer, transitions)
            continue

        # Nothing can be reduced and nothing is left to shift: give up.
        if not buffer:
            return None

        add_shift(stack, buffer, transitions)

    return transitions


def check_rightest_arc(word, b):
    '''
    Tell whether `word` has no dependents left in the buffer `b`.

    Column 6 of a row is the id of that row's head, so a buffer row whose
    column 6 equals `word[0]` is an arc from `word` to that row. Returns
    False if such a row exists and True otherwise.
    '''
    return not any(candidate[6] == word[0] for candidate in b)


def add_shift(s, b, transitions):
    '''
    Apply a SHIFT: move the front word of buffer `b` onto stack `s`.

    Before pushing, records [stack top, second stack item or None,
    shifted word, 'SHIFT'] in `transitions`.
    '''
    incoming = b.popleft()
    below_top = s[-2] if len(s) > 1 else None
    transitions.append([s[-1], below_top, incoming, 'SHIFT'])
    s.append(incoming)


def add_left(s, b, transitions):
    '''
    Apply a LEFT: drop the second stack item and keep the top one.

    Records [top, second, front of buffer or None, 'LEFT'] in `transitions`.
    '''
    head = s.pop()
    dependent = s.pop()
    lookahead = b[0] if len(b) > 0 else None
    transitions.append([head, dependent, lookahead, 'LEFT'])
    s.append(head)


def add_right(s, b, transitions):
    '''
    Apply a RIGHT: drop the top stack item and keep the second one.

    Records [top, second, front of buffer or None, 'RIGHT'] in `transitions`.
    '''
    dependent = s.pop()
    head = s.pop()
    lookahead = b[0] if len(b) > 0 else None
    transitions.append([dependent, head, lookahead, 'RIGHT'])
    s.append(head)


In [5]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime; the train/eval paths used
# further down live under this mount point. Requires interactive authorization.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Locations of the training and evaluation files on the mounted Drive.
# Both are read by get_sentences(), one token per line with a blank line
# between sentences.
train_path = "/content/drive/My Drive/NLP/HW4/train"
eval_path = "/content/drive/My Drive/NLP/HW4/eval"

In [0]:
from collections import Counter, defaultdict


# Column positions inside one whitespace-split token line.
word_idx_idx = 0         # token id within the sentence
word_idx = 1             # word form (lower-cased before use)
pos_tag_idx = 3          # POS tag used for the one-hot POS features
ps_tag_specific_idx = 4  # presumably the fine-grained POS tag; not used in this notebook - TODO confirm
head_idx = 6             # id of the token's head ('0' is the artificial ROOT)
dep_label_idx = 7        # presumably the dependency relation label; not used in this notebook - TODO confirm


def get_sentences(file_path):
  """
  Read a token-per-line file and split it into sentences.

  Sentences are separated by blank lines. Every non-blank line is split on
  whitespace and kept as a list of columns.

  Returns a 4-tuple:
  - sentences: list of sentences, each a list of token rows
  - word_encoding: lower-cased word -> integer id, in order of first appearance
  - pos_encoding: POS tag -> integer id, in order of first appearance
  - words_pos: lower-cased word -> POS tag of its first occurrence
  """
  sentences = []
  sentence = []
  word_encoding = {}
  pos_encoding = {}
  words_pos = {}

  with open(file_path) as file:
    for line in file:
      if line.strip() == '':
        # Only close a sentence that actually has tokens, so runs of blank
        # lines do not produce empty sentences.
        if sentence:
          sentences.append(sentence)
          sentence = []
        continue
      # line is part of a sentence - parse the line
      splitted_line = line.split()
      lower_word = splitted_line[word_idx].lower()
      pos_tag = splitted_line[pos_tag_idx]
      # Ids are dense: a new key always gets the current dictionary size.
      word_encoding.setdefault(lower_word, len(word_encoding))
      words_pos.setdefault(lower_word, pos_tag)
      pos_encoding.setdefault(pos_tag, len(pos_encoding))
      sentence.append(splitted_line)

  # The file may end without a trailing blank line; keep the last sentence.
  if sentence:
    sentences.append(sentence)

  return sentences, word_encoding, pos_encoding, words_pos
      
      
  

In [0]:
import numpy as np

def get_one_hot(encoding, key):
    """
    Build a one-hot vector of length len(encoding) for `key`.

    Returns an all-zero vector when `key` is None, is the literal "root", or
    is not present in `encoding` (for example a tag that appears only in the
    evaluation data), instead of raising KeyError.
    """
    res = np.zeros(len(encoding))
    if key is None or key == "root":
        return res
    position = encoding.get(key)
    if position is not None:
        res[position] = 1
    return res


In [10]:
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import gensim.downloader as api

def load(model_name):
  '''
  Download (if needed) and load the named gensim model, print its vocabulary
  size and return the loaded vectors object.
  '''
  wv_from_bin = api.load(model_name)
  # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases use
  # `vocab`. Support both, and take the size directly instead of copying all
  # keys into a list first.
  vocab = getattr(wv_from_bin, "key_to_index", None)
  if vocab is None:
    vocab = wv_from_bin.vocab
  print("Loaded vocab size %i" % len(vocab))
  return wv_from_bin


# Pretrained word vectors used by get_word_embedding(); the captured output
# below reports a 3,000,000-entry vocabulary, so this call is slow and
# memory-hungry.
model = load("word2vec-google-news-300")



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Loaded vocab size 3000000


In [11]:
# Dimensionality of one word vector, probed with an arbitrary word; used to
# size the zero vector returned for ROOT and out-of-vocabulary words.
len_word_vec = len(model.get_vector("is"))
print(len_word_vec)

300


In [0]:
def get_word_embedding(word):
  """
  Return the embedding vector of `word` from the module-level `model`.

  A zero vector of length `len_word_vec` is returned when `word` is None,
  is the literal "root", or is not in the model's vocabulary.
  """
  # `word in model` is the membership test that works both before and after
  # the gensim 4 removal of the `.vocab` attribute.
  if word is None or word == "root" or word not in model:
    return np.zeros(len_word_vec)
  return model.get_vector(word)

def get_word_from_transition(transition, index):
  """
  Return the lower-cased word form stored at `transition[index]`.

  Returns None when that slot is None. If the slot cannot be read (bad index,
  malformed row), the offending transition and index are printed for
  debugging and None is returned.
  """
  try:
    item = transition[index]
    if item is None:
      return None
    return item[word_idx].lower()
  except Exception:
    # Best-effort lookup: report the bad input but keep going. Catching
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
    print(transition)
    print(index)
    return None

def get_word_pos(words_pos, word):
  """
  Look up the POS tag recorded for `word` in `words_pos`.

  Returns None for a missing word (None) and for the literal "root";
  any other word must be a key of `words_pos`.
  """
  has_no_tag = word is None or word == "root"
  return None if has_no_tag else words_pos[word]

def get_vector_rep(transitions, words_pos, word_encoding, pos_encoding):
    """
    Turn transitions into the feature matrix fed to the classifier.

    For each transition, the three slots (stack top, second stack item, front
    of buffer) each contribute a word embedding followed by a one-hot POS
    vector; the six pieces are concatenated in that order into one row.
    `word_encoding` is accepted but not read here.

    Returns a numpy array with one row per transition.
    """
    rows = []
    for transition in transitions:
      pieces = []
      # Slots 0, 1, 2 = stack top, second stack item, front of buffer.
      for slot in range(3):
        token = get_word_from_transition(transition, slot)
        pieces.append(get_word_embedding(token))
        pieces.append(get_one_hot(pos_encoding, get_word_pos(words_pos, token)))
      rows.append(np.concatenate(pieces))
    return np.array(rows)
                        
     

In [0]:
from sklearn.svm import SVC

# Integer class ids for the three parser actions; parser() compares the
# classifier's prediction against these same numbers.
labels_dict = {'SHIFT': 1, 'LEFT': 2, 'RIGHT': 3}

def train(train_file):
  """
  Train the transition classifier on the sentences in `train_file`.

  Each sentence is converted to its transition sequence; sentences for which
  conll_to_transitions returns None are skipped. The transitions are turned
  into feature vectors and an SVC is fitted on them, with labels encoded via
  `labels_dict`.

  Returns the fitted classifier.
  """
  sentences, word_encoding, pos_encoding, words_pos = get_sentences(train_file)
  print(len(sentences))
  labels_vec = []
  transitions = []
  for sentence in sentences:
    res = conll_to_transitions(sentence)
    if res is None:
      continue
    # extend() grows the lists in place; `lst = lst + res` re-copied the
    # whole list for every sentence.
    transitions.extend(res)
    labels_vec.extend(labels_dict[transition[3]] for transition in res)
  train_vec = get_vector_rep(transitions, words_pos, word_encoding, pos_encoding)
  clf = SVC(gamma='auto')
  print("we're about to train this classifier!!!")
  clf.fit(train_vec, np.array(labels_vec))
  return clf

In [24]:
# Fit the action classifier on the training file; the first line of output is
# the number of sentences read.
clf = train(train_path)

982
we're about to train this classifier!!!


In [0]:
def get_real_arcs(sentence):
  """
  Collect the arcs annotated in `sentence` as [head id, token id] pairs,
  one per token row, in sentence order.
  """
  return [[row[head_idx], row[word_idx_idx]] for row in sentence]
    
    
def parser(sentence, clf, pos_encoding, words_pos2):
    """
    Parse `sentence` with the trained classifier and score the result.

    Starting from a stack holding only ROOT and a buffer holding the whole
    sentence, the classifier is asked for the next action (1=SHIFT, 2=LEFT,
    3=RIGHT) until the buffer is empty and only ROOT is left on the stack.
    Features are built the same way as in training: stack top, second stack
    item and the FRONT of the buffer (None once the buffer is empty).

    A predicted action that cannot be applied is replaced by a legal one:
    SHIFT with an empty buffer, or LEFT when the second stack item is ROOT,
    falls back to SHIFT if the buffer still has words and to RIGHT otherwise.

    Returns (res_arcs, num_true, num_false): the predicted arcs as
    [head id, dependent id] pairs and how many gold arcs were / were not
    among them.

    Raises ValueError if the classifier returns an action other than 1, 2, 3.
    """
    s = [['0', 'ROOT', '_', '_', '_', '_', '0', '_', '_', '_']]  # stack
    b = deque(sentence)  # buffer
    real_arcs = get_real_arcs(sentence)
    res_arcs = []
    transitions = []  # only needed because the add_* helpers log into it

    while len(b) > 0 or len(s) > 1:
        if s[-1][0] == '0':   # the root
            if len(b) == 0:
                # Nothing left to shift onto ROOT; stop instead of crashing.
                break
            add_shift(s, b, transitions)
            continue

        first_word_q = s[-1]
        second_word_q = s[-2]
        # Training used the front of the buffer as the look-ahead word.
        first_word_buffer = b[0] if len(b) > 0 else None
        t = [[first_word_q, second_word_q, first_word_buffer]]
        # get_vector_rep does not read its word_encoding argument, so no
        # module-level state is needed here.
        input_vec = get_vector_rep(t, words_pos2, None, pos_encoding)
        action = int(clf.predict(input_vec)[0])
        if action not in (1, 2, 3):
            raise ValueError(f'Unexpected action predicted by classifier: {action}')

        can_shift = len(b) > 0
        can_left = second_word_q[0] != '0'   # LEFT would remove ROOT itself
        if (action == 1 and not can_shift) or (action == 2 and not can_left):
            action = 1 if can_shift else 3

        if action == 1:  # shift
            add_shift(s, b, transitions)
        elif action == 2:  # left: top becomes head of the second item
            add_left(s, b, transitions)
            res_arcs.append([first_word_q[word_idx_idx], second_word_q[word_idx_idx]])
        else:  # right: second item becomes head of the top
            add_right(s, b, transitions)
            res_arcs.append([second_word_q[word_idx_idx], first_word_q[word_idx_idx]])

    if len(res_arcs) == 0:
        print("No arcs")

    # compare arcs
    num_true = 0
    num_false = 0
    for arc in real_arcs:
        if arc in res_arcs:
            num_true += 1
        else:
            num_false += 1

    return res_arcs, num_true, num_false


def evaluate(eval_path, clf, pos_encoding):
  """
  Parse every sentence in `eval_path` with `clf` and print the percentage of
  gold arcs that the parser recovered.

  `pos_encoding` is the POS encoding the classifier was trained with; the
  word -> POS lookup is rebuilt from the evaluation file itself. When there
  are no gold arcs to score, a message is printed instead of dividing by zero.
  """
  sentences, _, _, words_pos2 = get_sentences(eval_path)

  num_true = 0
  num_false = 0

  for sentence in sentences:
    _, num_true_s, num_false_s = parser(sentence, clf, pos_encoding, words_pos2)
    num_true += num_true_s
    num_false += num_false_s

  total = num_true + num_false
  if total == 0:
    print('Accuracy is undefined: no arcs were found to evaluate')
    return

  print (f'Accuracy is: {num_true * 100/total}')

In [0]:
# Re-read the training file to recover the POS encoding the classifier was
# trained with (train() does not return it). parser() also reads the
# module-level `word_encoding` defined here, so this line must run before
# evaluate().
sentences, word_encoding, pos_encoding, words_pos = get_sentences(train_path)
print("starting evaluation")
evaluate(eval_path, clf, pos_encoding)