## Connect to Drive and import dependencies

In [1]:
# Mount Google Drive under /content/drive so the data files referenced later in
# the notebook are reachable; force_remount=True is passed so the mount is redone
# on every run of this cell.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
pip install swifter

Collecting swifter
  Downloading swifter-1.1.2.tar.gz (633 kB)
[?25l[K     |▌                               | 10 kB 19.1 MB/s eta 0:00:01[K     |█                               | 20 kB 24.0 MB/s eta 0:00:01[K     |█▌                              | 30 kB 11.6 MB/s eta 0:00:01[K     |██                              | 40 kB 10.3 MB/s eta 0:00:01[K     |██▋                             | 51 kB 5.5 MB/s eta 0:00:01[K     |███                             | 61 kB 5.6 MB/s eta 0:00:01[K     |███▋                            | 71 kB 5.8 MB/s eta 0:00:01[K     |████▏                           | 81 kB 6.5 MB/s eta 0:00:01[K     |████▋                           | 92 kB 6.8 MB/s eta 0:00:01[K     |█████▏                          | 102 kB 5.2 MB/s eta 0:00:01[K     |█████▊                          | 112 kB 5.2 MB/s eta 0:00:01[K     |██████▏                         | 122 kB 5.2 MB/s eta 0:00:01[K     |██████▊                         | 133 kB 5.2 MB/s eta 0:00:01[K     |███

In [3]:
import pandas as pd
import numpy as np
import swifter
import re
from sklearn.model_selection import KFold
import statistics
import warnings
import seaborn as sns
from tqdm import tqdm
warnings.filterwarnings("ignore")

pd.options.mode.chained_assignment = None

## Read the data

In [4]:
# Locations of the train/test dependency files on the mounted Drive.
# NOTE(review): absolute Colab paths; they only resolve after the Drive mount cell has run.
pathTrain = "/content/drive/MyDrive/NLP Study Group/Problem 4/train.dep.txt"
pathTest = "/content/drive/MyDrive/NLP Study Group/Problem 4/test.dep.txt"

In [5]:
# Column names passed to pd.read_csv for both files. Only 'word', 'pos', 'parent'
# and 'type' are kept after loading; the 'index' column seen in the loaded frames
# is created by reset_index(), not by the 'index' entry of this list.
columns = ['index','word', 'pos', 'pos1', 'unused', 'parent', 'type']

In [6]:
# Read the training file, keep the four columns used downstream, and move the
# frame's index into a regular 'index' column.
train = (
  pd.read_csv(pathTrain, sep="\t", names=columns)
  .loc[:, ['word', 'pos', 'parent', 'type']]
  .reset_index()
)
train

Unnamed: 0,index,word,pos,parent,type
0,1,in,IN,43,PP
1,2,an,DT,5,DEP
2,3,oct.,NNP,5,DEP
3,4,19,CD,5,DEP
4,5,review,NN,1,NP
...,...,...,...,...,...
5275,4,",",",",7,DEP
5276,5,'','',7,DEP
5277,6,he,PRP,7,NP-SBJ
5278,7,says,VBZ,0,ROOT


In [7]:
# Read the test file, keep the four columns used downstream, and move the
# frame's index into a regular 'index' column.
test = (
  pd.read_csv(pathTest, sep="\t", names=columns)
  .loc[:, ['word', 'pos', 'parent', 'type']]
  .reset_index()
)
test

Unnamed: 0,index,word,pos,parent,type
0,1,no,UH,4,INTJ
1,2,",",",",4,DEP
2,3,it,PRP,4,NP-SBJ
3,4,was,VBD,0,ROOT
4,5,n't,RB,4,DEP
...,...,...,...,...,...
4634,19,by,IN,6,PP
4635,20,year,NN,21,DEP
4636,21,'s,POS,22,NP
4637,22,end,NN,19,NP


## Classes and Functions

In [8]:
def dataFramePerSentence(dframe, numbSentence):
  """Split a token table into a list of per-sentence DataFrames.

  `dframe` must contain an 'index' column holding the 1-based position of each
  token inside its sentence, a 'word' column, and a numeric 'parent' column
  holding the 1-based position of the token's head (0 for the root token).

  A sentence is taken to be the leading rows of the not-yet-consumed part of
  `dframe` whose 'index' value equals their 1-based row position in that
  remainder. In every returned frame the 'index' column is dropped and
  'parent' is rewritten from a position to the head *word* ('ROOT' for the
  root token).

  Args:
    dframe: token table as described above.
    numbSentence: maximum number of sentences to extract.

  Returns:
    A list of DataFrames, one per sentence. The list is shorter than
    `numbSentence` when `dframe` runs out of rows first.

  Raises:
    KeyError: if a 'parent' value points at a row that is not in the sentence.
  """
  sentences = []
  startPoint = 0

  for _ in range(numbSentence):
    remainder = dframe.iloc[startPoint:].reset_index(drop=True)
    if remainder.empty:
      # Fewer sentences available than requested: stop instead of failing.
      break

    # Rows whose in-sentence token id matches their position in the remainder.
    positions = np.arange(1, len(remainder) + 1)
    sentence = remainder[remainder['index'] == positions].drop(columns=['index'])
    startPoint += len(sentence)

    # Convert 1-based head positions to row labels; -1 now marks the root token.
    words = sentence['word']
    heads = sentence['parent'] - 1
    # Value comparison (==), not identity (is), and one assignment for the whole
    # column instead of per-cell chained assignment.
    parentWords = ['ROOT' if head == -1 else words.loc[head] for head in heads]

    sentences.append(sentence.assign(parent=parentWords))

  return sentences

In [None]:
class Perceptron:
  """Holds perceptron settings and the two numeric helpers defined so far."""

  def __init__(self):
    # Data placeholders; nothing in this class fills them yet.
    self._x = []
    self._y = []
    # Model state and hyper-parameters.
    self._weights = []
    self._epoch = 10
    self._learningRate = 0.01

  def linearFunction(self, data, weights):
    """Return the dot product of `data` and `weights`."""
    product = np.dot(data, weights)
    return product

  def activationFunction(self, linearResult):
    """Return the logistic sigmoid 1 / (1 + e^(-linearResult))."""
    decay = np.exp(-linearResult)
    return 1 / (1 + decay)

In [None]:
class ShiftReduce:
  """Skeleton of a shift-reduce parser; the transitions are not implemented yet."""

  def __init__(self):
    self._sentence = None  # sentence handed to process()
    self._stack = None     # parser stack; process() seeds it with 'ROOT'

  def process(self, sentence, model):
    """Initialise the parser state for `sentence`.

    Args:
      sentence: sized collection of tokens to parse.
      model: object intended to choose the next transition; unused until the
        transitions exist.

    Raises:
      NotImplementedError: if the transition loop is ever entered.
    """
    # Store on the private attribute that the loop condition reads
    # (previously assigned to `self.sentence`, leaving `_sentence` as None).
    self._sentence = sentence
    self._stack = ['ROOT']

    # NOTE(review): the stack starts with a single element, so this condition is
    # False on entry and the loop never runs — confirm whether `or` was intended.
    while len(self._sentence) > 0 and len(self._stack) > 1:
      # TODO: dispatch on the model's decision:
      #   if model.predict() == 'shift': shift()
      #   elif model.predict() == 'left': left()
      #   elif model.predict() == 'right': right()
      # Fail loudly rather than loop forever while the transitions are missing.
      raise NotImplementedError("shift/left/right transitions are not implemented yet")

  # TODO: def shift(self)
  # TODO: def left(self)
  # TODO: def right(self)

## Get dataframe for every sentence

In [9]:
# Build the per-sentence frames for both splits; 200 sentences are requested from each.
# NOTE(review): 200 is a magic number — confirm it matches the number of sentences in each file.
dfTrainList = dataFramePerSentence(train, 200)
dfTestList = dataFramePerSentence(test, 200)

In [10]:
# Spot-check the second training sentence: 'parent' should now hold head words, not positions.
dfTrainList[1]

Unnamed: 0,word,pos,parent,type
0,ms.,NNP,haag,DEP
1,haag,NNP,plays,NP-SBJ
2,plays,VBZ,ROOT,ROOT
3,elianti,NNP,plays,NP-OBJ
4,.,.,plays,DEP


In [None]:
# Spot-check the first test sentence; it is the running example in the cells that follow.
dfTestList[0]

Unnamed: 0,word,pos,parent,type
0,no,UH,was,INTJ
1,",",",",was,DEP
2,it,PRP,was,NP-SBJ
3,was,VBD,ROOT,ROOT
4,n't,RB,was,DEP
5,black,JJ,monday,DEP
6,monday,NNP,was,NP-PRD
7,.,.,was,DEP


In [77]:
# Unpack the columns of the first test sentence into plain Python lists.
listw, listp, listpos, listtype = (
  list(dfTestList[0][column]) for column in ('word', 'parent', 'pos', 'type')
)

In [30]:
# Words of the first test sentence, in sentence order.
listw

['no', ',', 'it', 'was', "n't", 'black', 'monday', '.']

In [31]:
# Head word of each token in listw, position by position.
listp

['was', 'was', 'was', 'ROOT', 'was', 'monday', 'was', 'was']

In [61]:
def checkVertex(a, b, df):
  """Return True when `df` has at least one row with word `a` and parent `b`.

  Args:
    a: value looked up in the 'word' column.
    b: value looked up in the 'parent' column of the same row.
    df: DataFrame with 'word' and 'parent' columns.

  Returns:
    bool: True if some row matches both values, False otherwise (also for an
    empty frame).

  NOTE(review): rows are matched by word text, so repeated words within one
  frame cannot be told apart — confirm this is acceptable for the caller.
  """
  # Single vectorised test; replaces the `len(...) is 0` identity check on an int.
  isArc = (df["word"] == a) & (df["parent"] == b)
  return bool(isArc.any())

In [89]:
def getPOS(word, df):
  """Return the 'pos' value of the first row of `df` whose 'word' equals `word`.

  The artificial token "ROOT" has no row of its own and maps to "NONE".

  Args:
    word: word to look up, or "ROOT".
    df: DataFrame with 'word' and 'pos' columns.

  Raises:
    IndexError: if `word` is not "ROOT" and does not occur in `df`.
  """
  # Compare by value: `is` only matched when both strings happened to be the
  # same interned object.
  if word == "ROOT":
    return "NONE"
  return df.loc[df['word'] == word, 'pos'].values[0]

In [87]:
# Quick check of getPOS on the first test sentence.
getPOS('was', dfTestList[0])

'VBD'

In [58]:
# Quick check of checkVertex: asks whether the first test sentence has a row with
# word 'was' and parent 'it'.
checkVertex('was', 'it', dfTestList[0])

Unnamed: 0,word,pos,parent,type


False

In [202]:
# Walk the first test sentence with a stack and a buffer, choosing shift / left /
# right from the gold parents via checkVertex, and record one feature row per step.
stack = ['ROOT']
buffer = listw.copy()
# take end of stack and top of buffer

feature = []

counter = 0

# Stop once only two items remain across stack and buffer. Compared with `!=`:
# `is not` tests object identity, which is not a reliable way to compare ints.
# NOTE(review): buffer[0] is read both in the branch tests and when recording the
# feature row, so an empty buffer with more than two items still pending would
# raise IndexError — confirm this cannot happen for the sentences used.
while len(stack) + len(buffer) != 2:
  if len(stack) == 1:
    print("shift")
    flag = "shift"
    stack.append(buffer.pop(0))
  elif checkVertex(stack[-1], buffer[0], dfTestList[0]):
    # Top of the stack has the buffer front as its parent: drop the stack top.
    print("left")
    flag = "left"
    stack.pop(-1)
  elif checkVertex(buffer[0], stack[-1], dfTestList[0]):
    # Buffer front has the stack top as its parent: drop the buffer front and
    # move the stack top back to the front of the buffer.
    print("right")
    flag = "right"
    buffer.pop(0)
    buffer = [stack.pop(-1)] + buffer
  else:
    print("shift")
    flag = "shift"
    stack.append(buffer.pop(0))

  # NOTE(review): the row is built from the stack/buffer state *after* the
  # action was applied — confirm that is the intended configuration.
  if len(stack) > 1:
    feature.append([stack[-2],
                    stack[-1],
                    buffer[0],
                    getPOS(stack[-2], dfTestList[0]),
                    getPOS(stack[-1], dfTestList[0]),
                    getPOS(buffer[0], dfTestList[0]),
                    flag])
  else:
    feature.append(['ROOT',
                    stack[-1],
                    buffer[0],
                    getPOS('ROOT', dfTestList[0]),
                    getPOS(stack[-1], dfTestList[0]),
                    getPOS(buffer[0], dfTestList[0]),
                    flag])

  print("stack: ", stack)
  print("buffer: ", buffer)
  print("==================")

print(feature)
  
  

shift
stack:  ['ROOT', 'no']
buffer:  [',', 'it', 'was', "n't", 'black', 'monday', '.']
shift
stack:  ['ROOT', 'no', ',']
buffer:  ['it', 'was', "n't", 'black', 'monday', '.']
shift
stack:  ['ROOT', 'no', ',', 'it']
buffer:  ['was', "n't", 'black', 'monday', '.']
left
stack:  ['ROOT', 'no', ',']
buffer:  ['was', "n't", 'black', 'monday', '.']
left
stack:  ['ROOT', 'no']
buffer:  ['was', "n't", 'black', 'monday', '.']
left
stack:  ['ROOT']
buffer:  ['was', "n't", 'black', 'monday', '.']
shift
stack:  ['ROOT', 'was']
buffer:  ["n't", 'black', 'monday', '.']
right
stack:  ['ROOT']
buffer:  ['was', 'black', 'monday', '.']
shift
stack:  ['ROOT', 'was']
buffer:  ['black', 'monday', '.']
shift
stack:  ['ROOT', 'was', 'black']
buffer:  ['monday', '.']
left
stack:  ['ROOT', 'was']
buffer:  ['monday', '.']
right
stack:  ['ROOT']
buffer:  ['was', '.']
shift
stack:  ['ROOT', 'was']
buffer:  ['.']
right
stack:  ['ROOT']
buffer:  ['was']
[['ROOT', 'no', ',', 'NONE', 'UH', ',', 'shift'], ['no', ',', 

In [156]:
def generateFeatures(dfList):
  """Print the shift/left/right trace for every sentence frame in `dfList`.

  Each frame needs 'word' and 'parent' columns (read through checkVertex). The
  decision at each step compares the top of the stack with the front of the
  buffer. Feature recording is still commented out, so the list printed at the
  end is always empty. Returns None.

  NOTE(review): this notebook defines generateFeatures more than once; the
  definition executed last is the one in effect.
  """
  featuresHolder = []

  for df in dfList:
    stack = ['ROOT']
    buffer = list(df['word'])

    # Value comparison (`!=`); `is not` tested object identity of two ints.
    # NOTE(review): buffer[0] raises IndexError if the buffer empties while more
    # than two items are pending — confirm termination for all sentences.
    while len(stack) + len(buffer) != 2:
      if len(stack) == 1:
        print("shift")
        flag = "shift"
        stack.append(buffer.pop(0))
      #elif len(stack) == 2 and checkVertex(stack[1], stack[0], df):
      #  print("shift")
      #  flag = "shift"
      #  stack.append(buffer.pop(0))
      elif checkVertex(stack[-1], buffer[0], df):
        # Stack top has the buffer front as its parent: drop the stack top.
        print("left")
        flag = "left"
        stack.pop(-1)
      elif checkVertex(buffer[0], stack[-1], df):
        # Buffer front has the stack top as its parent: drop the buffer front
        # and move the stack top back to the front of the buffer.
        print("right")
        flag = "right"
        buffer.pop(0)
        buffer = [stack.pop(-1)] + buffer
      else:
        print("shift")
        flag = "shift"
        stack.append(buffer.pop(0))

      # TODO: feature recording is disabled in this draft.
      #if len(stack) > 1:
      #  featuresHolder.append([stack[-2],
      #                  stack[-1],
      #                  buffer[0],
      #                  getPOS(stack[-2], df),
      #                  getPOS(stack[-1], df),
      #                  getPOS(buffer[0], df),
      #                  flag])
      #else:
      #  featuresHolder.append(['ROOT',
      #                  stack[-1],
      #                  buffer[0],
      #                  getPOS('ROOT', df),
      #                  getPOS(stack[-1], df),
      #                  getPOS(buffer[0], df),
      #                  flag])

      print("stack: ", stack)
      print("buffer: ", buffer)
      print("==================")

  print(featuresHolder)
  
  

In [239]:
def generateFeatures(dfList):
  """Display each sentence frame in `dfList` and print its transition trace.

  Each frame needs 'word' and 'parent' columns (read through checkVertex). The
  decision at each step compares the two topmost stack items. Feature
  recording is still commented out, so the list printed at the end is always
  empty. Returns None.

  NOTE(review): this notebook defines generateFeatures more than once; the
  definition executed last is the one in effect.
  """
  featuresHolder = []

  for df in dfList:
    display(df)

    stack = ['ROOT']
    buffer = list(df['word'])

    # Value comparison (`!=`); `is not` tested object identity of two ints.
    # NOTE(review): buffer.pop(0) raises IndexError if the buffer empties while
    # more than two items are pending — confirm termination for all sentences.
    while len(stack) + len(buffer) != 2:
      if len(stack) == 1:
        print("shift")
        flag = "shift"
        stack.append(buffer.pop(0))
      elif checkVertex(stack[-2], stack[-1], df):
        # Second-from-top has the stack top as its parent: remove the dependent.
        print("left")
        flag = "left"
        stack.pop(-2)
      elif checkVertex(stack[-1], stack[-2], df):
        # Stack top has the item below it as its parent.
        if stack[-2] == 'ROOT':
          # The item below is ROOT: move the stack top to the end of the buffer.
          # NOTE(review): `flag` is not updated in this branch.
          print("spec.left")
          buffer.append(stack.pop(-1))
          #buffer.insert(1,stack.pop(-1))
        else:
          print("right")
          flag = "right"
          stack.pop(-1)
      else:
        print("shift")
        flag = "shift"
        stack.append(buffer.pop(0))

      # TODO: feature recording is disabled in this draft.
      #if len(stack) > 1:
      #  featuresHolder.append([stack[-2],
      #                  stack[-1],
      #                  buffer[0],
      #                  getPOS(stack[-2], df),
      #                  getPOS(stack[-1], df),
      #                  getPOS(buffer[0], df),
      #                  flag])
      #else:
      #  featuresHolder.append(['ROOT',
      #                  stack[-1],
      #                  buffer[0],
      #                  getPOS('ROOT', df),
      #                  getPOS(stack[-1], df),
      #                  getPOS(buffer[0], df),
      #                  flag])

      print("stack: ", stack)
      print("buffer: ", buffer)
      print("==================")

  print(featuresHolder)
  
  

In [226]:
# Inspect test sentence 166, which is used to exercise generateFeatures.
dfTestList[166]

Unnamed: 0,word,pos,parent,type
0,trading,NN,volume,DEP
1,volume,NN,was,NP-SBJ
2,was,VBD,ROOT,ROOT
3,only,RB,was,ADJP
4,modestly,RB,only,DEP
5,higher,JJR,only,DEP
6,than,IN,only,PP
7,normal,JJ,than,NP
8,.,.,was,DEP


In [255]:
# Run generateFeatures on test sentence 166 only; the one-element slice keeps the list-of-frames input.
generateFeatures( dfTestList[166:167])

Unnamed: 0,word,pos,parent,type
0,trading,NN,volume,DEP
1,volume,NN,was,NP-SBJ
2,was,VBD,ROOT,ROOT
3,only,RB,was,ADJP
4,modestly,RB,only,DEP
5,higher,JJR,only,DEP
6,than,IN,only,PP
7,normal,JJ,than,NP
8,.,.,was,DEP


shift
stack:  ['ROOT', 'trading']
buffer:  ['volume', 'was', 'only', 'modestly', 'higher', 'than', 'normal', '.']
shift
stack:  ['ROOT', 'trading', 'volume']
buffer:  ['was', 'only', 'modestly', 'higher', 'than', 'normal', '.']
left
stack:  ['ROOT', 'volume']
buffer:  ['was', 'only', 'modestly', 'higher', 'than', 'normal', '.']
shift
stack:  ['ROOT', 'volume', 'was']
buffer:  ['only', 'modestly', 'higher', 'than', 'normal', '.']
left
stack:  ['ROOT', 'was']
buffer:  ['only', 'modestly', 'higher', 'than', 'normal', '.']
spec.left
stack:  ['ROOT', 'was', 'only', 'modestly']
buffer:  ['higher', 'than', 'normal', '.']
right
stack:  ['ROOT', 'was', 'only']
buffer:  ['higher', 'than', 'normal', '.']
right
stack:  ['ROOT', 'was']
buffer:  ['higher', 'than', 'normal', '.']
shift
stack:  ['ROOT', 'was', 'higher']
buffer:  ['than', 'normal', '.']
shift
stack:  ['ROOT', 'was', 'higher', 'than']
buffer:  ['normal', '.']
shift
stack:  ['ROOT', 'was', 'higher', 'than', 'normal']
buffer:  ['.']
right

IndexError: ignored

In [254]:
def generateFeatures(dfList):
  """Display each sentence frame in `dfList` and print its transition trace.

  Each frame needs 'word' and 'parent' columns (read through checkVertex). The
  decision at each step compares the two topmost stack items. `ranger` counts
  how many words have been found attached to ROOT and raises the stack height
  at which a shift is forced. Feature recording is still commented out, so the
  list printed at the end is always empty. Returns None.

  NOTE(review): this notebook defines generateFeatures more than once; the
  definition executed last is the one in effect.
  """
  featuresHolder = []

  for df in dfList:
    display(df)

    stack = ['ROOT']
    buffer = list(df['word'])

    ranger = 0

    # Value comparison (`!=`); `is not` tested object identity of two ints.
    # NOTE(review): buffer.pop(0) raises IndexError once the buffer is empty —
    # confirm termination for all sentences.
    while len(stack) + len(buffer) != 2:
      if len(stack) == ranger + 1:
        print("shift")
        flag = "shift"
        stack.append(buffer.pop(0))
      elif checkVertex(stack[-2], stack[-1], df):
        # Second-from-top has the stack top as its parent: remove the dependent.
        print("left")
        flag = "left"
        stack.pop(-2)
      elif checkVertex(stack[-1], stack[-2], df):
        # Stack top has the item below it as its parent.
        if stack[-2] == 'ROOT':
          # The stack top hangs off ROOT: leave it on the stack, raise the
          # forced-shift height, and move the next two buffer items onto the stack.
          # NOTE(review): `flag` is not updated here, and the double pop needs at
          # least two items left in the buffer.
          print("spec.left")
          #buffer.append(stack.pop(-1))
          ranger = ranger + 1
          stack.append(buffer.pop(0))
          stack.append(buffer.pop(0))
          #buffer.insert(1,stack.pop(-1))
        else:
          print("right")
          flag = "right"
          stack.pop(-1)
      else:
        print("shift")
        flag = "shift"
        stack.append(buffer.pop(0))

      # TODO: feature recording is disabled in this draft.
      #if len(stack) > 1:
      #  featuresHolder.append([stack[-2],
      #                  stack[-1],
      #                  buffer[0],
      #                  getPOS(stack[-2], df),
      #                  getPOS(stack[-1], df),
      #                  getPOS(buffer[0], df),
      #                  flag])
      #else:
      #  featuresHolder.append(['ROOT',
      #                  stack[-1],
      #                  buffer[0],
      #                  getPOS('ROOT', df),
      #                  getPOS(stack[-1], df),
      #                  getPOS(buffer[0], df),
      #                  flag])

      print("stack: ", stack)
      print("buffer: ", buffer)
      print("==================")

  print(featuresHolder)
  
  