## Connect to Drive and importing dependencies

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
pip install swifter

In [None]:
!wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1704-9-0-local_9.0.176-1_amd64-deb
!dpkg -i cuda-repo-ubuntu1704-9-0-local_9.0.176-1_amd64-deb
!ls /var/cuda-repo-9-0-local | grep .pub
!apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
!apt-get update
!sudo apt-get install cuda-9.0

In [None]:
!pip install thundersvm

In [3]:
import pandas as pd
import numpy as np
import swifter
import warnings
import seaborn as sns
from tqdm import tqdm
warnings.filterwarnings("ignore")

from thundersvm import SVC
from thundersvm import *

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler 


pd.options.mode.chained_assignment = None

## Read the data

In [4]:
pathTrain = "/content/drive/MyDrive/NLP Study Group/Problem 4/train.dep.txt"
pathTest = "/content/drive/MyDrive/NLP Study Group/Problem 4/test.dep.txt"

In [5]:
columns = ['index','word', 'pos', 'pos1', 'unused', 'parent', 'type']

In [6]:
train = pd.read_csv(pathTrain, sep = "\t", names = columns)
train = train[['word', 'pos', 'parent', 'type']]
train.reset_index(inplace = True)
train

Unnamed: 0,index,word,pos,parent,type
0,1,in,IN,43,PP
1,2,an,DT,5,DEP
2,3,oct.,NNP,5,DEP
3,4,19,CD,5,DEP
4,5,review,NN,1,NP
...,...,...,...,...,...
5275,4,",",",",7,DEP
5276,5,'','',7,DEP
5277,6,he,PRP,7,NP-SBJ
5278,7,says,VBZ,0,ROOT


In [7]:
test = pd.read_csv(pathTest, sep = "\t", names = columns)
test = test[['word', 'pos', 'parent', 'type']]
test.reset_index(inplace = True)
test

Unnamed: 0,index,word,pos,parent,type
0,1,no,UH,4,INTJ
1,2,",",",",4,DEP
2,3,it,PRP,4,NP-SBJ
3,4,was,VBD,0,ROOT
4,5,n't,RB,4,DEP
...,...,...,...,...,...
4634,19,by,IN,6,PP
4635,20,year,NN,21,DEP
4636,21,'s,POS,22,NP
4637,22,end,NN,19,NP


## Class and Functions

In [8]:
def checkVertex(a, b, df):
  temp = df[df["wordKey"] == a]
  flag = len(temp[temp["parentKey"] == b])

  if flag is 0: 
    return False 
  else:
    return True

In [9]:
def getPOS(word, df):
  if word is not "ROOT":
    return df["pos"][df['word'] == word].values[0]
  else:
    return "NONE"

In [10]:
def getValue(key, df):
  if key is 'ROOT':
    return 'ROOT'
  else:
    return df['word'][df['wordKey'] == key].values[0] 

In [11]:
def dataFramePerSentence(dframe, numbSentence):
  tempList = []
  startPoint = 0

  for i in range(numbSentence):
    temp = dframe[startPoint:].copy()  
    temp.reset_index(inplace = True)
    temp['level_0'] = temp['level_0'] - temp['level_0'][0] + 1
    temp = temp[temp['level_0'] == temp['index']]
    startPoint = startPoint + len(temp)

    temp.drop(['index','level_0'], axis=1, inplace = True)

    temp.parent = temp.parent - 1
    temp['parentKey'] = temp.parent

    for index, row in temp.iterrows():
      if row.parent is -1: 
        temp['parent'][index] = 'ROOT'
      elif not isinstance(temp['parent'][index], str):
        key = temp['parent'][index]
        temp['parent'][index] = temp['word'][key]

    temp['wordKey'] = temp.index
    tempList.append(temp) 

  return tempList

In [12]:
def generateFeatures(dfList, printStep = False):
  featuresHolder = []

  for df in dfList:
    buffer = list(df['wordKey'])
    stack = ['ROOT']

    while not(len(buffer) is 0 and len(stack) is 2):   
      if len(stack) > 2:
        if checkVertex(stack[-2], stack[-1], df):  #left
            if printStep: print("⦿LEFT: ",stack[-2]," <-- ", stack[-1])
            flag = "left"
            stack.pop(-2)
        elif checkVertex(stack[-1], stack[-2], df):  #right
          countRemainingDependant = 0

          for remainingWord in buffer:
            if checkVertex(remainingWord, stack[-1], df) == True:
              countRemainingDependant += 1
            if checkVertex(stack[-1], remainingWord, df) == True:
              countRemainingDependant += 1         

          if countRemainingDependant is not 0:
            stack.append(buffer.pop(0))
            flag = "shift"
            if printStep: print("⦿shift right")
          else:
            if printStep: print("⦿RIGHT: ",stack[-2]," --> ", stack[-1])    
            flag = "right"
            stack.pop(-1)

        else:
          if printStep: print("⦿shift inner")
          flag = "shift"
          stack.append(buffer.pop(0))
      else:
        if printStep: print("⦿shift outer")
        flag = "shift"
        stack.append(buffer.pop(0))
                  
      if printStep: print("stack: ", stack)
      if printStep: print("buffer: ", buffer)
      if printStep: print("==================")


      stack1 = getValue(stack[-1], df)
      if len(stack) > 1:
        stack2 = getValue(stack[-2], df)
        if len(buffer):
          buffer1 = getValue(buffer[0], df)
          featuresHolder.append([stack2, stack1, buffer1,
                          getPOS(stack2, df), getPOS(stack1, df), getPOS(buffer1, df),
                          len(stack), len(buffer), flag])
        else:
          featuresHolder.append([stack2, stack1, 'EMPTY-BUFFER', 
                          getPOS(stack2, df), getPOS(stack1, df), 'EMPTY-BUFFER',
                          len(stack), len(buffer), flag])
  
  dfFeatures = pd.DataFrame(featuresHolder, columns =['stack0', 'stack1', 'buffer0', 'posstack0', 'posstack1', 'posbuffer0', 'lenstack', 'lenbuffer', 'target'])
  shiftTarget = dfFeatures.target.values
  shiftTarget = np.delete(shiftTarget, 0)
  dfFeatures.drop(dfFeatures.tail(1).index,inplace=True)
  dfFeatures.target = shiftTarget
  return dfFeatures

In [13]:
class ShiftReduce:
  def __init__(self):
    self._model = None

  def process(self, dfList, model, encoder):
    #left = 0
    #right = 1
    #shift = 2
    self._encoder = encoder
    self._model = model
    dfPredictedList = []

    for df in tqdm(dfList):
      dfPredicted = df.copy()
      dfPredicted.parentKey = -1
      buffer = list(df['wordKey'])
      stack = ['ROOT']
      stepList = []

      while not(len(buffer) is 0 and len(stack) is 2):  
        currentStep = None

        if len(stack) is 1:
          currentStep = 2
          stack.append(buffer.pop(0))
        else:
          featuresHolder = []
          stack1 = getValue(stack[-1], df)

          stack2 = getValue(stack[-2], df)
          if len(buffer):
            buffer1 = getValue(buffer[0], df)
            featuresHolder.append([stack2, stack1, buffer1,
                            getPOS(stack2, df), getPOS(stack1, df), getPOS(buffer1, df)])
          else:
            featuresHolder.append([stack2, stack1, 'EMPTY-BUFFER', 
                            getPOS(stack2, df), getPOS(stack1, df), 'EMPTY-BUFFER'])

          
          encodedFeatures = encoder.transform(featuresHolder).toarray()
          encodedFeatures = [np.concatenate((encodedFeatures[0], np.array([len(stack), len(buffer)])), axis = 0)]

          currentStep = model.predict(encodedFeatures)

          wordKey = None
          parentKey = None
          flag = None
          if currentStep == 2: #shift
            stack.append(buffer.pop(0))
          elif currentStep == 1: #right
            wordKey = stack[-1]
            parentKey = stack[-2]
            stack.pop(-1)
          elif currentStep == 0: #left
            wordKey = stack[-2]
            parentKey = stack[-1]  
            stack.pop(-2)

          if wordKey is not None and parentKey is not None:
            dfPredicted['parentKey'][dfPredicted['wordKey'] == wordKey] = parentKey
          
      dfPredictedList.append(dfPredicted) 
    
    return dfPredictedList

## Get dataframe for every sentence

In [14]:
dfTrainList = dataFramePerSentence(train, 200)
dfTestList = dataFramePerSentence(test, 200)

In [15]:
dfTrainList[1]

Unnamed: 0,word,pos,parent,type,parentKey,wordKey
0,ms.,NNP,haag,DEP,1,0
1,haag,NNP,plays,NP-SBJ,2,1
2,plays,VBZ,ROOT,ROOT,-1,2
3,elianti,NNP,plays,NP-OBJ,2,3
4,.,.,plays,DEP,2,4


In [16]:
dfTestList[0]

Unnamed: 0,word,pos,parent,type,parentKey,wordKey
0,no,UH,was,INTJ,3,0
1,",",",",was,DEP,3,1
2,it,PRP,was,NP-SBJ,3,2
3,was,VBD,ROOT,ROOT,-1,3
4,n't,RB,was,DEP,3,4
5,black,JJ,monday,DEP,6,5
6,monday,NNP,was,NP-PRD,3,6
7,.,.,was,DEP,3,7


## Check generateFeatures function

In [105]:
feat = generateFeatures(dfTestList[170:171], True)
feat

⦿shift outer
stack:  ['ROOT', 0]
buffer:  [1, 2, 3, 4, 5, 6, 7, 8]
⦿shift outer
stack:  ['ROOT', 0, 1]
buffer:  [2, 3, 4, 5, 6, 7, 8]
⦿LEFT:  0  <--  1
stack:  ['ROOT', 1]
buffer:  [2, 3, 4, 5, 6, 7, 8]
⦿shift outer
stack:  ['ROOT', 1, 2]
buffer:  [3, 4, 5, 6, 7, 8]
⦿shift inner
stack:  ['ROOT', 1, 2, 3]
buffer:  [4, 5, 6, 7, 8]
⦿LEFT:  2  <--  3
stack:  ['ROOT', 1, 3]
buffer:  [4, 5, 6, 7, 8]
⦿LEFT:  1  <--  3
stack:  ['ROOT', 3]
buffer:  [4, 5, 6, 7, 8]
⦿shift outer
stack:  ['ROOT', 3, 4]
buffer:  [5, 6, 7, 8]
⦿shift right
stack:  ['ROOT', 3, 4, 5]
buffer:  [6, 7, 8]
⦿shift right
stack:  ['ROOT', 3, 4, 5, 6]
buffer:  [7, 8]
⦿shift inner
stack:  ['ROOT', 3, 4, 5, 6, 7]
buffer:  [8]
⦿LEFT:  6  <--  7
stack:  ['ROOT', 3, 4, 5, 7]
buffer:  [8]
⦿RIGHT:  5  -->  7
stack:  ['ROOT', 3, 4, 5]
buffer:  [8]
⦿RIGHT:  4  -->  5
stack:  ['ROOT', 3, 4]
buffer:  [8]
⦿RIGHT:  3  -->  4
stack:  ['ROOT', 3]
buffer:  [8]
⦿shift outer
stack:  ['ROOT', 3, 8]
buffer:  []
⦿RIGHT:  3  -->  8
stack:  ['ROOT',

Unnamed: 0,stack0,stack1,buffer0,posstack0,posstack1,posbuffer0,lenstack,lenbuffer,target
0,ROOT,the,centers,NONE,DT,NNS,2,8,shift
1,the,centers,normally,DT,NNS,RB,3,7,left
2,ROOT,centers,normally,NONE,NNS,RB,2,7,shift
3,centers,normally,are,NNS,RB,VBP,3,6,shift
4,normally,are,closed,RB,VBP,VBN,4,5,left
5,centers,are,closed,NNS,VBP,VBN,3,5,left
6,ROOT,are,closed,NONE,VBP,VBN,2,5,shift
7,are,closed,through,VBP,VBN,IN,3,4,shift
8,closed,through,the,VBN,IN,DT,4,3,shift
9,through,the,weekend,IN,DT,NN,5,2,shift


## Generate features for Training

In [17]:
trainFeatures = generateFeatures(dfTrainList, False)

In [18]:
trainFeatures

Unnamed: 0,stack0,stack1,buffer0,posstack0,posstack1,posbuffer0,lenstack,lenbuffer,target
0,ROOT,in,an,NONE,IN,DT,2,48,shift
1,in,an,oct.,IN,DT,NNP,3,47,shift
2,an,oct.,19,DT,NNP,CD,4,46,shift
3,oct.,19,review,NNP,CD,NN,5,45,shift
4,19,review,of,CD,NN,IN,6,44,left
...,...,...,...,...,...,...,...,...,...
10354,",",says,.,",",VBZ,.,5,1,left
10355,kiddies,says,.,NNS,VBZ,.,4,1,left
10356,``,says,.,``,VBZ,.,3,1,left
10357,ROOT,says,.,NONE,VBZ,.,2,1,shift


In [19]:
trainFeatures['target'].value_counts()

shift    5279
left     2627
right    2453
Name: target, dtype: int64

In [20]:
featureEncoder = OneHotEncoder(handle_unknown='ignore')
targetEncoder = OneHotEncoder(handle_unknown='ignore')
cols = ['stack0',	'stack1',	'buffer0',	'posstack0',	'posstack1',	'posbuffer0']
additionalCols = ['lenstack', 'lenbuffer']

featureEncoder.fit(trainFeatures[cols])
targetEncoder.fit(trainFeatures['target'].values.reshape(-1,1))

xtrain = featureEncoder.transform(trainFeatures[cols]).toarray()
xtrain = np.concatenate((xtrain, trainFeatures[['lenstack', 'lenbuffer']].to_numpy()), axis = 1)
ytrain = targetEncoder.transform(trainFeatures['target'].values.reshape(-1,1)).toarray()

In [21]:
sampler = RandomUnderSampler(random_state=42)
xtrain, ytrain = sampler.fit_resample(xtrain, ytrain)

In [22]:
print((np.argmax(ytrain, axis=1) == 0).sum())
print((np.argmax(ytrain, axis=1) == 1).sum())
print((np.argmax(ytrain, axis=1) == 2).sum())

2453
2453
2453


## Train SVM

In [119]:
param_gamma = [0.01, 0.05, 0.1, 0.5]
param_c= [1, 2, 4, 8, 10]

for c in range(len(param_c)):
  for gm in range(len(param_gamma)):
    svm = SVC(
        kernel = "rbf",
        gamma = param_gamma[gm],
        C = param_c[c],
        #cache_size = 4000,
        n_jobs = -1,
        max_mem_size = 8000
    )

    scores = cross_val_score(svm, xtrain, np.argmax(ytrain, axis = 1), cv=5)
    print("C: ",param_c[c]," gamma: ",param_gamma[gm]," accuracy: ",scores.mean())

C:  1  gamma:  0.01  accuracy:  0.7778235191972336
C:  1  gamma:  0.05  accuracy:  0.794402192386132
C:  1  gamma:  0.1  accuracy:  0.7925004803002986
C:  1  gamma:  0.5  accuracy:  0.7079779726893861
C:  2  gamma:  0.01  accuracy:  0.8032350996068927
C:  2  gamma:  0.05  accuracy:  0.8169601424644577
C:  2  gamma:  0.1  accuracy:  0.808806213608016
C:  2  gamma:  0.5  accuracy:  0.7169466571099223
C:  4  gamma:  0.01  accuracy:  0.8287828266780952
C:  4  gamma:  0.05  accuracy:  0.8241618759790736
C:  4  gamma:  0.1  accuracy:  0.8086710829663346
C:  4  gamma:  0.5  accuracy:  0.7169466571099223
C:  8  gamma:  0.01  accuracy:  0.8449531522478054
C:  8  gamma:  0.05  accuracy:  0.8240267453373924
C:  8  gamma:  0.1  accuracy:  0.8090790611237548
C:  8  gamma:  0.5  accuracy:  0.7169466571099223
C:  10  gamma:  0.01  accuracy:  0.8456327771702185
C:  10  gamma:  0.05  accuracy:  0.8229396964502114
C:  10  gamma:  0.1  accuracy:  0.8090790611237548
C:  10  gamma:  0.5  accuracy:  0.71694

In [132]:
svm = SVC(
    kernel = "rbf",
    gamma = 0.01,
    C = 10,
    #cache_size = 4000,
    n_jobs = -1,
    max_mem_size = 8000
)

svm.fit(xtrain, np.argmax(ytrain, axis = 1))

## Test and implement Shift-Reduce

In [None]:
sr = ShiftReduce()
dfPredicted = sr.process(dfTestList, svm, featureEncoder)

In [221]:
arrPred = np.array([])
arrActual = np.array([])

for i in range(len(dfPredicted)):
  arrPred = np.concatenate((arrPred, np.array(dfPredicted[i].parentKey)))
  arrActual = np.concatenate((arrActual, np.array(dfTestList[i].parentKey)))

In [222]:
correct = (arrActual == arrPred)
correct.sum() / correct.size

0.6460444061220091