# GetData

In [12]:
import xml.dom.minidom
import pandas as pd

In [13]:
def extractData(file_path):
    """
    Build a (text, target, label) table from an aspect-term XML file.

    Every <sentence> that has an <aspectTerms> element contributes one row per
    <aspectTerm> child: the sentence text, the ``term`` attribute and the
    ``polarity`` attribute. Rows whose polarity is 'conflict' are dropped.

    Args:
        file_path: path of the XML file to parse.

    Returns:
        pandas.DataFrame with columns ['text', 'target', 'label']. The index
        is not reset after the 'conflict' rows are filtered out.
    """
    root = xml.dom.minidom.parse(file_path).documentElement

    rows = []
    for sentence in root.getElementsByTagName("sentence"):
        term_groups = sentence.getElementsByTagName('aspectTerms')
        # Sentences without any annotated aspect term contribute nothing.
        if len(list(term_groups)) == 0:
            continue

        sentence_text = sentence.getElementsByTagName("text")[0].childNodes[0].data

        for term in term_groups[0].getElementsByTagName("aspectTerm"):
            rows.append([
                sentence_text,
                term.getAttribute("term"),
                term.getAttribute("polarity"),
            ])

    frame = pd.DataFrame(rows, columns=['text','target','label'])
    return frame[frame['label'] != 'conflict']

def saveData(input_path,save_path):
    """
    Extract the aspect-term table from ``input_path`` and write it as CSV.

    Args:
        input_path: XML file handed to extractData.
        save_path: destination CSV path (written without the index column).

    Returns:
        The DataFrame produced by extractData.
    """
    extracted = extractData(input_path)
    extracted.to_csv(save_path, index=False)
    return extracted

In [14]:
# Convert the training and trial XML files to CSV and keep the resulting frames.
train_df = saveData('./DATA/Restaurants_Train.xml','./DATA/train.csv')
test_df = saveData('./DATA/restaurants-trial.xml','./DATA/test.csv')
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,text,target,label
0,But the staff was so horrible to us.,staff,negative
1,"To be completely fair, the only redeeming fact...",food,positive
2,"The food is uniformly exceptional, with a very...",food,positive
3,"The food is uniformly exceptional, with a very...",kitchen,positive
4,"The food is uniformly exceptional, with a very...",menu,neutral


# Preprocessing

In [15]:
import pandas as pd
import nltk
from enchant.checker import SpellChecker
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import keras
import numpy as np
import os

In [16]:
import re
# Scratch check of the cleanup regexes on a deliberately noisy string.
sent = 'food, ou,.()asdtsta-nssd\'    din\"  g lit*tle \'perks\' great 39pm 123sd23'
print(sent)

# Delete digits outright (no replacement character).
sent = re.sub(r'[0-9]','',sent)
# Replace each listed punctuation character with a single space.
sent = re.sub(r'[0-9,:?.\'\"!()*-]',' ',sent)
# Collapse runs of two or more spaces into one.
sent = re.sub(r'  +',' ',sent)
sent

food, ou,.()asdtsta-nssd'    din"  g lit*tle 'perks' great 39pm 123sd23


'food ou asdtsta nssd din g lit tle perks great pm sd'

In [17]:
def cutWords(content):
    """
    Tokenize ``content`` into a flat list of word tokens.

    The text is first split into sentences with nltk.sent_tokenize, then each
    sentence is split with nltk.word_tokenize; tokens keep their order.
    """
    tokens = []
    for sentence in nltk.sent_tokenize(content):
        tokens += nltk.word_tokenize(sentence)
    return tokens

    

# Shared helpers used by pre(): an en_US spell checker and a WordNet lemmatizer.
chkr = SpellChecker("en_US")
wnl = WordNetLemmatizer()

# Punctuation / suffix tokens only. The NLTK English stop-word list
# (stopwords.words('english')) is intentionally not loaded here.
stop_words = ['!',',','.','?','-s','-ly','</s>','s','(',')','\'','\"','-']


def pre(sent):
    """
    Normalize one piece of text (sentence or aspect term) for the model.

    Steps, in order:
      1. lower-case;
      2. expand a few contractions ("n't", "'ve", "'m", "'ll", "whats");
      3. replace digits and the listed punctuation with spaces and collapse
         repeated spaces;
      4. spell-correct with the module-level ``chkr`` (first suggestion wins,
         words without suggestions are kept);
      5. tokenize with cutWords and lemmatize each token with ``wnl``.

    Args:
        sent: raw text (str).

    Returns:
        str: the lemmatized tokens joined by single spaces.
    """
    # To lower case
    sent = sent.lower()

    # Expand negations. The irregular forms go first; a bare "'t" -> " not"
    # rule would turn "don't" into "don not".
    sent = sent.replace("can't", 'can not')
    sent = sent.replace("won't", 'will not')
    sent = sent.replace("shan't", 'shall not')
    sent = sent.replace("n't", ' not')
    # Whole word only, so that e.g. "whatsoever" is left alone.
    sent = re.sub(r'\bwhats\b', 'what is', sent)
    sent = sent.replace('\'ve', ' have')
    sent = sent.replace('\'m', ' am')
    sent = sent.replace('\'ll', ' will')
    sent = re.sub(r'[0-9,:?.\'\"!()*-]',' ',sent)
    sent = re.sub(r'  +',' ',sent)

    # Spell check. Corrections are applied through the checker itself so that
    # only the flagged occurrence is rewritten; a plain str.replace would also
    # rewrite the typo wherever it appears inside other, valid words.
    chkr.set_text(sent)
    for err in chkr:
        suggestions = err.suggest()
        if suggestions:
            err.replace(suggestions[0])
    sent = chkr.get_text()

    word_list = cutWords(sent)

    # Lemmatization
    lwords = [wnl.lemmatize(w) for w in word_list]

    return str(" ".join(lwords))



def getLabel(result):
    """
    Map a polarity string to its integer class id.

    Args:
        result: one of 'positive', 'neutral' or 'negative'.

    Returns:
        int: 1 for 'positive', 0 for 'neutral', -1 for 'negative'.

    Raises:
        ValueError: for any other value. An exception is raised instead of
            calling exit(), which is an interactive-shell helper and would
            terminate the interpreter or notebook session.
    """
    label_ids = {'positive': 1, 'neutral': 0, 'negative': -1}
    try:
        return label_ids[result]
    except (KeyError, TypeError):
        raise ValueError('error type: unknown label {!r}'.format(result)) from None
  
def savePreData(input_path,save_file_name):
    """
    Preprocess the 'text' and 'target' columns of a CSV and save the result.

    Args:
        input_path: CSV with 'text', 'target' and 'label' columns.
        save_file_name: name inserted into the output path
            './DATA/pre_<save_file_name>.csv'.

    Returns:
        pandas.DataFrame: the frame that was written; 'text' and 'target' are
        replaced by their pre() output and 'label' keeps its string values.
    """
    train_df = pd.read_csv(input_path,encoding='utf-8')

    # Validate the labels only: getLabel rejects any value outside
    # positive/neutral/negative. The one-hot encoding that used to be built
    # here was never saved or returned, so it is no longer computed.
    train_df['label'].apply(getLabel)

    train_df['text'] = train_df['text'].apply(pre)
    train_df['target'] = train_df['target'].apply(pre)

    df_path = './DATA/pre_{}.csv'.format(save_file_name)
    train_df.to_csv(df_path,index=False)
    return train_df

In [18]:
# Preprocess both CSVs; savePreData writes ./DATA/pre_train.csv and ./DATA/pre_test.csv.
pre_train_df = savePreData('./DATA/train.csv','train')
pre_test_df = savePreData('./DATA/test.csv','test')
# Preview the first preprocessed training rows.
pre_train_df.head()

Unnamed: 0,text,target,label
0,but the staff wa so horrible to u,staff,negative
1,to be completely fair the only redeeming facto...,food,positive
2,the food is uniformly exceptional with a very ...,food,positive
3,the food is uniformly exceptional with a very ...,kitchen,positive
4,the food is uniformly exceptional with a very ...,menu,neutral


In [19]:
# Spot-check a single preprocessed row (position 5).
pre_train_df.iloc[5,:]

text      not only wa the food outstanding but the littl...
target                                                 food
label                                              positive
Name: 5, dtype: object

# LSK

In [20]:
import spacy
import nltk
import pandas as pd
import numpy as np

import pandas as pd
import nltk
from enchant.checker import SpellChecker
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import keras
import numpy as np
import os

In [24]:
# spaCy pipeline used by getLSK to parse each sentence.
nlp = spacy.load('en_core_web_sm')
  
# sent = "The service is absolutely terrible"
# target = 'service'

# def strPre(sent):
#      # To low case
#     sent = sent.lower()
    
#     # Spell check
#     chkr.set_text(sent)
    
#     for err in chkr:
#         try:
#             sent = sent.replace(err.word,chkr.suggest(err.word)[0])
#         except IndexError:
#             continue
#     return sent
# Module-level accumulator: getLSK appends one list per processed row.
LSK = []

def _word_index_by_char(words):
    """Map every character offset covered by a word to that word's position.

    Offsets assume ``words`` came from ``sent.split(" ")``, i.e. consecutive
    entries are separated by exactly one space in the original string.
    """
    char_to_word = {}
    offset = 0
    for position, word in enumerate(words):
        for char_offset in range(offset, offset + len(word)):
            char_to_word[char_offset] = position
        offset += len(word) + 1  # skip the single separating space
    return char_to_word


def getLSK(sent,target):
    """
    Compute a 0/1 mask over the words of ``sent`` and append it to ``LSK``.

    A word is marked 1 when it is at most two dependency-parse hops away from
    a target word. Edges come from the module-level spaCy pipeline ``nlp``
    and link each token to its head and to its children.

    Tokens are aligned to the whitespace-split words by character offset
    (``token.idx``), so repeated words such as "the" or "is" keep their own
    rows, and a word that spaCy splits into several tokens is treated as one
    node. The masks of all target words are combined.

    Each target word is still located with ``words.index``, i.e. at its first
    occurrence in the sentence, because no target position is passed in.

    Args:
        sent: sentence whose words are separated by single spaces.
        target: aspect term; converted with str() and split on spaces.

    Returns:
        True. The mask, a list of len(sent.split(" ")) floats, is appended to
        the module-level ``LSK`` list as a side effect.
    """
    words = sent.split(" ")
    sent_len = len(words)
    char_to_word = _word_index_by_char(words)

    # distance[i, j] == 1 when word j is the head or a child of word i.
    distance = np.zeros((sent_len,sent_len))

    doc = nlp(sent)
    for token in doc:
        index1 = char_to_word.get(token.idx)
        if index1 is None:
            # Token does not start inside any word (e.g. a whitespace token).
            continue

        neighbours = [token.head]
        neighbours.extend(token.children)
        for other in neighbours:
            index2 = char_to_word.get(other.idx)
            # Skip unaligned tokens and self-links (the root is its own head).
            if index2 is None or index2 == index1:
                continue
            distance[index1,index2] = 1

    lsk = np.zeros(sent_len)
    for t in str(target).split(" "):
        try:
            row = words.index(t)
        except ValueError:
            continue
        # One hop: direct neighbours of the target word ...
        reach = distance[row,:].copy()
        # ... two hops: neighbours of those neighbours.
        for i in np.flatnonzero(distance[row,:]):
            reach = reach + distance[i,:]
        # Accumulate across target words instead of overwriting.
        lsk = lsk + reach

    # Counts above 1 only mean "reachable several ways"; clamp to a 0/1 mask.
    lsk = np.minimum(lsk, 1)
    LSK.append(list(lsk))
    return True
    
def saveLSK(mode):
    """
    Compute the LSK mask of every row of './DATA/pre_<mode>.csv' and save them.

    The masks are written to './DATA/<mode>/LSK.npy' as a 1-D object array
    holding one list per row (rows differ in length), so the file has to be
    loaded with ``allow_pickle=True``. The number of saved rows is printed.

    Args:
        mode: split name, e.g. 'train' or 'test'.
    """
    import os

    data_path = './DATA/pre_{}.csv'.format(mode)

    df = pd.read_csv(data_path)

    # getLSK appends to the module-level LSK list. Remember where this call
    # starts so rows left over from an earlier split are not saved again.
    start = len(LSK)
    for text, target in zip(df['text'], df['target']):
        getLSK(text, target)
    rows = LSK[start:]
    print(len(rows))

    out_dir = './DATA/{}'.format(mode)
    os.makedirs(out_dir, exist_ok=True)

    # Rows have different lengths; build the object array explicitly because
    # recent NumPy refuses to create ragged arrays implicitly.
    ragged = np.empty(len(rows), dtype=object)
    for i, row in enumerate(rows):
        ragged[i] = row

    save_path = './DATA/{}/LSK.npy'.format(mode)
    np.save(save_path,ragged)

In [25]:
# getLSK appends to the module-level LSK list, so start each split from an empty list.
LSK = []
saveLSK('train')

LSK = []
saveLSK('test')

3608
96


# InputLayer

In [5]:
# Word -> zero-based line number in the GloVe text file. Only the index is
# kept in memory; the vectors themselves are read on demand by line number.
corpus = {}
glove_model_path = './GLOVE_MODEL/glove300d.txt'

# Decode explicitly as UTF-8 rather than with the platform default, so the
# keys match what linecache (which reads the file as UTF-8 unless it declares
# another encoding) returns for the same lines. The `with` block closes the file.
with open(glove_model_path, 'r', encoding='utf-8') as f:
    for i,line in enumerate(f):
        c = line.split(" ")[0]
        corpus[c] = i
  

In [6]:
import linecache
# Sanity check: print line 5 of the GloVe file (linecache line numbers start at 1).
print(linecache.getline(glove_model_path,5))

to 0.31924 0.06316 -0.27858 0.2612 0.079248 -0.21462 -0.10495 0.15495 -0.03353 2.4834 -0.50904 0.08749 0.21426 0.22151 -0.25234 -0.097544 -0.1927 1.3606 -0.11592 -0.10383 0.21929 0.11997 -0.11063 0.14212 -0.16643 0.21815 0.0042086 -0.070012 -0.23532 -0.26518 0.031248 0.16669 -0.089777 0.20059 0.31614 -0.5583 0.075735 0.27635 0.12741 -0.18185 -0.12722 0.024686 -0.077233 -0.48998 0.020355 0.0039164 0.1215 0.089723 -0.078975 0.081443 -0.099087 -0.055621 0.10737 -0.0044042 0.48496 0.11717 -0.017329 0.109 -0.35558 0.051084 0.15714 0.17961 -0.29711 0.033645 -0.025792 -0.013931 -0.23 -0.040306 0.22282 -0.013544 0.011554 0.3911 0.26533 -0.31012 0.40539 -0.042975 0.020811 -0.33033 0.19573 -0.037958 0.10274 -0.0013581 -0.44505 0.077886 0.08511 -0.20285 -0.19481 0.056933 0.53105 0.034154 -0.56996 -0.18469 0.093403 0.28044 -0.23349 0.10938 -0.014288 -0.274 0.034196 -0.098479 0.13268 0.19437 0.13463 -0.099059 0.040324 -0.66272 0.3571 0.15429 0.18598 0.087542 0.080538 -0.25121 0.24155 0.1783 0.03601

In [7]:
# def getline(line_num):
#     # row num begin from 1 not 0
#     if line_num < 1 :return ''
#     for currline,line in enumerate(open(glove_model_path,'rU')):
#         if currline == line_num -1 : return line
#     return ''
def getLine(line_num):
    """
    Return line ``line_num`` of the GloVe file at ``glove_model_path``.

    Line numbers start at 1. The lookup goes through linecache, which returns
    an empty string when the line cannot be read.
    """
    line = linecache.getline(glove_model_path, line_num)
    return line

def gloveVec(inp):
    """
    Transform each word of each sentence to its GloVe vector (300 dim).

    Args:
        inp: iterable of sentences (str); words are split on single spaces.

    Returns:
        list: one entry per sentence, each a list with one 300-float list per
        word. Words missing from ``corpus`` get a random vector drawn
        uniformly from [-0.01, 0.01).
    """
    glove_vecs = []
    for sent in inp:
        glove_vec = []
        for w in sent.split(' '):
            try:
                index = corpus[w]
            except KeyError:
                vec = list(np.random.uniform(-0.01,0.01,(300)))
            else:
                # corpus stores zero-based line numbers; getLine is one-based.
                fields = getLine(index+1).rstrip().split(" ")[1:]
                # The builtin float replaces np.float, which was only an
                # alias for it and is no longer available in current NumPy.
                vec = list(np.array(fields).astype(float))
            glove_vec.append(vec)

        glove_vecs.append(glove_vec)
    return glove_vecs

def gloveTargetVec(inp):
    """
    Average the GloVe vectors (300 dim) of the words of one target.

    Args:
        inp: the target, either a string (split on whitespace into words) or
            an already tokenized sequence of words. None / NaN, which is what
            pandas yields for an empty CSV cell, counts as "no words".

    Returns:
        list: 300 floats, the mean of the word vectors. Words missing from
        ``corpus`` contribute a random vector drawn uniformly from
        [-0.01, 0.01). A target without words gives the zero vector.
    """
    if inp is None or (isinstance(inp, float) and np.isnan(inp)):
        words = []
    elif isinstance(inp, str):
        # Iterating a str directly would walk its characters, not its words.
        words = inp.split()
    else:
        words = list(inp)

    vec = np.zeros(300)
    if not words:
        # Avoid dividing by zero for an empty target.
        return list(vec)

    for w in words:
        try:
            index = corpus[w]
        except KeyError:
            vec = vec + np.random.uniform(-0.01,0.01,(300))
        else:
            # corpus stores zero-based line numbers; getLine is one-based.
            fields = getLine(index+1).rstrip().split(" ")[1:]
            # The builtin float replaces np.float, which was only an alias
            # for it and is no longer available in current NumPy.
            vec = vec + np.array(fields).astype(float)

    vec = vec/len(words)
    return list(vec)

def saveGloveVec(mode):
    """
    Embed './DATA/pre_<mode>.csv' with GloVe and save the vectors as .npy.

    Writes two files under './DATA/<mode>/' (the directory is created when
    missing):
      * glove_vec.npy: 1-D object array with one entry per row, each the list
        of word vectors returned by gloveVec. Sentences differ in length, so
        the file has to be loaded with ``allow_pickle=True``.
      * glove_target_vec.npy: one gloveTargetVec result per row.

    Args:
        mode: split name, 'train' or 'test'.
    """
    import os

    path = './DATA/pre_{}.csv'.format(mode)
    data = pd.read_csv(path)
    train_x = data['text']
    train_target = data['target']

    out_dir = './DATA/{}'.format(mode)
    os.makedirs(out_dir, exist_ok=True)

    glove_vec = gloveVec(train_x)

    # Sentences have different lengths; build the object array explicitly
    # because recent NumPy refuses to create ragged arrays implicitly.
    ragged = np.empty(len(glove_vec), dtype=object)
    for i, sent_vecs in enumerate(glove_vec):
        ragged[i] = sent_vecs

    glove_vec_path = "./DATA/{}/glove_vec.npy".format(mode)
    np.save(glove_vec_path,ragged)

    # Iterate the values directly instead of label-indexing with range(len).
    golve_target_vec = [gloveTargetVec(target) for target in train_target]

    glove_target_vec_path = "./DATA/{}/glove_target_vec.npy".format(mode)
    np.save(glove_target_vec_path,golve_target_vec)

In [8]:
# Build and save the GloVe sentence and target vectors for both splits.
saveGloveVec('train')
saveGloveVec('test')