In [0]:
import os
import numpy as np
import pandas as pd
import spacy


In [0]:
import io
# Both splits are read the same way: tab-separated, with no header row in the file.
train, test = (
    pd.read_csv(path, sep="\t", header=None)
    for path in ('traindata.csv', 'devdata.csv')
)

In [0]:
# Load the English spaCy pipeline used by the text-filtering helper below.
# The 'en' shortcut link only exists on spaCy 2.x; spaCy 3 removed shortcut
# links and raises OSError for it, so fall back to the model package name.
try:
  nlp = spacy.load('en')
except OSError:
  nlp = spacy.load('en_core_web_sm')

In [0]:
# Function to extract adjectives and verbs from reviews (the words that tend to express emotion)

def filter_av(text):
  """Return the lemmas of the verbs and adjectives in ``text`` as one string.

  The text is run through the module-level spaCy pipeline ``nlp``. Tokens
  flagged as stop words or punctuation are dropped, and of the remaining
  tokens only those whose coarse part-of-speech tag is ``VERB`` or ``ADJ``
  are kept.

  Args:
    text: Review text. Anything that is not a ``str`` (for example the NaN
      pandas uses for a missing cell) is treated as an empty review.

  Returns:
    The kept lemmas in document order, separated by single spaces and with
    no trailing separator; ``""`` when nothing qualifies.
  """
  # Guard first so missing values never reach the spaCy pipeline.
  if not isinstance(text, str):
    return ""
  kept_pos = ("VERB", "ADJ")
  return " ".join(
      tok.lemma_
      for tok in nlp(text)
      if not tok.is_stop and not tok.is_punct and tok.pos_ in kept_pos
  )

In [0]:
def preprocess(df):
  """Name the raw columns and add the verb/adjective text column.

  The frame is modified in place and also returned, so both
  ``df = preprocess(df)`` and a bare ``preprocess(df)`` work.

  Args:
    df: DataFrame whose first five columns are, in order, polarity,
      category, aspect, offsets and review.

  Returns:
    The same DataFrame with named columns and a ``review_av`` column holding
    ``filter_av`` applied to each review.

  Raises:
    ValueError: raised by pandas when the columns are not already named and
      the frame does not have exactly five columns.
  """
  expected = ['polarity','category','aspect','offsets','review']
  # Rename only when needed: a frame that already went through this function
  # has six columns, and assigning five names to it would raise. Skipping
  # the rename makes re-running the notebook cell safe.
  if list(df.columns[:len(expected)]) != expected:
    df.columns = expected
  df["review_av"] = df["review"].apply(filter_av)
  return df

In [0]:
# Name the raw training columns and add the filtered `review_av` text column.
train = preprocess(train)

In [0]:
# Preview the first rows to sanity-check the derived `review_av` column.
train.head()

Unnamed: 0,polarity,category,aspect,offsets,review,review_av
0,positive,AMBIENCE#GENERAL,seating,18:25,short and sweet – seating is great:it's romant...,short sweet great be romantic cozy private
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...,quaint romantic
2,positive,FOOD#QUALITY,food,98:102,The have over 100 different beers to offer thi...,different offer thi happy delicious recommend
3,negative,SERVICE#GENERAL,STAFF,5:10,THIS STAFF SHOULD BE FIRED.,should be
4,positive,FOOD#STYLE_OPTIONS,menu,4:8,"The menu looked great, and the waiter was very...",look great nice come average


In [0]:
# Apply the same preprocessing to the dev/test frame so it has the same
# named columns and `review_av` column as the training frame.
test = preprocess(test)

In [0]:
# use keras to tokenize
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
import numpy as np 

# Hyper-parameters shared by the tokenizer, the padding step and the model:
#   embedding_dim - width of each word vector in the Embedding layer
#   max_words     - vocabulary cap handed to the Keras Tokenizer
#   maxlen        - length every index sequence is padded/truncated to
embedding_dim, max_words, maxlen = 100, 10000, 100

# Fit the vocabulary on the training text only, then reuse it for both splits.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train["review_av"])

sequences = tokenizer.texts_to_sequences(train["review_av"])
x_train = pad_sequences(sequences, maxlen=maxlen)

x_test = pad_sequences(
    tokenizer.texts_to_sequences(test["review_av"]),
    maxlen=maxlen,
)


In [0]:
# categorize label
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Map the polarity strings to integer ids, then one-hot encode them.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train["polarity"])

# Fix the one-hot width to the number of classes seen in training. Without
# it, to_categorical infers the width from the largest id present, so a
# dev set missing the highest-indexed class would get fewer columns than
# y_train.
num_classes = len(encoder.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)

y_test = encoder.transform(test["polarity"])
y_test = to_categorical(y_test, num_classes=num_classes)

In [0]:
# Import every layer/model class this cell uses so it runs on a fresh kernel
# (only LSTM was imported before).
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM

# Polarity classifier: learned word embeddings -> LSTM -> small dense head.
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
# `dropout` / `recurrent_dropout` are the Keras 2 names of the Keras 1
# arguments `dropout_W` / `dropout_U` (input and recurrent dropout).
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
# Softmax (not sigmoid) so the three outputs form a probability distribution,
# which is what categorical cross-entropy on one-hot labels expects.
# NOTE(review): 3 assumes exactly three distinct polarity labels - confirm.
model.add(Dense(3, activation='softmax'))
model.summary()

  """


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_37 (Embedding)     (None, 100, 100)          1000000   
_________________________________________________________________
lstm_19 (LSTM)               (None, 32)                17024     
_________________________________________________________________
dense_73 (Dense)             (None, 16)                528       
_________________________________________________________________
dropout_29 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_74 (Dense)             (None, 3)                 51        
Total params: 1,017,603
Trainable params: 1,017,603
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Training setup: RMSprop optimiser, categorical cross-entropy loss, accuracy metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

# Fit for 8 epochs in mini-batches of 32, using the dev split as validation
# data; the returned History object is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=8,
    validation_data=(x_test, y_test),
)

Train on 1503 samples, validate on 376 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
