## Movie Review

In [None]:
# %load load_sentiment.py

def load_sentiment_file(filename):
    """Read a tokenized corpus file, one sentence per line.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of list of str
        One entry per line of the file; each line is stripped of
        surrounding whitespace and split on single spaces.
    """
    # The context manager closes the handle even if reading raises;
    # the previous version left the file open.
    with open(filename) as f:
        return [line.strip().split(" ") for line in f]

def load_sentiment_corpus(filepath):
    """Load the positive/negative polarity corpus from a directory.

    Parameters
    ----------
    filepath : str
        Directory containing ``rt-polarity.pos`` and ``rt-polarity.neg``
        (with or without a trailing path separator).

    Returns
    -------
    (X, Y) : tuple
        ``X`` is the list of tokenized sentences (all positive sentences
        first, then all negative ones); ``Y`` is the parallel list of
        labels, ``"pos"`` or ``"neg"``.
    """
    import os

    # Build the paths from the argument; the previous version ignored it
    # and read a notebook-global `dirpath` instead.
    pos_filename = os.path.join(filepath, "rt-polarity.pos")
    neg_filename = os.path.join(filepath, "rt-polarity.neg")

    X_pos = load_sentiment_file(pos_filename)
    X_neg = load_sentiment_file(neg_filename)

    Y_pos = ["pos"] * len(X_pos)
    Y_neg = ["neg"] * len(X_neg)

    X = X_pos + X_neg
    Y = Y_pos + Y_neg

    return (X, Y)

def load_subjectivity_corpus(filepath):
    """Load the quote/plot subjectivity corpus from a directory.

    Parameters
    ----------
    filepath : str
        Directory containing ``quote.tok.gt9.5000`` and
        ``plot.tok.gt9.5000`` (with or without a trailing path separator).

    Returns
    -------
    (X, Y) : tuple
        ``X`` is the list of tokenized sentences (all quote sentences
        first, then all plot ones); ``Y`` is the parallel list of labels,
        ``"quote"`` or ``"plot"``.
    """
    import os

    # Build the paths from the argument; the previous version ignored it
    # and read a notebook-global `dirpath` instead.
    quote_filename = os.path.join(filepath, "quote.tok.gt9.5000")
    plot_filename = os.path.join(filepath, "plot.tok.gt9.5000")

    X_quote = load_sentiment_file(quote_filename)
    X_plot = load_sentiment_file(plot_filename)

    Y_quote = ["quote"] * len(X_quote)
    Y_plot = ["plot"] * len(X_plot)

    X = X_quote + X_plot
    Y = Y_quote + Y_plot

    return (X, Y)

In [None]:
import numpy as np

def build_dict(sentences):
    '''
    Build dictionary of train words

    Inputs:
     - sentences: iterable of tokenized sentences (each an iterable of words)

    Outputs:
     - Dictionary of word --> word index. Indices start at 2 for the most
       frequent word; 0 and 1 are left unassigned (1 is the id that
       generate_sequence gives to unknown words).
     - Dictionary of word --> word count freq
    '''
    import sys

    # sys.stdout.write keeps the single-line progress output identical on
    # Python 2 and Python 3 (the print statement is Python-2 only).
    sys.stdout.write('Building dictionary.. ')

    # For each word in each sentence, accumulate its frequency
    wordcount = dict()
    for ss in sentences:
        for w in ss:
            wordcount[w] = wordcount.get(w, 0) + 1

    # Materialize parallel lists: Python 3 dict views can neither be
    # indexed nor sorted positionally by np.argsort.
    keys = list(wordcount.keys())       # List of words
    counts = [wordcount[k] for k in keys]  # List of frequencies

    # Positions of the words from most to least frequent
    sorted_idx = np.argsort(counts)[::-1]

    worddict = dict()
    for idx, ss in enumerate(sorted_idx):
        worddict[keys[ss]] = idx + 2  # leave 0 and 1 (UNK)

    sys.stdout.write('%d  total words  %d  unique words\n' % (sum(counts), len(keys)))

    return worddict, wordcount

def generate_sequence(sentences, dictionary):
    '''
    Encode tokenized sentences as lists of integer word ids.

    Each word is replaced by its entry in `dictionary`; words that are
    not in `dictionary` are encoded as 1.
    '''
    unknown_id = 1
    encoded = []
    for tokens in sentences:
        encoded.append([dictionary.get(token, unknown_id) for token in tokens])
    return encoded


In [7]:
# scikit-learn < 0.18 (sklearn.cross_validation was deprecated in 0.18 and later removed)
from sklearn.cross_validation import train_test_split

# scikit-learn >= 0.18
# from sklearn.model_selection import train_test_split

def load_sentiment_data(path):
    """Load the polarity corpus and return integer-encoded train/test splits.

    Parameters
    ----------
    path : str
        Directory handed to ``load_sentiment_corpus``.

    Returns
    -------
    X_train, X_test : list of list of int
        Sentences encoded with a vocabulary built from the training split
        only; words unseen in training are encoded as 1.
    y_train, y_test : list of int
        Class ids, assigned by sorting the label strings.
    """
    # Use the argument; the previous version ignored it and read the
    # notebook-global `dirpath`.
    (X, Y) = load_sentiment_corpus(path)
    X_train_sentences, X_test_sentences, y_train_label, y_test_label = train_test_split(
        X, Y, test_size=0.1, random_state=43)

    # The vocabulary comes from the training sentences only.
    worddict, _ = build_dict(X_train_sentences)

    X_train = generate_sequence(X_train_sentences, worddict)
    X_test = generate_sequence(X_test_sentences, worddict)

    # Sort the labels so the label -> id mapping is reproducible;
    # enumerating a bare set depends on hash order.
    labels = sorted(set(y_train_label) | set(y_test_label))
    catdict = {label: idx for (idx, label) in enumerate(labels)}

    y_train = [catdict[y] for y in y_train_label]
    y_test = [catdict[y] for y in y_test_label]

    return X_train, X_test, y_train, y_test

In [10]:
# Directory holding the polarity corpus files.
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
dirpath = "/home/ec2-user/data/rt-polaritydata/"

# Integer-encoded sentences and integer class ids for the train/test split.
X_train, X_test, Y_train, Y_test = load_sentiment_data(dirpath)

Building dictionary.. 201209  total words  20247  unique words


In [11]:
# Number of distinct class ids seen across both splits.
nb_classes = len(set(Y_train).union(Y_test))
print('Num. classes:', nb_classes)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

# Longest sentence (in tokens) in each split.
maxlen_train = max(map(len, X_train))
maxlen_test = max(map(len, X_test))
print('Max length train', maxlen_train)
print('Max length tet', maxlen_test)

('Num. classes:', 2)
(9595, 'train sequences')
(1067, 'test sequences')
('Max length train', 56)
('Max length tet', 59)


In [12]:
# Vocabulary size passed as the first argument of the Embedding layers;
# larger than the 20247 unique words reported when the dictionary was built.
max_features = 30000
# Length every sequence is padded to; above the longest sentence
# reported for either split (56 train / 59 test tokens).
maxlen = 60  
# Mini-batch size used for both fit() and evaluate().
batch_size = 32

In [13]:
from keras.preprocessing import sequence

# Bring every encoded sentence to the same length `maxlen`.
# NOTE: X_train / X_test are rebound here from lists of lists to the
# objects returned by pad_sequences (which expose `.shape`).
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

Using Theano backend.
Using gpu device 0: GRID K520 (CNMeM is disabled, cuDNN 5103)


Pad sequences (samples x time)
('X_train shape:', (9595, 60))
('X_test shape:', (1067, 60))


In [14]:
# %load models.py

from keras.utils import np_utils

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Embedding
from keras.layers import Convolution1D, MaxPooling1D
from keras.layers import LSTM, GRU, SimpleRNN
#from keras.layers import Input, Bidirectional

def build_lstm(max_features, embedding_dims, nb_classes):
    """Build an Embedding -> LSTM -> softmax classifier (uncompiled).

    Parameters
    ----------
    max_features : int
        Vocabulary size given to the Embedding layer.
    embedding_dims : int
        Size of the word embedding vectors.
    nb_classes : int
        Number of output classes.

    Returns
    -------
    keras.models.Sequential
        The assembled model; the caller is responsible for compile().
    """
    model = Sequential()

    # Honour `embedding_dims`; the previous version hard-coded 128 here
    # and silently ignored the argument.
    model.add(Embedding(max_features, embedding_dims, dropout=0.2))
    model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    return model


def build_cnn(embedding_dims, maxlen, nb_filter, filter_length, hidden_dims, nb_classes, vocab_size=None):
    """Build an Embedding -> Conv1D -> global max-pool -> Dense classifier (uncompiled).

    Parameters
    ----------
    embedding_dims : int
        Size of the word embedding vectors.
    maxlen : int
        Length of the (padded) input sequences.
    nb_filter : int
        Number of convolution filters.
    filter_length : int
        Width, in words, of each convolution filter.
    hidden_dims : int
        Units in the hidden dense layer.
    nb_classes : int
        Number of output classes.
    vocab_size : int, optional
        Vocabulary size for the Embedding layer. When omitted, the
        notebook-level ``max_features`` is used, as before.

    Returns
    -------
    keras.models.Sequential
        The assembled model; the caller is responsible for compile().
    """
    if vocab_size is None:
        # Backward-compatible fallback to the global this function used
        # to read implicitly.
        vocab_size = max_features

    model = Sequential()
    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    model.add(Embedding(vocab_size,
                        embedding_dims,
                        input_length=maxlen,
                        dropout=0.2))

    # we add a Convolution1D, which will learn nb_filter
    # word group filters of size filter_length:
    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length,
                            border_mode='valid',
                            activation='relu',
                            subsample_length=1))
    # we use max pooling over the whole remaining sequence length:
    model.add(MaxPooling1D(pool_length=model.output_shape[1]))

    # We flatten the output of the conv layer,
    # so that we can add a vanilla dense layer:
    model.add(Flatten())

    # We add a vanilla hidden layer:
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))

    # We project onto nb_classes output units and normalize with a softmax:
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    return model


def build_cnn_lstm(embedding_size, maxlen, nb_filter, filter_length, pool_length, lstm_output_size, nb_classes, vocab_size=None):
    """Build an Embedding -> Conv1D -> max-pool -> LSTM -> softmax classifier (uncompiled).

    Parameters
    ----------
    embedding_size : int
        Size of the word embedding vectors.
    maxlen : int
        Length of the (padded) input sequences.
    nb_filter : int
        Number of convolution filters.
    filter_length : int
        Width, in words, of each convolution filter.
    pool_length : int
        Window of the max-pooling layer.
    lstm_output_size : int
        Units in the LSTM layer.
    nb_classes : int
        Number of output classes.
    vocab_size : int, optional
        Vocabulary size for the Embedding layer. When omitted, the
        notebook-level ``max_features`` is used, as before.

    Returns
    -------
    keras.models.Sequential
        The assembled model; the caller is responsible for compile().
    """
    if vocab_size is None:
        # Backward-compatible fallback to the global this function used
        # to read implicitly.
        vocab_size = max_features

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, input_length=maxlen))
    model.add(Dropout(0.25))
    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length,
                            border_mode='valid',
                            activation='relu',
                            subsample_length=1))
    model.add(MaxPooling1D(pool_length=pool_length))
    model.add(LSTM(lstm_output_size))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    return model

#def build_bidirectional_lstm(embedding_dims, lstm_output_size, nb_classes):
#    model = Sequential()
#    model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
#    model.add(Bidirectional(LSTM(lstm_output_size)))
#    model.add(Dropout(0.5))
#    model.add(Dense(nb_classes, activation='softmax'))
    
#    return model

In [15]:
from keras.utils import np_utils

# Turn the integer class ids into one row per sample with nb_classes columns.
# NOTE(review): Y_train / Y_test are rebound in place, so re-running this
# cell would feed the already-converted matrices back into to_categorical;
# re-run the data-loading cell first -- TODO confirm behaviour on 2-D input.
print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
Y_train = np_utils.to_categorical(Y_train, nb_classes)
Y_test = np_utils.to_categorical(Y_test, nb_classes)
print('Y_train shape:', Y_train.shape)
print('Y_test shape:', Y_test.shape)

Convert class vector to binary class matrix (for use with categorical_crossentropy)
('Y_train shape:', (9595, 2))
('Y_test shape:', (1067, 2))


In [16]:
# Instantiate the LSTM classifier; 128 is passed as `embedding_dims`.
model = build_lstm(max_features, 128, nb_classes)
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, None, 128)     3840000     embedding_input_1[0][0]          
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 128)           131584      embedding_1[0][0]                
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 2)             258         lstm_1[0][0]                     
____________________________________________________________________________________________________
activation_1 (Activation)        (None, 2)             0           dense_1[0][0]                    
Total params: 3,971,842
Trainable params: 3,971,842
Non-trainable params: 0
_______________

In [17]:
# categorical_crossentropy matches the one-hot label matrices produced by
# to_categorical and the softmax output layer of the model.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


In [None]:
print('Train...')
# NOTE(review): the test split doubles as validation data, so the
# validation metrics reported during fit() and the final evaluate() call
# below are computed on the same samples.
model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=15,
          validation_data=(X_test, Y_test))
# evaluate() is unpacked as (loss, accuracy): `metrics=['accuracy']` was
# requested at compile time.
score, acc = model.evaluate(X_test, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
