## Movie Review

In [37]:
from load_sentiment import load_sentiment_data
from text_utils import build_dict, generate_sequence
from models import build_lstm

In [32]:
# %load load_sentiment.py

# scikit-learn < 0.18 (sklearn.cross_validation is deprecated since 0.18 and removed in 0.20)
from sklearn.cross_validation import train_test_split
from text_utils import build_dict, generate_sequence

# scikit-learn >= 0.18
# from sklearn.model_selection import train_test_split

def load_sentiment_file(filename):
    """Read a text file into a list of token lists, one per line.

    Every line is stripped of surrounding whitespace and then split on single
    spaces.

    Args:
        filename: path of the text file to read.

    Returns:
        A list with one entry per line of the file; each entry is the list of
        tokens on that line. A blank line yields ``[""]`` and consecutive
        spaces yield empty-string tokens, because the split is on ``" "``.

    Raises:
        IOError / OSError: if the file cannot be opened.
    """
    # The context manager closes the handle even when reading fails; the
    # previous version opened the file and never closed it.
    with open(filename) as f:
        return [line.strip().split(" ") for line in f]

def load_sentiment_corpus(dirpath):
    """Load the positive and negative polarity files found under ``dirpath``.

    Reads ``rt-polarity.pos`` and ``rt-polarity.neg`` with
    ``load_sentiment_file`` and labels each sentence after the file it came
    from.

    Args:
        dirpath: directory holding the two files, with or without a trailing
            path separator.

    Returns:
        A tuple ``(X, Y)``. ``X`` is the list of tokenised sentences, positive
        ones first; ``Y`` is the parallel list of ``"pos"`` / ``"neg"`` labels.
    """
    import os  # local import so the function does not depend on module-level imports

    # os.path.join adds a separator only when one is missing, so callers that
    # already pass a trailing "/" get the same path as with plain concatenation.
    X_pos = load_sentiment_file(os.path.join(dirpath, "rt-polarity.pos"))
    X_neg = load_sentiment_file(os.path.join(dirpath, "rt-polarity.neg"))

    X = X_pos + X_neg
    Y = ["pos"] * len(X_pos) + ["neg"] * len(X_neg)

    return (X, Y)

def load_subjectivity_corpus(dirpath):
    """Load the quote and plot files found under ``dirpath``.

    Reads ``quote.tok.gt9.5000`` and ``plot.tok.gt9.5000`` with
    ``load_sentiment_file`` and labels each sentence after the file it came
    from.

    Args:
        dirpath: directory holding the two files, with or without a trailing
            path separator.

    Returns:
        A tuple ``(X, Y)``. ``X`` is the list of tokenised sentences, quote
        sentences first; ``Y`` is the parallel list of ``"quote"`` / ``"plot"``
        labels.
    """
    import os  # local import so the function does not depend on module-level imports

    # os.path.join adds a separator only when one is missing, so callers that
    # already pass a trailing "/" get the same path as with plain concatenation.
    X_quote = load_sentiment_file(os.path.join(dirpath, "quote.tok.gt9.5000"))
    X_plot = load_sentiment_file(os.path.join(dirpath, "plot.tok.gt9.5000"))

    X = X_quote + X_plot
    Y = ["quote"] * len(X_quote) + ["plot"] * len(X_plot)

    return (X, Y)

def load_sentiment_data(dirpath, max_tokens = 0):
    """Load the polarity corpus, split it 90/10 and encode it for training.

    Args:
        dirpath: directory passed on to ``load_sentiment_corpus``.
        max_tokens: when greater than 0, only dictionary entries whose value
            is below this bound are used to encode the sentences. The default
            0 keeps the whole dictionary.

    Returns:
        ``(X_train, X_test, y_train, y_test, worddict)`` where the ``X_*``
        values come from ``generate_sequence``, the ``y_*`` values are lists
        of integer class ids, and ``worddict`` is the unfiltered dictionary
        built from the training sentences.
    """
    (X, Y) = load_sentiment_corpus(dirpath)
    # Fixed random_state keeps the 90/10 split the same on every run.
    X_train_sentences, X_test_sentences, y_train_label, y_test_label = train_test_split(
        X, Y, test_size=0.1, random_state=43)

    # The dictionary is built from the training split only.
    # NOTE(review): values are presumably frequency-ranked word indices —
    # confirm against text_utils.build_dict.
    worddict, _wordcount = build_dict(X_train_sentences)

    if max_tokens > 0:
        # .items() works on Python 2 and 3; .iteritems() exists only on Python 2.
        filterdict = {k: v for k, v in worddict.items() if v < max_tokens}
    else:
        filterdict = worddict

    # One pre-formatted string prints the same text whether `print` is the
    # Python 2 statement or the Python 3 function.
    # NOTE(review): the +2 presumably accounts for reserved indices — confirm.
    print('Filtering to %d  unique words' % (len(filterdict) + 2))

    X_train = generate_sequence(X_train_sentences, filterdict)
    X_test = generate_sequence(X_test_sentences, filterdict)

    # Sort the labels before numbering them: enumerating a bare set gives an
    # order that is not guaranteed to be stable, so class ids could change
    # between runs.
    labels = sorted(set(y_train_label) | set(y_test_label))
    catdict = {label: idx for idx, label in enumerate(labels)}

    y_train = [catdict[y] for y in y_train_label]
    y_test = [catdict[y] for y in y_test_label]

    return X_train, X_test, y_train, y_test, worddict

In [33]:
# Location of the rt-polarity files read by load_sentiment_data.
dirpath = "/home/ec2-user/data/rt-polaritydata/"
# Passed as max_tokens: dictionary entries with a value >= this bound are not
# used when encoding the sentences. Also reused below as the first argument
# to build_lstm.
max_features = 10000

# Returns the encoded train/test sequences, integer class ids and the
# unfiltered word dictionary.
X_train, X_test, Y_train, Y_test, worddict = load_sentiment_data(dirpath, max_tokens=max_features)

Building dictionary.. 201209  total words  20247  unique words
Filtering to 10000  unique words


In [34]:
# Summarise the encoded dataset: number of classes and size of each split.
# Y_train and Y_test are still plain lists of class ids here, so `+` concatenates.
nb_classes = len(set(Y_train + Y_test))
print('Num. classes:', nb_classes)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

# Longest sequence in each split; these values guide the choice of the
# padding length used later.
maxlen_train = max(len(x) for x in X_train)
maxlen_test  = max(len(x) for x in X_test)
print('Max length train', maxlen_train)
print('Max length test', maxlen_test)

('Num. classes:', 2)
(9595, 'train sequences')
(1067, 'test sequences')
('Max length train', 56)
('Max length tet', 59)


In [35]:
# Padded sequence length. The longest sequence reported above is 59 tokens
# (test split), so 60 fits every sequence.
maxlen = 60  
# Mini-batch size used for both model.fit and model.evaluate below.
batch_size = 32

In [36]:
from keras.preprocessing import sequence

# Bring every sequence in both splits to the common length `maxlen`.
print('Pad sequences (samples x time)')
# The tuple of inputs is evaluated before either name is rebound, and a
# generator expression does not leak its loop variable into the notebook
# namespace.
X_train, X_test = (
    sequence.pad_sequences(split, maxlen=maxlen) for split in (X_train, X_test)
)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

Using Theano backend.
Using gpu device 0: GRID K520 (CNMeM is disabled, cuDNN 5103)


Pad sequences (samples x time)
('X_train shape:', (9595, 60))
('X_test shape:', (1067, 60))


In [38]:
from keras.utils import np_utils

# Turn the integer class ids into one row per sample with nb_classes columns,
# the label format used with the categorical_crossentropy loss.
# NOTE(review): Y_train / Y_test are overwritten in place, so this cell looks
# like it must run exactly once after the data-loading cell; re-running it
# would feed the already converted matrices back into to_categorical — confirm.
print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
Y_train = np_utils.to_categorical(Y_train, nb_classes)
Y_test = np_utils.to_categorical(Y_test, nb_classes)
print('Y_train shape:', Y_train.shape)
print('Y_test shape:', Y_test.shape)

Convert class vector to binary class matrix (for use with categorical_crossentropy)
('Y_train shape:', (9595, 2))
('Y_test shape:', (1067, 2))


In [39]:
# Build the classifier with models.build_lstm and print its layer table.
# NOTE(review): 128 is presumably the embedding / LSTM width (the summary
# shows 128-wide Embedding and LSTM outputs) — confirm against build_lstm.
model = build_lstm(max_features, 128, nb_classes)
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, None, 128)     1280000     embedding_input_1[0][0]          
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 128)           131584      embedding_1[0][0]                
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 2)             258         lstm_1[0][0]                     
____________________________________________________________________________________________________
activation_1 (Activation)        (None, 2)             0           dense_1[0][0]                    
Total params: 1,411,842
Trainable params: 1,411,842
Non-trainable params: 0
_______________

In [40]:
# Configure training: categorical cross-entropy loss (labels were converted
# to a binary class matrix above), the Adam optimizer, and accuracy as the
# reported metric.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


In [None]:
# Train for 15 epochs, then score the model.
# NOTE(review): X_test / Y_test is passed as validation_data during fit and is
# also the set used for the final evaluate call, so the reported test metrics
# come from the same split that was monitored during training.
print('Train...')
model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=15,
          validation_data=(X_test, Y_test))
# With metrics=['accuracy'] configured at compile time, evaluate returns the
# loss followed by the accuracy.
score, acc = model.evaluate(X_test, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
