## Data Source:

http://thinknook.com/twitter-sentiment-analysis-training-corpus-dataset-2012-09-22/

## Word Embeddings:

https://nlp.stanford.edu/projects/glove/

In [219]:
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import Embedding, Flatten, Dense, Dropout, Conv1D, MaxPool1D, concatenate, Input, Reshape, Conv2D, MaxPool2D
from keras.optimizers import Adam
import numpy as np
import spacy
from collections import Counter

In [116]:
##Hyper-parameters

n_total_sentences = 10000
size_vocab = 5000
sentence_size = 25

## Pre-process Twitter Data

In [60]:
with open("../data/Sentiment Analysis Dataset.csv", "r") as f:
    sentiment = []
    sentences = []
    for i, line in enumerate(f):
        if i == 0: continue
        columns = line.split(",")
        sentiment.append(columns[1])
        sentences.append(columns[3].strip().lower())

In [61]:
print(len(sentiment))

1578627


In [62]:
sentences = sentences[:n_total_sentences]
sentiment = sentiment[:n_total_sentences]

In [63]:
print(sentences[0])
print(sentiment[0])

is so sad for my apl friend.............
0


In [64]:
nlp = spacy.load('en')



    Only loading the 'en' tokenizer.



In [65]:
tokenized_sentences = [nlp(s) for s in sentences]

In [66]:
all_vocab = [w.text for s in tokenized_sentences for w in s]

In [67]:
print(len(all_vocab))

135482


In [68]:
word_counts = Counter(all_vocab)

In [98]:
print("N Words: {}".format(len(word_counts)))
print("Average number of times a word appears: {}".format(np.mean(list(word_counts.values()))))

N Words: 17688
Average number of times a word appears: 7.659543193125283


In [76]:
word2index = {w[0]: i for i, w in enumerate(word_counts.most_common(size_vocab))}
index2word = {v: k for k, v in word2index.items()}

In [90]:
def sentences_to_index(sentences):
    indexed_sentences = []
    for s in sentences:
        tmp = []
        for w in s:
            try:
                tmp.append(word2index[w.text])
            except:
                tmp.append(size_vocab)
        indexed_sentences.append(tmp)
    return indexed_sentences


def indexed_to_words(sentence):
    words = []
    for index in sentence:
        try:
            words.append(index2word[index])
        except:
            words.append("<unk>")
    return words

sentences_indexed = sentences_to_index(tokenized_sentences)

In [91]:
tokenized_sentences[0]

is so sad for my apl friend.............

In [92]:
indexed_to_words(sentences_indexed[0])

['is', 'so', 'sad', 'for', 'my', '<unk>', 'friend', '.............']

In [105]:
msk = np.random.rand(len(sentences_indexed)) < 0.8
trn_sentences = np.array(sentences_indexed)[msk]
val_sentences = np.array(sentences_indexed)[~msk]
trn_sentiment = np.array(sentiment)[msk]
val_sentiment = np.array(sentiment)[~msk]

In [106]:
print("Training Size: {}".format(len(trn_sentences)))
print("Validation Size: {}".format(len(val_sentences)))

Training Size: 7948
Validation Size: 2052


In [115]:
lens = np.array(list(map(len, trn_sentences)))
(lens.max(), lens.min(), lens.mean())

(89, 1, 13.535480624056367)

In [119]:
trn_sentences = sequence.pad_sequences(trn_sentences, maxlen=sentence_size, value=size_vocab+1)
val_sentences = sequence.pad_sequences(val_sentences, maxlen=sentence_size, value=size_vocab+1)

In [120]:
trn_sentences.shape

(7948, 25)

## Simple NN

In [163]:
model = Sequential([
    Embedding(size_vocab+2, 64, input_length=sentence_size),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

In [164]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 25, 64)            320128    
_________________________________________________________________
flatten_11 (Flatten)         (None, 1600)              0         
_________________________________________________________________
dense_18 (Dense)             (None, 100)               160100    
_________________________________________________________________
dropout_9 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 1)                 101       
Total params: 480,329
Trainable params: 480,329
Non-trainable params: 0
_________________________________________________________________


In [165]:
model.fit(trn_sentences, trn_sentiment, validation_data=(val_sentences, val_sentiment), epochs=3, batch_size=64)

Train on 7948 samples, validate on 2052 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x260fef0b8>

## 1D Conv

In [170]:
conv1 = Sequential([
    Embedding(size_vocab+2, 64, input_length=sentence_size),
    Dropout(0.2),
    Conv1D(64, 5, padding='same', activation='relu'),
    Dropout(0.2),
    MaxPool1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

In [171]:
conv1.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
conv1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 25, 64)            320128    
_________________________________________________________________
dropout_16 (Dropout)         (None, 25, 64)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 25, 64)            20544     
_________________________________________________________________
dropout_17 (Dropout)         (None, 25, 64)            0         
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 12, 64)            0         
_________________________________________________________________
flatten_14 (Flatten)         (None, 768)               0         
_________________________________________________________________
dense_24 (Dense)             (None, 100)               76900     
__________

In [172]:
conv1.fit(trn_sentences, trn_sentiment, validation_data=(val_sentences, val_sentiment), epochs=3, batch_size=64)

Train on 7948 samples, validate on 2052 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x265475f28>

## Multi-Size CNN

In [225]:
graph_in = Input ((size_vocab+2, 64))
convs = [ ] 
for fsz in range (3, 6): 
    x = Conv1D(64, fsz, padding='same', activation="relu")(graph_in)
    x = MaxPool1D()(x) 
    x = Flatten()(x) 
    convs.append(x)
out = concatenate(convs) 
graph = Model(graph_in, out)

In [226]:
ms_cnn = Sequential ([
    Embedding(size_vocab+2, 64, input_length=sentence_size),
    Dropout (0.2),
    graph,
    Dropout (0.5),
    Dense (100, activation="relu"),
    Dropout (0.7),
    Dense (1, activation='sigmoid')
    ])

In [227]:
ms_cnn.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
ms_cnn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_24 (Embedding)     (None, 25, 64)            320128    
_________________________________________________________________
dropout_37 (Dropout)         (None, 25, 64)            0         
_________________________________________________________________
model_7 (Model)              multiple                  49344     
_________________________________________________________________
dropout_38 (Dropout)         (None, 2304)              0         
_________________________________________________________________
dense_38 (Dense)             (None, 100)               230500    
_________________________________________________________________
dropout_39 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_39 (Dense)             (None, 1)                 101       
Total para

In [228]:
ms_cnn.fit(trn_sentences, trn_sentiment, validation_data=(val_sentences, val_sentiment), epochs=3, batch_size=64)

Train on 7948 samples, validate on 2052 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x26e32d748>

## Glove Embeddings - TODO

## Predict

In [229]:
def raw_sentence_to_index(sentence):
    s = nlp(sentence)
    ind_s = []
    for w in s:
        try:
            ind_s.append(word2index[w.text])
        except:
            ind_s.append(size_vocab)
    padded = sequence.pad_sequences([ind_s], maxlen=sentence_size, value=size_vocab+1)[0]
    return np.expand_dims(padded, 1).T

In [230]:
indx = raw_sentence_to_index("I love this movie!")

In [231]:
ms_cnn.predict(indx)

array([[ 0.9045459]], dtype=float32)

In [232]:
indx = raw_sentence_to_index("The movie seemed good, but in the end, was terrible.")
ms_cnn.predict(indx)

array([[ 0.02032357]], dtype=float32)