## Data Source:

http://thinknook.com/twitter-sentiment-analysis-training-corpus-dataset-2012-09-22/

## Word Embeddings:

https://nlp.stanford.edu/projects/glove/

In [64]:
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import Embedding, Flatten, Dense, Dropout, Conv1D, MaxPool1D, concatenate, Input, Reshape, Conv2D, MaxPool2D
from keras.optimizers import Adam
import numpy as np
import spacy
from collections import Counter
import re

In [2]:
##Hyper-parameters

n_total_sentences = 10000
size_vocab = 5000
sentence_size = 25

## Pre-process Twitter Data

In [3]:
with open("../data/Sentiment Analysis Dataset.csv", "r") as f:
    sentiment = []
    sentences = []
    for i, line in enumerate(f):
        if i == 0: continue
        columns = line.split(",")
        sentiment.append(columns[1])
        sentences.append(columns[3].strip().lower())

In [4]:
print(len(sentiment))

1578627


In [5]:
sentences = sentences[:n_total_sentences]
sentiment = sentiment[:n_total_sentences]

In [6]:
print(sentences[0])
print(sentiment[0])

is so sad for my apl friend.............
0


In [7]:
nlp = spacy.load('en')



    Only loading the 'en' tokenizer.



In [8]:
tokenized_sentences = [nlp(s) for s in sentences]

In [9]:
all_vocab = [w.text for s in tokenized_sentences for w in s]

In [10]:
print(len(all_vocab))

135482


In [11]:
word_counts = Counter(all_vocab)

In [12]:
print("N Words: {}".format(len(word_counts)))
print("Average number of times a word appears: {}".format(np.mean(list(word_counts.values()))))

N Words: 17688
Average number of times a word appears: 7.659543193125283


In [13]:
word2index = {w[0]: i for i, w in enumerate(word_counts.most_common(size_vocab))}
index2word = {v: k for k, v in word2index.items()}

In [14]:
def sentences_to_index(sentences):
    indexed_sentences = []
    for s in sentences:
        tmp = []
        for w in s:
            try:
                tmp.append(word2index[w.text])
            except:
                tmp.append(size_vocab)
        indexed_sentences.append(tmp)
    return indexed_sentences


def indexed_to_words(sentence):
    words = []
    for index in sentence:
        try:
            words.append(index2word[index])
        except:
            words.append("<unk>")
    return words

sentences_indexed = sentences_to_index(tokenized_sentences)

In [15]:
tokenized_sentences[0]

is so sad for my apl friend.............

In [16]:
indexed_to_words(sentences_indexed[0])

['is', 'so', 'sad', 'for', 'my', '<unk>', 'friend', '.............']

In [17]:
msk = np.random.rand(len(sentences_indexed)) < 0.8
trn_sentences = np.array(sentences_indexed)[msk]
val_sentences = np.array(sentences_indexed)[~msk]
trn_sentiment = np.array(sentiment)[msk]
val_sentiment = np.array(sentiment)[~msk]

In [18]:
print("Training Size: {}".format(len(trn_sentences)))
print("Validation Size: {}".format(len(val_sentences)))

Training Size: 8045
Validation Size: 1955


In [19]:
lens = np.array(list(map(len, trn_sentences)))
(lens.max(), lens.min(), lens.mean())

(60, 1, 13.462150403977626)

In [20]:
trn_sentences = sequence.pad_sequences(trn_sentences, maxlen=sentence_size, value=size_vocab+1)
val_sentences = sequence.pad_sequences(val_sentences, maxlen=sentence_size, value=size_vocab+1)

In [21]:
trn_sentences.shape

(8045, 25)

## Simple NN

In [22]:
model = Sequential([
    Embedding(size_vocab+2, 64, input_length=sentence_size),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

In [23]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 64)            320128    
_________________________________________________________________
flatten_1 (Flatten)          (None, 1600)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               160100    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 480,329
Trainable params: 480,329
Non-trainable params: 0
_________________________________________________________________


In [24]:
model.fit(trn_sentences, trn_sentiment, validation_data=(val_sentences, val_sentiment), epochs=3, batch_size=64)

Train on 8045 samples, validate on 1955 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x118529898>

## 1D Conv

In [25]:
conv1 = Sequential([
    Embedding(size_vocab+2, 64, input_length=sentence_size),
    Dropout(0.2),
    Conv1D(64, 5, padding='same', activation='relu'),
    Dropout(0.2),
    MaxPool1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

In [26]:
conv1.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
conv1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 25, 64)            320128    
_________________________________________________________________
dropout_2 (Dropout)          (None, 25, 64)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 25, 64)            20544     
_________________________________________________________________
dropout_3 (Dropout)          (None, 25, 64)            0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 12, 64)            0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 768)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               76900     
__________

In [27]:
conv1.fit(trn_sentences, trn_sentiment, validation_data=(val_sentences, val_sentiment), epochs=3, batch_size=64)

Train on 8045 samples, validate on 1955 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x11f109cf8>

## Multi-Size CNN

In [28]:
graph_in = Input ((size_vocab+2, 64))
convs = [ ] 
for fsz in range (3, 6): 
    x = Conv1D(64, fsz, padding='same', activation="relu")(graph_in)
    x = MaxPool1D()(x) 
    x = Flatten()(x) 
    convs.append(x)
out = concatenate(convs) 
graph = Model(graph_in, out)

In [29]:
ms_cnn = Sequential ([
    Embedding(size_vocab+2, 64, input_length=sentence_size),
    Dropout (0.2),
    graph,
    Dropout (0.5),
    Dense (100, activation="relu"),
    Dropout (0.7),
    Dense (1, activation='sigmoid')
    ])

In [30]:
ms_cnn.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
ms_cnn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 25, 64)            320128    
_________________________________________________________________
dropout_5 (Dropout)          (None, 25, 64)            0         
_________________________________________________________________
model_1 (Model)              multiple                  49344     
_________________________________________________________________
dropout_6 (Dropout)          (None, 2304)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 100)               230500    
_________________________________________________________________
dropout_7 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 101       
Total para

In [31]:
ms_cnn.fit(trn_sentences, trn_sentiment, validation_data=(val_sentences, val_sentiment), epochs=3, batch_size=64)

Train on 8045 samples, validate on 1955 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x116bc0748>

## Glove Embeddings - TODO

In [77]:
def get_word_vectors(filename):
    vecs, words, wordidx = [], [], {}
    with open(filename, "r") as f:
        for i, line in enumerate(f):
            tokens = line.split(" ")
            vecs.append([float(x) for x in tokens[1:]])
            words.append(tokens[0])
            wordidx[tokens[0]] = i
    return np.array(vecs), np.array(words), wordidx

def create_emb(vecs, words, wordidx, vocab_size):
    # don't do any casing, should i?
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    for i in range(0,len(emb)-2):
        word = index2word[i]
        if word in wordidx and re.match(r"^[a-zA-Z0-9\-]*$", word):
            src_idx = wordidx[word]
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = np.random.normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = np.random.normal(scale=0.6, size=(n_fact,))
    emb[-2] = np.random.normal(scale=0.6, size=(n_fact,))
    emb/=3
    return emb

In [61]:
vecs, words, wordidx = get_word_vectors("../data/glove.twitter.27B.50d.txt")

In [78]:
emb = create_emb(vecs, words, wordidx, size_vocab+2)

In [91]:
conv1_we = Sequential([
    Embedding(size_vocab+2, 50, input_length=sentence_size, 
              weights=[emb], trainable=False),
    Dropout(0.25),
    Conv1D(64, 5, padding='same', activation='relu'),
    Dropout(0.25),
    MaxPool1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])
conv1_we.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [92]:
conv1_we.fit(trn_sentences, trn_sentiment, validation_data=(val_sentences, val_sentiment), epochs=5, batch_size=64)

Train on 8045 samples, validate on 1955 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x15606a470>

## Precision and Recall

In [93]:
from sklearn.metrics import precision_recall_fscore_support

In [97]:
cutoff_value = 0.5
val_sentiment_int = val_sentiment.astype(np.uint8)
binary_predictions = [x > cutoff_value for x in conv1.predict(val_sentences)]
precision_recall_fscore_support(val_sentiment_int, binary_predictions)

(array([ 0.78242321,  0.74201788]),
 array([ 0.81948168,  0.69497608]),
 array([ 0.80052379,  0.71772699]),
 array([1119,  836]))

## Predict

In [32]:
def raw_sentence_to_index(sentence):
    s = nlp(sentence)
    ind_s = []
    for w in s:
        try:
            ind_s.append(word2index[w.text])
        except:
            ind_s.append(size_vocab)
    padded = sequence.pad_sequences([ind_s], maxlen=sentence_size, value=size_vocab+1)[0]
    return np.expand_dims(padded, 1).T

In [33]:
indx = raw_sentence_to_index("I love this movie!")

In [34]:
ms_cnn.predict(indx)

array([[ 0.82185811]], dtype=float32)

In [35]:
indx = raw_sentence_to_index("The movie seemed good, but in the end, was terrible.")
ms_cnn.predict(indx)

array([[ 0.01227814]], dtype=float32)