# Predict Tweets Sentiment 

Recurrent Neural networks to predict the sentiment of various tweets. We would like to predict the tweets as positive or negative. You can download the dataset here(https://github.com/crwong/cs224u-project/tree/master/data/sentiment).

We have around 1600000 tweets to train our network. Let’s now use RNNs to classify various tweets as positive or negative.

### What i learnt
 - Using pretrained Glove Embeddings
 - Converting input text into Word Embeddings
 - Using RNN in Keras framework

In [1]:
# import all libraries
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers.convolutional import Conv1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import spacy
nlp = spacy.load('en')


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [86]:
#load the dataset
import pandas as pd
train=pd.read_csv("C:/Users/vindla/Downloads/Cg_DS4/AV_NLP/RNN-Tweets/training.1600000.processed.noemoticon.csv" , encoding= "latin-1")
Y_train = train[train.columns[0]]
X_train = train[train.columns[5]]


In [100]:
# split the data into test and train
from sklearn.model_selection import train_test_split
trainset1x, trainset2x, trainset1y, trainset2y = train_test_split(X_train.values, Y_train.values, test_size=0.02,random_state=42 )

trainset2y=pd.get_dummies(trainset2y)
trainset1y=pd.get_dummies(trainset1y)

In [101]:
# function to remove stopwords
def stopwords(sentence):
    new=[]
    sentence=nlp(sentence)
    for w in sentence:
        if (w.is_stop == False) & (w.pos_ !="PUNCT"):
            new.append(w.string.strip())
        c=" ".join(str(x) for x in new)
    return c

In [102]:
# function to lemmatize the tweets
def lemmatize(sentence):
    sentence=nlp(sentence)
    str=""
    for w in sentence:
        str+=" "+w.lemma_
    return nlp(str)

In [103]:
#loading the glove model
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r',encoding="utf8")
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = [float(val) for val in splitLine[1:]]
        model[word] = embedding
    print ("Done."),len(model),(" words loaded!")
    return model

In [104]:
# save the glove model
model=loadGloveModel("C:/Users/vindla/Downloads/Cg_DS4/AV_NLP/RNN-Tweets/glove.twitter.27B.200d.txt")

Loading Glove Model
Done.


In [105]:
#vectorising the sentences
def sent_vectorizer(sent, model):
    sent_vec = np.zeros(200)
    numw = 0
    for w in sent.split():
        try:
            sent_vec = np.add(sent_vec, model[str(w)])
            numw+=1
        except:
            pass
    return sent_vec

In [106]:
trainsetx = trainset1x
trainsety = trainset1y

In [107]:
#obtain a clean vector
cleanvector=[]
for i in range(trainsetx.shape[0]):
    document=trainsetx[i]
    document=document.lower()
    document=lemmatize(document)
    document=str(document)
    cleanvector.append(sent_vectorizer(document,model))

In [108]:
#Getting the input and output in proper shape
cleanvector=np.array(cleanvector)
cleanvector =cleanvector.reshape(len(cleanvector),200,1)

In [109]:
#tokenizing the sequences
tokenizer = Tokenizer(num_words=16000)
tokenizer.fit_on_texts(trainsetx)
sequences = tokenizer.texts_to_sequences(trainsetx)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=15, padding="post")
print(data.shape)

Found 29863 unique tokens.
(19599, 15)


In [110]:
#reshape the data and preparing to train
data=data.reshape(len(cleanvector),15,1)
from sklearn.model_selection import train_test_split
trainx, validx, trainy, validy = train_test_split(data, trainsety, test_size=0.3,random_state=42 )

In [111]:
'''
# Example to understand tokenization and text to sequence
docs = ['Well done!',
		'Good work',
		'Great effort',
		'nice work',
		'Excellent!']
# create the tokenizer
t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(docs)
# summarize what was learned
print(t.word_counts)
print(t.document_count)
print(t.word_index)
print(t.word_docs)
print(t.texts_to_sequences(docs))
word_index = t.word_index
'''

"\n# Example to understand tokenization and text to sequence\ndocs = ['Well done!',\n\t\t'Good work',\n\t\t'Great effort',\n\t\t'nice work',\n\t\t'Excellent!']\n# create the tokenizer\nt = Tokenizer()\n# fit the tokenizer on the documents\nt.fit_on_texts(docs)\n# summarize what was learned\nprint(t.word_counts)\nprint(t.document_count)\nprint(t.word_index)\nprint(t.word_docs)\nprint(t.texts_to_sequences(docs))\nword_index = t.word_index\n"

In [112]:
#calculate the number of words
nb_words=len(tokenizer.word_index)+1

#obtain theembedding matrix
embedding_matrix = np.zeros((nb_words, 200))
for word, i in word_index.items():
    embedding_vector = model.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

trainy=np.array(trainy)
validy=np.array(validy)

Null word embeddings: 12340


In [113]:
print(trainx.shape)
print(validx.shape)
print(trainy.shape)
print(validy.shape)

(13719, 15, 1)
(5880, 15, 1)
(13719, 2)
(5880, 2)


In [114]:
#building a simple RNN model
def modelbuild():
    model = Sequential()
    model.add(keras.layers.InputLayer(input_shape=(15,1)))
    keras.layers.embeddings.Embedding(nb_words, 15, weights=[embedding_matrix], input_length=15,
    trainable=False)
 
    model.add(keras.layers.recurrent.SimpleRNN(units = 100, activation='relu',
    use_bias=True))
    model.add(keras.layers.Dense(units=1000, input_dim = 2000, activation='sigmoid'))
    model.add(keras.layers.Dense(units=500, input_dim=1000, activation='relu'))
    model.add(keras.layers.Dense(units=2, input_dim=500,activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [115]:
#compiling the model
finalmodel = modelbuild()
finalmodel.fit(trainx, trainy, epochs=10, batch_size=120,validation_data=(validx,validy))

Train on 13719 samples, validate on 5880 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1bc9dc296a0>

In [99]:
# References
finalmodel.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 15, 1)             0         
_________________________________________________________________
simple_rnn_7 (SimpleRNN)     (None, 100)               10200     
_________________________________________________________________
dense_19 (Dense)             (None, 1000)              101000    
_________________________________________________________________
dense_20 (Dense)             (None, 500)               500500    
_________________________________________________________________
dense_21 (Dense)             (None, 2)                 1002      
Total params: 612,702
Trainable params: 612,702
Non-trainable params: 0
_________________________________________________________________


In [None]:
'''
References
https://www.analyticsvidhya.com/blog/2017/12/introduction-to-recurrent-neural-networks/

Great article with good functional python code. Thanks

Couple of errors found, here are the resolutions:
1. Parenthesis missing at the end of this.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']

2. no. of train examples should be len(cleanvectors), instead of 32000
cleanvector =cleanvector.reshape(32000,200,1)

3.  replace record count with len(cleanvectors), instead of 32000
data=data.reshape(32000,15,1)
ValueError: cannot reshape array of size 6000 into shape (32000,15,1)
'''
