# Imports

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
from nltk import word_tokenize
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
from keras.callbacks import Callback
from keras.models import model_from_json

Using TensorFlow backend.


Estimate sentiment of words using word vectors.  
In this section, we'll use the SemEval-2015 English Twitter Sentiment Lexicon.  
The lexicon was used as an official test set in the SemEval-2015 shared Task #10: Subtask E, 
and contains a polarity score 
for words in range -1 (negative) to 1 (positive) - http://saifmohammad.com/WebPages/SCL.html#OPP


Build a classifier for the sentiment of a word given its word vector. Split the data to a train and test sets, and report
 the model performance on both sets.

Use your trained model from the previous question to predict the sentiment score of words in the lyrics corpus that are not
part of the original sentiment dataset. Review the words with the highest positive and negative sentiment.
Do the results make sense?

In [4]:
#basic cleaning 
import re
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords # Import the stop word list
from gensim.models import Word2Vec



#Preprocessing tasks:
#1) Remove punctuation marks, numbers and non-text characters.
#2) Move to lowercase.
#5) Tokenization with a stemmer.


def Clean_non_alphabeta(x):
    x = re.sub("[^a-zA-Z\n]"," ",str(x))
    return x


def Remove_words(x, word_set):
    y = "\n".join(
                [" ".join(
                    [w for w in line.split(' ') if w not in word_set]
                          )
                    for line in x.split('\n')
                ]
    )
    return y


def Apply_stemmer(x, stemmer):
    y = "\n".join(
                [" ".join(
                    [stemmer.stem(w) for w in line.split(' ')]
                          )
                    for line in x.split('\n')
                ]
    )
    return y




df = pd.read_csv('/Temp/SemEval2015-English-Twitter-Lexicon.csv')
df=df.dropna(subset=['SentimentText'])
df['SentimentText'] = df.SentimentText.astype('str')
df['SentimentText']=df['SentimentText'].apply(lambda x: x.lower())
df['SentimentText']=df['SentimentText'].apply(Clean_non_alphabeta)
porter = nltk.PorterStemmer()
df['SentimentText']=df['SentimentText'].apply(lambda x: Apply_stemmer(x, porter))
df.to_csv('/Temp/SemEval2015-English-Twitter-Lexicon_basic_cleaning.csv', index=None)
df.head()

Unnamed: 0,rank,SentimentText,Sentiment
0,0.984,love,1
1,0.984,inspir,1
2,0.969,amaz,1
3,0.969,peac,1
4,0.953,great,1


In [11]:
#Splitting for training and testing
x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(1000000).tokens),
                                                    np.array(data.head(1000000).Sentiment), test_size=0.2)


# Build tweet vector to give input to FFNN

In [0]:
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size)) * tfidf[word] #combining w2v vectors with tfidf value of words in the tweet.
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [None]:
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)

# Training 3 layered FFNN

In [0]:
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=200))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train, epochs=100, batch_size=10000, verbose=2)

Epoch 1/100
 - 0s - loss: 0.7618 - acc: 0.4813
Epoch 2/100
 - 0s - loss: 0.7273 - acc: 0.5204
Epoch 3/100
 - 0s - loss: 0.7058 - acc: 0.5537
Epoch 4/100
 - 0s - loss: 0.6888 - acc: 0.5812
Epoch 5/100
 - 0s - loss: 0.6750 - acc: 0.5987
Epoch 6/100
 - 0s - loss: 0.6633 - acc: 0.6203
Epoch 7/100
 - 0s - loss: 0.6531 - acc: 0.6436
Epoch 8/100
 - 0s - loss: 0.6439 - acc: 0.6611
Epoch 9/100
 - 0s - loss: 0.6354 - acc: 0.6686
Epoch 10/100
 - 0s - loss: 0.6276 - acc: 0.6869
Epoch 11/100
 - 0s - loss: 0.6202 - acc: 0.7036
Epoch 12/100
 - 0s - loss: 0.6132 - acc: 0.7111
Epoch 13/100
 - 0s - loss: 0.6064 - acc: 0.7227
Epoch 14/100
 - 0s - loss: 0.5998 - acc: 0.7344
Epoch 15/100
 - 0s - loss: 0.5935 - acc: 0.7460
Epoch 16/100
 - 0s - loss: 0.5874 - acc: 0.7535
Epoch 17/100
 - 0s - loss: 0.5815 - acc: 0.7627
Epoch 18/100
 - 0s - loss: 0.5758 - acc: 0.7660
Epoch 19/100
 - 0s - loss: 0.5699 - acc: 0.7760
Epoch 20/100
 - 0s - loss: 0.5643 - acc: 0.7835
Epoch 21/100
 - 0s - loss: 0.5582 - acc: 0.7868
E

<keras.callbacks.History at 0x7f6dc577c278>

In [0]:
# Evaluating accuracy score

score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
print(model.metrics_names[0],": ",score[0],"\n",model.metrics_names[1],": ",score[1])

loss :  0.8369897011902642 
 acc :  0.5714285742008804


# Saving model

In [0]:
#Saving the model
model_json = model.to_json() # serialize model to JSON
with open("model_my_new.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights("smodel_my_new.h5") # serialize weights to HDF5
print("Saved model to disk")

#Loading the model
# newmodel = model_from_json(open('model.json').read())
# newmodel.load_weights('smodel.h5')

Saved model to disk


# Predicting for test file (Validation)

In [0]:
def ingesttest():
    testdata = pd.read_csv('/content/data/trainingandtestdata/tweetstest.csv', encoding='latin-1')
    testdata.columns=["Sentiment","ItemID","Date","Blank","SentimentSource","SentimentText"]
    testdata.drop(['ItemID', 'SentimentSource'], axis=1, inplace=True)
    testdata = testdata[testdata.Sentiment.isnull() == False]
    testdata['Sentiment'] = testdata['Sentiment'].map( {4:1, 0:0, 2:1})
    testdata = testdata[testdata['SentimentText'].isnull() == False]
    testdata.reset_index(inplace=True)
    testdata.drop('index', axis=1, inplace=True)
    print ('dataset loaded with shape', testdata.shape  )  
    return testdata

testdata = ingesttest()

dataset loaded with shape (497, 4)


In [0]:
test_X=np.array(testdata.tokens)
test_y=np.array(testdata.Sentiment)

In [0]:
test_w2v_vecs = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x:x, test_X))])










  









497it [00:00, 6312.15it/s][A[A[A[A[A[A[A[A[A

In [0]:
# model.predict_classes(test_w2v_vecs)
score = model.evaluate(test_w2v_vecs,test_y, batch_size=128, verbose=2)
print(model.metrics_names[0],": ",score[0],"\n",model.metrics_names[1],": ",score[1])

loss :  0.6691735330481884 
 acc :  0.6438631725982881


# Testing the model test dataset