In [1]:
# import the required libraries
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from sklearn.preprocessing import LabelEncoder

In [2]:
# read the given data set using pandas
data = pd.read_csv('Sentiment.csv')
data.head()

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


In [3]:
# get the counts
print(data.groupby('sentiment').nunique())

             id  candidate  candidate_confidence  relevant_yn  \
sentiment                                                       
Negative   8493         11                   676            1   
Neutral    3142         11                   494            1   
Positive   2236         11                   338            1   

           relevant_yn_confidence  sentiment_confidence  subject_matter  \
sentiment                                                                 
Negative                      358                   895              12   
Neutral                       313                   686              12   
Positive                      181                   562              11   

           subject_matter_confidence  candidate_gold  name  relevant_yn_gold  \
sentiment                                                                      
Negative                        1111               6  6334                 1   
Neutral                          704               3  2754

In [4]:
# Keeping only the neccessary columns
data = data[['text','sentiment']]
data.head()

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive


In [5]:
# printed few samples
print(type(data['text'][0]))
print(data['text'][1])
print(data['text'][2])

<class 'str'>
RT @ScottWalker: Didn't catch the full #GOPdebate last night. Here are some of Scott's best lines in 90 seconds. #Walker16 http://t.co/ZSfF…
RT @TJMShow: No mention of Tamir Rice and the #GOPDebate was held in Cleveland? Wow.


In [6]:
data['text'] = data['text'].apply(lambda x: x.lower()) # all string to lowercase Read more about lambda() "https://realpython.com/python-lambda/"
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x))) # using regular expression preprocess the text by removing everything that is not [a-zA-z0-9\s]

In [7]:
print(type(data['text']))

<class 'pandas.core.series.Series'>


In [8]:
print(data['text'][0])
print(data['text'][1])
print(data['text'][2])

rt nancyleegrahn how did everyone feel about the climate change question last night exactly gopdebate
rt scottwalker didnt catch the full gopdebate last night here are some of scotts best lines in 90 seconds walker16 httptcozsff
rt tjmshow no mention of tamir rice and the gopdebate was held in cleveland wow


In [9]:
for idx, row in data.iterrows(): # Iterate over DataFrame rows as (index, Series) pairs.
    row[0] = row[0].replace('rt', '') # replace/ remove 'rt' in the start of the text

In [10]:
# printed the samples again 
print(data['text'][0])
print(data['text'][1])
print(data['text'][2])

 nancyleegrahn how did everyone feel about the climate change question last night exactly gopdebate
 scottwalker didnt catch the full gopdebate last night here are some of scotts best lines in 90 seconds walker16 httptcozsff
 tjmshow no mention of tamir rice and the gopdebate was held in cleveland wow


In [11]:
max_fatures = 2000

tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)

In [12]:
# Transforms each text in texts to a sequence of integers.
# Only top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.

# Transforms each text in texts to a sequence of integers. 
# So it basically takes each word in the text and replaces it with its corresponding integer value from the word_index dictionary.
X = tokenizer.texts_to_sequences(data['text'].values)

In [13]:
# tokenizer.get_config()
# Returns the tokenizer configuration as Python dictionary.
print(tokenizer.get_config().keys())

# word_counts: A dictionary of words and their counts.
print("\ntokenizer.word_counts") 
print(tokenizer.word_counts) 

# document_count:An integer count of the total number of documents that were used to fit the Tokenizer.
print("\ntokenizer.document_count")
print(tokenizer.document_count) 

# word_index: A dictionary of words and their uniquely assigned integers.
print("\ntokenizer.word_index")
print(tokenizer.word_index)

# word_docs: A dictionary of words and how many documents each appeared in.
print("\ntokenizer.word_docs")
print(tokenizer.word_docs)



print('\nLen() of X:', len(X))
print('\n', X[:2])

dict_keys(['num_words', 'filters', 'lower', 'split', 'char_level', 'oov_token', 'document_count', 'word_counts', 'word_docs', 'index_docs', 'index_word', 'word_index'])

tokenizer.word_counts

tokenizer.document_count
13871

tokenizer.word_index

tokenizer.word_docs

Len() of X: 13871

 [[52, 78, 338, 449, 22, 2, 413, 361, 95, 29, 51, 1023, 1], [348, 123, 1937, 2, 588, 1, 29, 51, 226, 35, 195, 5, 172, 1400, 10, 1561, 1336, 833]]


In [14]:
# Check the sequence of the text, Do we need to Pad ??
for i in range(4):
  print(X[i])
  print('len=', len(X[i]))

[52, 78, 338, 449, 22, 2, 413, 361, 95, 29, 51, 1023, 1]
len= 13
[348, 123, 1937, 2, 588, 1, 29, 51, 226, 35, 195, 5, 172, 1400, 10, 1561, 1336, 833]
len= 18
[62, 491, 5, 8, 2, 1, 21, 1781, 10, 685, 512]
len= 11
[17, 276, 235, 6, 723, 96, 164, 24, 130, 5, 2, 176, 10, 1, 211, 773, 16]
len= 17


In [15]:
X = pad_sequences(X) # Pads sequences to the same length.
print('X.shape = ', X.shape)

X.shape =  (13871, 28)


In [16]:
# Check the sequence after padding, Which padding pre or post??
for i in range(4):
  print(X[i])
  print('len=', len(X[i]))

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0   52   78  338  449   22    2  413  361   95   29   51 1023    1]
len= 28
[   0    0    0    0    0    0    0    0    0    0  348  123 1937    2
  588    1   29   51  226   35  195    5  172 1400   10 1561 1336  833]
len= 28
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0   62  491    5    8    2    1   21 1781   10  685  512]
len= 28
[  0   0   0   0   0   0   0   0   0   0   0  17 276 235   6 723  96 164
  24 130   5   2 176  10   1 211 773  16]
len= 28


In [17]:
embed_dim = 128
lstm_out = 196
# created the model
def createmodel():
    model = Sequential()
    # created the embedding layer
    model.add(Embedding(max_fatures, embed_dim, input_length = X.shape[1]))
    # created the LSTM layer
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    model.summary()
    return model

In [18]:
# encoded the target variable
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['sentiment'])
y = to_categorical(integer_encoded)

X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)

In [19]:
model = createmodel()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 28, 128)           256000    
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 3)                 591       
Total params: 511,391
Trainable params: 511,391
Non-trainable params: 0
_________________________________________________________________


In [20]:
batch_size = 32
# trained the model after compilation
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 1)
# evalutated the model after training
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size)
# printed the metrics
print(score)
print(acc)
print(model.metrics_names)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
144/144 - 1s - loss: 0.9404 - accuracy: 0.6553
0.9403626918792725
0.6553080081939697
['loss', 'accuracy']


In [23]:
# saved the model
model.save("model1.h5")

In [24]:
from keras.models import load_model
 
# load model
model1 = load_model('model1.h5')

In [25]:
# read the to be predicted data
data = pd.read_csv('predictquestion1.csv', encoding='Windows-1252')
data.head()
data['0'][0]

"“A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump”"

In [26]:
# did the preprocessing to the data which is to predicted
data['0'] = data['0'].apply(lambda x: x.lower()) # all string to lowercase Read more about lambda() "https://realpython.com/python-lambda/"
data['0'] = data['0'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))
import numpy as np
tokenizer.fit_on_texts(data['0'].values)
# convertd to the integer format
Xp = tokenizer.texts_to_sequences(data['0'].values)
# manually padded the sequence at the front of the data
arr = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0])
arr = np.append(arr,Xp[0])
#input data to the model
print("input data to the model"+str(arr))
# predicted using the saved model
X=model1.predict(arr)
print("Predicted Value",np.argmax(X[1]))

input data to the model[   0    0    0    0    0    0    0    0    0    7  423    5  143  282
   35   29   35  363 1770    2  338    8  251    7  149]
Predicted Value 2
