# Sentiment Analysis of Text Based Content

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Embedding

In [3]:
df = pd.read_csv('TwitterData.csv')
df.clean_text=df.clean_text.astype(str)

In [4]:
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [5]:
df.columns

Index(['clean_text', 'category'], dtype='object')

In [11]:
df = df[df['category'] != 0.0 ]
print(df.shape)
df.head(5)

(107767, 2)


Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
8,with upcoming election india saga going import...,1.0


In [12]:
df['category'] = df['category'].replace(to_replace={-1:'Negative',1:'Positive'}) 

In [13]:
df.dtypes

clean_text    object
category      object
dtype: object

In [14]:
df["category"].value_counts()

Positive    72250
Negative    35510
Name: category, dtype: int64

In [15]:
sentiment_label = df.category.factorize() #Converting categorical values to numeric values
sentiment_label

(array([0, 1, 1, ..., 0, 0, 1], dtype=int64),
 Index(['Negative', 'Positive'], dtype='object'))

In [16]:
#Tokenizing the words/sentences into small parts "Token"
tweet = df.clean_text.values
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweet) #creates an association between the words and the assigned numbers
vocab_size = len(tokenizer.word_index) + 1
encoded_docs = tokenizer.texts_to_sequences(tweet) #replacing the words with their assigned number
padded_sequence = pad_sequences(encoded_docs, maxlen=200) #Use padding to pad the sentences to have equal length.

In [17]:
print(tokenizer.word_index)



In [18]:
print(tweet[0])
print(encoded_docs[0])

when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples
[46, 1, 341, 72, 1993, 885, 40, 2821, 2, 961, 205, 2, 208, 33, 164, 109, 48, 69, 1048, 208, 58, 3, 8, 540, 3, 58, 4281, 3, 2614]


In [19]:
print(padded_sequence[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0   46    1  341   72 1993  885   40 2821    2  961  205
    2  208   33  164  109   48   69 1048  208   58    3    8  540    3
   58 

In [21]:
embedding_vector_length = 32
model = Sequential() 
model.add(Embedding(vocab_size, embedding_vector_length, input_length=200) )
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid')) 
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])  
print(model.summary()) 

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 32)           2969984   
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 200, 32)          0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 50)                16600     
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                                 
Total params: 2,986,635
Trainable params: 2,986,635
Non-trainable params: 0
____________________________________________

In [22]:
history = model.fit(padded_sequence,sentiment_label[0],validation_split=0.2, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
def predict_sentiment(text):
    tw = tokenizer.texts_to_sequences([text])
    tw = pad_sequences(tw,maxlen=200)
    prediction = int(model.predict(tw).round().item())
    print("Predicted label: ", sentiment_label[1][prediction])

In [30]:
test_sentence1 = "You are soo beautiful"
predict_sentiment(test_sentence1)

test_sentence2 = "You creep, just get lost from here!"
predict_sentiment(test_sentence2)

Predicted label:  Positive
Predicted label:  Negative
