### Lets Import Necessary Modules

In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load Our Dataset

df  = pd.read_csv('sentiment analysis.csv',encoding='latin1')

In [3]:
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


***Text Preprocessing And Handiling Imbalance Data(Over sampling)***

In [4]:
df.dropna(inplace=True)

In [5]:
df.replace({'neutral':0,'negative':1,'positive':2},inplace=True)

In [6]:
df0_counts,df1_counts,df2_counts = df.sentiment.value_counts()

df_over2 = df[df['sentiment']==2]
df_over0 = df[df['sentiment']==0]
df_over1 = df[df['sentiment']==1]

dfover_2 = df_over2.sample(df0_counts,replace=True)
dfover_1 = df_over1.sample(df0_counts,replace=True)

df = pd.concat([dfover_2,dfover_1,df_over0],axis=0)

In [7]:
df.sentiment.value_counts()

2    11117
1    11117
0    11117
Name: sentiment, dtype: int64

In [8]:
text_message = df['selected_text'].str.lower()

In [9]:
stemming = PorterStemmer()

***Lowering and Removing unwananted special character***

In [10]:
corpus = []
# Assuming df is your DataFrame
for i in text_message:
    data = i
    text = re.sub('[^a-zA-Z]',' ',str(data))
    tex = text.lower()
    text = text.strip()
#     sen = nltk.word_tokenize(text)
#     text = [stemming.stem(word) for word in sen if word not in stopwords.words('english')]
#     text = ' '.join(text)
    corpus.append(text)

In [13]:
len(corpus)

33351

In [14]:
corpus[0]

'all systems up and running smoothly'

**One Hot Encoding**

In [15]:
vocab_size = 5000

In [16]:
onehot_rep = [one_hot(i,vocab_size) for i in corpus]

In [104]:
onehot_rep[9]

[1110, 3640]

***Pad Sequence For Fixed Length***

In [18]:
max_length = 20

embedding_docs = pad_sequences(onehot_rep,padding='post',maxlen=max_length)

In [19]:
embedding_docs.shape

(33351, 20)

### Model Building

In [21]:
embedding_vector_features = 100

model = Sequential([
    Embedding(vocab_size,embedding_vector_features,input_length=max_length),
    LSTM(200,return_sequences=True),
    LSTM(100),
    Dense(3,activation='softmax')
])

In [22]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 100)           500000    
                                                                 
 lstm_3 (LSTM)               (None, 20, 200)           240800    
                                                                 
 lstm_4 (LSTM)               (None, 100)               120400    
                                                                 
 dense_1 (Dense)             (None, 3)                 303       
                                                                 
Total params: 861503 (3.29 MB)
Trainable params: 861503 (3.29 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


***compile our model***

In [23]:
model.compile(optimizer='adam',
             loss=tf.keras.losses.SparseCategoricalCrossentropy(),
             metrics=['acc'])

callback = tf.keras.callbacks.EarlyStopping(monitor='acc',patience=2)

**split into train and test**

In [24]:
x = np.array(embedding_docs)
y = np.array(df['sentiment'])

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [25]:
len(x_train),len(y_train)

(26680, 26680)

In [26]:
x_train.shape,y_train.shape

((26680, 20), (26680,))

**train our model**

In [27]:
histroy = model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=20,callbacks=[callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


**Save our model**

In [28]:
model.save('sentiment analysis LSTM_v1.keras')

**Evaluate our model**

In [31]:
model.evaluate(x_test,y_test)



[0.6985680460929871, 0.8818767666816711]

In [79]:
class_name = ['netural','negative','positive']

**Lets test and play with our model**

In [98]:
text = ['why your are bullying me']
onehot_re = [one_hot(i,vocab_size) for i in text]

embedding_docs = pad_sequences(onehot_re,padding='post',maxlen=max_length)

In [99]:
embedding_docs

array([[2925, 4459, 3925, 2599, 2325,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]])

In [100]:
class_name[np.argmax(model.predict(embedding_docs))]



'negative'

Finally the performance of our model is well