<a href="https://colab.research.google.com/github/abhinav-bagwari/Twitter-Sentiment-Analylsis-using-LSTM/blob/main/Twitter_Sentiment_Analylsis_using_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
## Word2Vec
import gensim

In [None]:
import nltk

# Fetch the English stop-word corpus used by the cleaning step; NLTK reports
# "already up-to-date" and does nothing when the package is cached locally.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# The CSV has no header row. Without `header=None` pandas consumes the first
# tweet as column names, which is why the stored outputs of this notebook show
# 1,599,999 rows and 799,999 negative labels. The real column names are
# assigned right after loading.
df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/twitter.csv",
    encoding="ISO-8859-1",  # the file is not valid UTF-8
    header=None,
)

In [None]:
# Name the six raw columns, then keep only the label and the tweet body.
df.columns = ["target", "Id", "Date", "Flag", "Name", "text"]
df.drop(columns=["Id", "Date", "Flag", "Name"], inplace=True)
df.head()

Unnamed: 0,target,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [None]:
# Inspect row count, column dtypes and non-null counts of the trimmed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599999 entries, 0 to 1599998
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1599999 non-null  int64 
 1   text    1599999 non-null  object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [None]:
# Class balance of the raw labels (the dataset encodes them as 0 and 4).
df.target.value_counts()

4    800000
0    799999
Name: target, dtype: int64

In [None]:
# Re-encode the positive class from 4 to 1 so the labels are binary {0, 1}.
# The replacement is limited to `target`: a frame-wide replace would also scan
# (and could rewrite) every other column.
df = df.assign(target=df["target"].replace({4: 1}))

In [None]:
# Re-check the class balance after the 4 -> 1 remap.
df.target.value_counts()

1    800000
0    799999
Name: target, dtype: int64

Label encoding after the remap:

- `1`: POSITIVE
- `0`: NEGATIVE

**Preprocessing the dataset**

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# A set gives O(1) membership checks; the cleaning function tests every token
# of every tweet against this collection.
stop_words = set(stopwords.words("english"))

# Instantiate the stemmer. Binding the bare class would make `ps.stem(token)`
# fail with a TypeError because no instance is passed as `self`.
ps = PorterStemmer()

In [None]:
def preprocess(text, stem=False):
    """Clean a single tweet and return it as a space-separated string.

    The input is converted with ``str()``, lower-cased, and every @mention,
    ``http:``/``https:`` link and run of non-alphanumeric characters is
    replaced by a space. Tokens found in the module-level ``stop_words`` are
    then dropped.

    Args:
        text: The raw tweet; any object is accepted and stringified.
        stem: When true, each remaining token is passed through the
            module-level stemmer ``ps``.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    # Raw string: `\S` is a regex class, not a Python string escape.
    cleaned = re.sub(
        r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+", " ", str(text).lower()
    ).strip()
    kept = (token for token in cleaned.split() if token not in stop_words)
    if stem:
        kept = (ps.stem(token) for token in kept)
    return " ".join(kept)

In [None]:
# Clean every tweet with the default settings (no stemming).
df["text"] = df["text"].apply(preprocess)
df.head()

Unnamed: 0,target,text
0,0,upset update facebook texting might cry result...
1,0,dived many times ball managed save 50 rest go ...
2,0,whole body feels itchy like fire
3,0,behaving mad see
4,0,whole crew


In [None]:
# Turn each cleaned tweet into a list of tokens for Word2Vec. The preview of
# the result shows that numeric tokens such as "50" are not kept.
review_text = df["text"].apply(gensim.utils.simple_preprocess)

In [None]:
# Preview the tokenised tweets (one list of tokens per row).
review_text

0          [upset, update, facebook, texting, might, cry,...
1          [dived, many, times, ball, managed, save, rest...
2                    [whole, body, feels, itchy, like, fire]
3                                       [behaving, mad, see]
4                                              [whole, crew]
                                 ...                        
1599994                  [woke, school, best, feeling, ever]
1599995     [thewdb, com, cool, hear, old, walt, interviews]
1599996                [ready, mojo, makeover, ask, details]
1599997    [happy, th, birthday, boo, alll, time, tupac, ...
1599998    [happy, charitytuesday, thenspcc, sparkscharit...
Name: text, Length: 1599999, dtype: object

In [None]:
# Untrained Word2Vec model: 300-dimensional vectors, a context window of 10
# tokens, no frequency cut-off (min_count=1) and 4 worker threads.
# NOTE(review): `size` is the gensim 3.x keyword — confirm the installed version.
model = gensim.models.Word2Vec(size=300, window=10, min_count=1, workers=4)

In [None]:
# Scan the tokenised tweets once to build the Word2Vec vocabulary.
model.build_vocab(review_text, progress_per=1000)

In [None]:
# Report how many distinct tokens the Word2Vec vocabulary holds.
# NOTE: `vocab_size` is reassigned later from the Keras tokenizer.
words = model.wv.vocab.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 310936


In [None]:
# Train the embeddings on the tokenised tweets, using the example count and
# epoch setting already stored on the model.
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(52369537, 55643455)

In [None]:
# Sanity-check the embeddings. The similarity query is made on the keyed
# vectors (`model.wv`); calling it on the model itself is deprecated and emits
# a warning, and newer gensim releases no longer provide it.
model.wv.most_similar("good")

  """Entry point for launching an IPython kernel.


[('great', 0.6487588882446289),
 ('bad', 0.6015034317970276),
 ('tough', 0.5765929222106934),
 ('rough', 0.5638595223426819),
 ('excellent', 0.553757905960083),
 ('terrible', 0.54606032371521),
 ('decent', 0.5357052683830261),
 ('horrible', 0.5285310745239258),
 ('shitty', 0.5197687149047852),
 ('awful', 0.5162680149078369)]

In [None]:
## train_test_split
from sklearn.model_selection import train_test_split

# 70/30 split of the full frame; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, test_size=0.3, random_state=42)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, LSTM
from keras import utils


In [None]:
# Build the word -> integer index from the training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train["text"])

# +1 so that the largest word index is still a valid ID/row below
# (word indices are assumed to start at 1 — confirm in the Tokenizer docs).
vocab_size = 1 + len(tokenizer.word_index)
print("Total words", vocab_size)

Total words 266736


In [None]:
# Encode each split as integer sequences and pad/truncate them to 300 tokens.
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(split.text), maxlen=300)
    for split in (train, test)
)

In [None]:
# Show the dimensions of both padded feature matrices.
for label, array in (("X_train", X_train), ("X_test", X_test)):
    print("Shape of " + label + ":", array.shape)

Shape of X_train: (1119999, 300)
Shape of X_test: (480000, 300)


In [None]:
# Take the labels from the already-split frames so they are guaranteed to be
# row-aligned with X_train / X_test. A second, independent train_test_split
# only matches the features as long as seed, size and row order all agree.
y_train, y_test = train["target"].values, test["target"].values

In [None]:
# Turn both label vectors into column vectors of shape (n, 1).
y_train, y_test = (labels.reshape(-1, 1) for labels in (y_train, y_test))

In [None]:
# Show the dimensions of both label arrays.
for label, array in (("y_train", y_train), ("y_test", y_test)):
    print("Shape of " + label + ":", array.shape)

Shape of y_train: (1119999, 1)
Shape of y_test: (480000, 1)


In [None]:
# One 300-d row per tokenizer index; rows stay all-zero for words that the
# Word2Vec model has no vector for.
# NOTE: `model` must still be the Word2Vec model here; it is rebound to the
# Keras network afterwards, so this cell cannot be re-run after that point.
embedding_matrix = np.zeros((vocab_size, 300))
word_vectors = model.wv
for word, index in tokenizer.word_index.items():
    if word not in word_vectors:
        continue
    embedding_matrix[index] = word_vectors[word]
print(embedding_matrix.shape)

(266736, 300)


 **Building Model**

In [None]:
# Binary sentiment classifier: Word2Vec-initialised embedding -> dropout ->
# LSTM(100) -> dropout -> single sigmoid unit.
# NOTE: this rebinds `model`, which until now referred to the Word2Vec model.
model = Sequential([
    Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=300),
    Dropout(0.3),
    LSTM(100),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Print the layer stack with output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 300)          80020800  
_________________________________________________________________
dropout_2 (Dropout)          (None, 300, 300)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 80,181,301
Trainable params: 80,181,301
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for two epochs in mini-batches of 512; 10% of the training data is
# held out for validation. `final` keeps the History object returned by Keras.
final = model.fit(
    x=X_train,
    y=y_train,
    batch_size=512,
    epochs=2,
    validation_split=0.1,
    verbose=1,
)

Epoch 1/2
Epoch 2/2

**Performance Metrics And Accuracy**

In [None]:
from sklearn.metrics import confusion_matrix

# `y_pred` was never assigned in this notebook, so it is computed here: the
# network outputs sigmoid probabilities, which are thresholded at 0.5 to get
# hard 0/1 labels of the same shape as y_test.
y_prob = model.predict(X_test, batch_size=512)
y_pred = (y_prob > 0.5).astype("int32")

confusion_matrix(y_test.ravel(), y_pred.ravel())

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test tweets classified correctly. Expects `y_pred` to hold the
# hard 0/1 predictions for X_test.
accuracy_score(y_true=y_test, y_pred=y_pred)