In [1]:
"""
Following this kaggle
https://www.kaggle.com/ngyptr/lstm-sentiment-analysis-keras/data?select=Sentiment.csv

But undersampling the majority class to have 
better positive sentiment tweet accuracy
"""

'\nFollowing this kaggle\nhttps://www.kaggle.com/ngyptr/lstm-sentiment-analysis-keras/data?select=Sentiment.csv\n\nBut undersampling the majority class to have \nbetter positive sentiment tweet accuracy\n'

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

In [3]:
# Load the labelled tweets and keep only the two columns used below.
data = pd.read_csv('./kaggle/Sentiment.csv')
data = data[['text', 'sentiment']]

# Binary task: drop the neutral tweets. `.copy()` yields an independent frame,
# so the column assignment below does not raise SettingWithCopyWarning.
data = data[data['sentiment'] != "Neutral"].copy()

# Normalise the tweet text in three vectorised steps:
#   1. lower-case everything;
#   2. delete every character that is not a letter, digit or whitespace.
#      The class is `A-Z`: the earlier `A-z` range also covered the ASCII
#      characters between 'Z' and 'a' ([ \ ] ^ _ `), so they were never removed;
#   3. blank out the retweet marker "rt". The earlier row-wise loop over
#      `iterrows()` assigned into per-row copies, so `data` was never changed.
#      `\brt\b` matches "rt" only as a whole word, leaving words such as
#      "party" intact.
data['text'] = (
    data['text']
    .str.lower()
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
    .str.replace(r'\brt\b', ' ', regex=True)
)

# Count rows per class directly instead of dividing `.size` by the column count.
print("Number of Positive Tweets: ", (data['sentiment'] == 'Positive').sum())
print("Number of Negative Tweets: ", (data['sentiment'] == 'Negative').sum())

Number of Positive Tweets:  2236.0
Number of Negative Tweets:  8493.0


In [4]:
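# Optional one-time setup: uncomment the two lines below to install the
# `imblearn` package into the environment this kernel is running in.
#import sys
#!{sys.executable} -m pip install imblearn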

In [5]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [6]:
# Tally how many tweets carry each sentiment label in `data`.
sentiment_values = data['sentiment'].values
unique, counts = np.unique(sentiment_values, return_counts=True)

# One row per label: [label, number of tweets].
frequencies = np.column_stack((unique, counts))
print(frequencies)


[['Negative' 8493]
 ['Positive' 2236]]


In [7]:
# Randomly undersample the negative class down to the size of the positive
# class so that both sentiments are equally represented.
positive_tweets = data[data['sentiment'] == 'Positive']
negative_tweets = data[data['sentiment'] == 'Negative']

# - n is derived from the positive class instead of a hard-coded 2236, so the
#   cell stays correct if the input data changes.
# - replace=False: sampling with replacement draws the same tweet more than
#   once, and those duplicates can end up on both sides of a later
#   train/test split.
# - random_state makes the drawn subset identical on every run.
# Note: this requires at least as many negative tweets as positive ones.
resampled_neg = negative_tweets.sample(
    n=len(positive_tweets),
    replace=False,
    random_state=42,
)

resampled = pd.concat([positive_tweets, resampled_neg])
resampled.columns


Index(['text', 'sentiment'], dtype='object')

In [8]:
# Vocabulary cap passed to the tokenizer as `num_words`.
max_fatures = 2000

tweet_texts = resampled['text'].values

# Build the word index from the resampled tweets, splitting on single spaces.
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(tweet_texts)

# Convert each tweet to a sequence of word indices, then pad the sequences
# into one 2-D array.
X = pad_sequences(tokenizer.texts_to_sequences(tweet_texts))

In [10]:
# One-hot encode the sentiment labels, one column per label.
# The dtype is pinned: pandas changed the get_dummies default from uint8 to
# bool in 2.0, and a fixed integer dtype keeps Y the same on either version.
sentiment_dummies = pd.get_dummies(resampled['sentiment'], dtype=np.uint8)
Y = sentiment_dummies.values

# Class-balance check. Label names are read from the dummy columns rather than
# hard-coded, so each name always matches the column it describes.
labels = list(sentiment_dummies.columns)
counts = [np.count_nonzero(Y[:, col] == 1) for col in range(Y.shape[1])]
frequencies = np.asarray((labels, counts)).T
print(frequencies)

[['Negative' '2236']
 ['Positive' '2236']]


In [14]:
embed_dim = 128  # output dimension of the Embedding layer
lstm_out = 196   # number of units in the LSTM layer

# Embedding -> SpatialDropout1D -> LSTM -> 2-unit softmax classifier.
# The embedding's input length is taken from the padded width of X.
model = Sequential([
    Embedding(max_fatures, embed_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
    Dense(2, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; calling it directly
# avoids the stray "None" line that print(model.summary()) produced.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 29, 128)           256000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 29, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
# Hold out 25% of the samples for evaluation; the fixed random_state keeps
# the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

# Show the feature/label shapes of the training set, then of the test set.
for split_X, split_Y in ((X_train, Y_train), (X_test, Y_test)):
    print(split_X.shape, split_Y.shape)

(3354, 29) (3354, 2)
(1118, 29) (1118, 2)


In [17]:
# Train for 10 epochs on the training split; verbose=2 logs one line per epoch.
batch = 32
model.fit(
    X_train,
    Y_train,
    batch_size=batch,
    epochs=10,
    verbose=2,
)

Epoch 1/10
105/105 - 17s - loss: 0.5989 - accuracy: 0.6503
Epoch 2/10
105/105 - 14s - loss: 0.4234 - accuracy: 0.8059
Epoch 3/10
105/105 - 15s - loss: 0.3346 - accuracy: 0.8581
Epoch 4/10
105/105 - 15s - loss: 0.2687 - accuracy: 0.8962
Epoch 5/10
105/105 - 14s - loss: 0.2244 - accuracy: 0.9138
Epoch 6/10
105/105 - 14s - loss: 0.1997 - accuracy: 0.9219
Epoch 7/10
105/105 - 14s - loss: 0.1752 - accuracy: 0.9353
Epoch 8/10
105/105 - 15s - loss: 0.1593 - accuracy: 0.9407
Epoch 9/10
105/105 - 15s - loss: 0.1379 - accuracy: 0.9523
Epoch 10/10
105/105 - 15s - loss: 0.1300 - accuracy: 0.9529


<tensorflow.python.keras.callbacks.History at 0x24fc3dee940>

In [18]:
# Persist the trained model under ./output/models.
model_dir = './output/models/LSTM_balanced_10'
model.save(model_dir)

# To reload and inspect the saved model later:
#
# from keras.models import load_model
# reloaded = load_model('./output/models/LSTM_balanced_10')
# reloaded.summary()

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: ./output/models/LSTM_balanced_10\assets


In [19]:
# Model outputs for the held-out split (one row per test sample).
y_pred = model.predict(x=X_test)


In [20]:
# Area under the ROC curve of the predictions against the one-hot test labels.
auc = roc_auc_score(Y_test, y_pred)
print('roc_auc_score', auc)

roc_auc_score 0.8391763796994409
