In [None]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the sentiment CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer='/content/Sentiment.csv')
# Cell output: the first five rows, as a quick look at the columns.
data.head(n=5)

In [None]:
# For each sentiment label, count the distinct values found in every other column.
print(
    data
    .groupby(by='sentiment')
    .nunique()
)

In [None]:
# Keep only the necessary columns: the text and its sentiment label.
# `.copy()` detaches the result from the original frame, so the later
# `data['text'] = ...` assignments write to an independent DataFrame instead
# of a column slice (which can trigger pandas' SettingWithCopyWarning).
data = data[['text', 'sentiment']].copy()
data.head()

In [None]:
# Print the texts stored under index labels 0, 1 and 2, before any cleaning.
for row_label in (0, 1, 2):
    print(data['text'][row_label])

In [None]:
# Normalise the text before tokenisation:
#   1. lower-case every string;
#   2. remove every character that is not an ASCII letter, a digit or whitespace.
# The character class must be `A-Z`, not `A-z`: the range `A-z` also covers the
# ASCII punctuation that sits between 'Z' and 'a' ([ \ ] ^ _ and the backtick),
# so those characters would survive the clean-up. The pattern is a raw string
# so that `\s` reaches the regex engine without an invalid-escape warning.
# The vectorised `.str` accessors pass missing values through unchanged.
data['text'] = (
    data['text']
    .str.lower()
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
)

In [None]:
# Print the texts under index labels 0, 1 and 2 again to inspect the result of
# the lower-casing / character-filtering step.
for row_label in (0, 1, 2):
    print(data['text'][row_label])

In [None]:
# Remove the retweet marker "rt" from the (already lower-cased) text.
# This is done as a vectorised, whole-word replacement because:
#   * assigning to the `row` yielded by `DataFrame.iterrows()` is not guaranteed
#     to write back into the DataFrame (the row may be a copy), so a loop over
#     rows can silently leave `data` unchanged;
#   * a plain substring replace of 'rt' would also cut "rt" out of ordinary
#     words (e.g. "party" -> "pay").
# `\b` anchors the match to a standalone token; the trailing `\s*` also drops
# the whitespace that followed the marker.
data['text'] = data['text'].str.replace(r'\brt\b\s*', '', regex=True)

In [None]:
# Print the texts under index labels 0, 1 and 2 once more, after the 'rt' step.
for row_label in (0, 1, 2):
    print(data['text'][row_label])

In [None]:
# Vocabulary size handed to the tokenizer (`num_words`); the same value is
# reused as the Embedding input dimension in `createmodel`.
max_fatures = 2000

# Build a space-splitting tokenizer and fit its vocabulary on the text column.
tokenizer = Tokenizer(split=' ', num_words=max_fatures)
tokenizer.fit_on_texts(texts=data['text'].values)

In [None]:
# Turn every text into a sequence of integers: each word is replaced by the
# integer assigned to it in the tokenizer's word_index dictionary.
# Only the top num_words-1 most frequent words are taken into account, and
# only words known to the tokenizer are kept.
X = tokenizer.texts_to_sequences(texts=data['text'].values)

In [None]:
# get_config() returns the tokenizer configuration as a Python dictionary;
# show which keys it contains.
print(tokenizer.get_config().keys())

# Dump the fitted tokenizer's bookkeeping attributes, each under its own heading:
#   word_counts    - a dictionary of words and their counts
#   document_count - total number of documents used to fit the Tokenizer
#   word_index     - a dictionary of words and their uniquely assigned integers
#   word_docs      - a dictionary of words and how many documents each appeared in
for _attr_name in ('word_counts', 'document_count', 'word_index', 'word_docs'):
    print("\ntokenizer." + _attr_name)
    print(getattr(tokenizer, _attr_name))

# Number of encoded texts, followed by the first two integer sequences.
print('\nLen() of X:', len(X))
print('\n', X[:2])

In [None]:
# Look at the first four encoded sequences and their lengths to see whether
# they differ in length, i.e. whether padding is needed.
for i in range(4):
    print(X[i], f"len= {len(X[i])}", sep='\n')

In [None]:
# Pad the integer sequences so that they all share the same length, then
# report the resulting array shape.
X = pad_sequences(sequences=X)
print('X.shape = ', X.shape)

In [None]:
# Look at the first four sequences again after padding: the printed rows show
# on which side (pre or post) the padding values were inserted.
for i in range(4):
    print(X[i], f"len= {len(X[i])}", sep='\n')

In [None]:
# Model hyper-parameters read by `createmodel`:
#   embed_dim - output dimension of the Embedding layer
#   lstm_out  - number of units in the LSTM layer
embed_dim, lstm_out = 128, 196

def createmodel():
    """Build, compile and summarise the sentiment classifier.

    The network is a three-layer Sequential stack:
    Embedding -> LSTM -> Dense(3, softmax).

    Reads the module-level names ``max_fatures`` (Embedding input dimension),
    ``embed_dim`` (Embedding output dimension), ``lstm_out`` (LSTM units) and
    ``X`` (its second dimension is used as the input sequence length), so
    those must be defined before this function is called.

    Returns:
        The compiled model (categorical cross-entropy loss, Adam optimizer,
        accuracy metric). ``model.summary()`` is printed as a side effect.
    """
    layers = [
        Embedding(max_fatures, embed_dim, input_length=X.shape[1]),
        # The same 0.2 dropout rate is applied to the inputs and the recurrent state.
        LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
        # Three output units with softmax: one probability per class.
        Dense(3, activation='softmax'),
    ]
    net = Sequential(layers)
    net.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    net.summary()
    return net

In [None]:
# Encode the sentiment labels as integers, then one-hot encode them so they
# match the categorical cross-entropy loss used by the model.
labelencoder = LabelEncoder().fit(data['sentiment'])
integer_encoded = labelencoder.transform(data['sentiment'])
y = to_categorical(integer_encoded)

# Hold out 33% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Build and compile the network; createmodel() also prints the layer summary.
model = createmodel()

In [None]:
# Train for 7 epochs on the training split, then evaluate on the held-out
# split using the same batch size.
batch_size = 32
model.fit(X_train, Y_train, batch_size=batch_size, epochs=7, verbose=1)
score, acc = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=2)

# One value per line: test loss, test accuracy, then the metric names that
# label those two numbers.
print(score, acc, model.metrics_names, sep='\n')