In [1]:
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
import matplotlib.pyplot as plt

# Check for TensorFlow GPU access
print(f"TensorFlow has access to the following devices:\n{tf.config.list_physical_devices()}")

# See TensorFlow version
print(f"TensorFlow version: {tf.__version__}")

TensorFlow has access to the following devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow version: 2.9.1


In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Embedding
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import pickle

In [13]:
df = pd.read_csv('train.tsv', header=0, delimiter="\t", quoting=3)

In [14]:
df

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [15]:
df = df[['Phrase','Sentiment']]

In [16]:
df.shape

(156060, 2)

In [17]:
df.Sentiment.value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [18]:
df['Phrase'] = df['Phrase'].apply(lambda x: x.lower())
df['Phrase'] = df['Phrase'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Phrase'] = df['Phrase'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Phrase'] = df['Phrase'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))


In [19]:
max_features = 2500
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(df['Phrase'].values)
X = tokenizer.texts_to_sequences(df['Phrase'].values)
X = pad_sequences(X)
X.shape

(156060, 46)

In [24]:
embed_dim = 50
model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model.add(LSTM(10))
model.add(Dense(5,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer="adam",metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 46, 50)            125000    
                                                                 
 lstm_1 (LSTM)               (None, 10)                2440      
                                                                 
 dense_1 (Dense)             (None, 5)                 55        
                                                                 
Total params: 127,495
Trainable params: 127,495
Non-trainable params: 0
_________________________________________________________________
None


In [22]:
y = pd.get_dummies(df['Sentiment']).values
df_test= pd.read_csv('test.tsv', header=0, delimiter="\t", quoting=3)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 425)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(117045, 46) (117045, 5)
(39015, 46) (39015, 5)


In [25]:
model.fit(X_train, y_train, epochs = 5, verbose = 1)

Epoch 1/5


2022-06-04 17:42:21.924653: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-06-04 17:42:22.145079: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-06-04 17:42:24.709224: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x16dc32ee0>

In [35]:
test = ['you are so handsome']
test = tokenizer.texts_to_sequences(test)
test = pad_sequences(test, maxlen=X.shape[1], dtype="int32", value=0)
print(test.shape)
sentiment = model.predict(test)[0]
# print(np.argmax(sentiment))

if(np.argmax(sentiment) == 0):
    print("Negative")
elif (np.argmax(sentiment) == 1):
    print("Somewhat Negative")
elif (np.argmax(sentiment) == 2):
    print("Neutral")
elif (np.argmax(sentiment) == 3):
    print("Somewhat Positive")
elif (np.argmax(sentiment) == 4):
    print("Positive")

(1, 46)
Somewhat Positive


In [36]:
with open('tokenizer.pickle', 'wb') as tk:
    pickle.dump(tokenizer, tk, protocol=pickle.HIGHEST_PROTOCOL)
model_json = model.to_json()
with open("lstm_model.json", "w") as js:
    js.write(model_json)

In [37]:
model.save_weights("lstm_model.h5")