In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
stemmer = nltk.SnowballStemmer("english")

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
data =pd.read_csv("Reviews.csv")

In [55]:
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Sentiment
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,Positive
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Negative
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,Positive
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,Negative
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,Positive


In [5]:
data['Text'].values[0]

'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'

In [6]:
data['Sentiment']=pd.cut(data['Score'], bins=[0,2,3,5], labels=['Negative', 'Nuetral', 'Positive'])

In [7]:
def clean(text):
    text=str(text).lower()
     # Remove text within square brackets
    text=re.sub('\[.*?\]','',text)
    # Remove URLs
    text=re.sub('https?://\S+|www\.\S+','',text)
    # Remove HTML tags
    text = re.sub('<.*?>+', '', text)
    #  Removing Punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Replace newline characters with spaces
    text = re.sub('\n', '', text)
    # Remove words containing digits
    text = re.sub('\w*\d\w*', '', text)
     # Tokenize the text and remove stopwords
    text = [word for word in text.split(' ') if word not in stopwords]
    text=" ".join(text)
    # Stem the remaining words
    text = [stemmer.stem(word) for word in text.split(' ')]
    # Join the cleaned and stemmed words back into a string
    text=" ".join(text)
    return text

In [8]:
clean(data['Text'])

'       bought sever vital can        product arriv label jumbo salt        confect around        look secret ingredi        great taffi great price  wid                                                          sleepytim tea one tea     valerian chamomil blend     wonder producti drink     product great  daughter                                                   nannam text length  dtype object'

In [9]:
data_cleaned=pd.DataFrame()
data_cleaned['Text']=data["Text"].apply(clean)
data_cleaned['Score']=data['Sentiment']
data_cleaned

Unnamed: 0,Text,Score
0,bought sever vital can dog food product found ...,Positive
1,product arriv label jumbo salt peanutsth peanu...,Negative
2,confect around centuri light pillowi citrus g...,Positive
3,look secret ingredi robitussin believ found g...,Negative
4,great taffi great price wide assort yummi taf...,Positive
...,...,...
6258,sleepytim tea one tea alway count mellow great...,Positive
6259,valerian chamomil blend spearmint leav lemon g...,Positive
6260,wonder producti drink time day obvious origin ...,Positive
6261,product great daughter extrem pick love also ...,Positive


In [10]:
text=data_cleaned['Text'].values

In [11]:
max_features=5000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(text)
# tokenizes the input text data into sequences of integer indices
sequences = tokenizer.texts_to_sequences(text)

In [12]:
x=sequences
x= pad_sequences(x)

In [14]:
x

array([[   0,    0,    0, ...,  921,    5,   35],
       [   0,    0,    0, ..., 2455,    5, 3885],
       [   0,    0,    0, ..., 1080, 1356, 4709],
       ...,
       [   0,    0,    0, ...,    9,   45, 4363],
       [   0,    0,    0, ...,   99, 2418,  389],
       [   0,    0,    0, ...,    0,    0,    0]], dtype=int32)

In [13]:
y=data_cleaned['Score']

In [15]:
y

0       Positive
1       Negative
2       Positive
3       Negative
4       Positive
          ...   
6258    Positive
6259    Positive
6260    Positive
6261    Positive
6262    Positive
Name: Score, Length: 6263, dtype: category
Categories (3, object): ['Negative' < 'Nuetral' < 'Positive']

In [16]:
tokens = []
for sentence in text:
    tokens.extend(sentence.split())

In [17]:
word_count = {}
for token in tokens:
    if token in word_count:
        word_count[token] += 1
    else:
        word_count[token] = 1

In [18]:
y=pd.get_dummies(data_cleaned['Score']).values
X_train, X_test, Y_train, Y_test = train_test_split(x,y, test_size = 0.33, random_state = 42)

In [29]:
y

array([[False, False,  True],
       [ True, False, False],
       [False, False,  True],
       ...,
       [False, False,  True],
       [False, False,  True],
       [False, False,  True]])

In [22]:
vocab_size = len(word_count)

In [23]:
model = Sequential([
    tf.keras.layers.Embedding(vocab_size, output_dim=50),
    tf.keras.layers.LSTM(50),
    tf.keras.layers.Dense(3, activation='softmax')
])

In [24]:
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

In [25]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 50)          734050    
                                                                 
 lstm (LSTM)                 (None, 50)                20200     
                                                                 
 dense (Dense)               (None, 3)                 153       
                                                                 
Total params: 754403 (2.88 MB)
Trainable params: 754403 (2.88 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [26]:
model.fit(X_train, Y_train,epochs = 15,batch_size=100,verbose = 2)

Epoch 1/15
42/42 - 11s - loss: 0.7888 - accuracy: 0.7519 - 11s/epoch - 257ms/step
Epoch 2/15
42/42 - 7s - loss: 0.6008 - accuracy: 0.7695 - 7s/epoch - 159ms/step
Epoch 3/15
42/42 - 5s - loss: 0.4449 - accuracy: 0.8258 - 5s/epoch - 108ms/step
Epoch 4/15
42/42 - 6s - loss: 0.3373 - accuracy: 0.8699 - 6s/epoch - 134ms/step
Epoch 5/15
42/42 - 5s - loss: 0.2611 - accuracy: 0.8954 - 5s/epoch - 107ms/step
Epoch 6/15
42/42 - 3s - loss: 0.2052 - accuracy: 0.9130 - 3s/epoch - 76ms/step
Epoch 7/15
42/42 - 4s - loss: 0.1662 - accuracy: 0.9321 - 4s/epoch - 101ms/step
Epoch 8/15
42/42 - 3s - loss: 0.1248 - accuracy: 0.9626 - 3s/epoch - 79ms/step
Epoch 9/15
42/42 - 2s - loss: 0.0860 - accuracy: 0.9769 - 2s/epoch - 51ms/step
Epoch 10/15
42/42 - 3s - loss: 0.0588 - accuracy: 0.9859 - 3s/epoch - 80ms/step
Epoch 11/15
42/42 - 2s - loss: 0.0485 - accuracy: 0.9883 - 2s/epoch - 47ms/step
Epoch 12/15
42/42 - 2s - loss: 0.0275 - accuracy: 0.9948 - 2s/epoch - 56ms/step
Epoch 13/15
42/42 - 1s - loss: 0.0203 - a

<keras.src.callbacks.History at 0x79bcf007bc70>

In [None]:
x.shape

In [64]:
new_review = "This product is amazing! I love it."

In [73]:
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts([new_review])
sequences = tokenizer.texts_to_sequences([new_review])


max_sequence_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

In [66]:
predictions = model.predict(padded_sequences)

# Assuming your model predicts probabilities for each class, you can get the predicted class
predicted_class = np.argmax(predictions)

# You can also get the probability score for each class
probability_scores = predictions[0]



In [67]:
predictions = model.predict(padded_sequences)



In [68]:
predictions

array([[0.00197041, 0.00305942, 0.99497014]], dtype=float32)

In [69]:
labels=["Negative","Neutral","Positive"]

In [70]:
label=labels[predictions.argmax()]

In [71]:
label

'Positive'