In [2]:
pip install tensorflow

Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow
  Downloading tensorflow-2.11.0-cp39-cp39-win_amd64.whl (1.9 kB)
Collecting tensorflow-intel==2.11.0
  Downloading tensorflow_intel-2.11.0-cp39-cp39-win_amd64.whl (266.3 MB)
Collecting termcolor>=1.1.0
  Downloading termcolor-2.2.0-py3-none-any.whl (6.6 kB)
Collecting gast<=0.4.0,>=0.2.1
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1
  Downloading tensorflow_io_gcs_filesystem-0.30.0-cp39-cp39-win_amd64.whl (1.5 MB)
Collecting absl-py>=1.0.0
  Downloading absl_py-1.4.0-py3-none-any.whl (126 kB)
Collecting opt-einsum>=2.3.2
  Using cached opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Collecting tensorflow-estimator<2.12,>=2.11.0
  Downloading tensorflow_estimator-2.11.0-py2.py3-none-any.whl (439 kB)
Collecting flatbuffers>=2.0
  Downloading flatbuffers-23.1.21-py2.py3-none-any.whl (26 kB)
Collecting keras<2.12,>=2.11.0
  Downloading keras-2.1



In [1]:
import numpy as np
import pandas as pd
import re
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load the Disneyland reviews dataset and drop only the rows that are missing
# a field this notebook actually uses (the review text or its rating).
# A bare dropna() would also discard usable reviews whose other columns are empty.
reviews_df = pd.read_csv(
    'DisneylandReviews.csv', encoding='ISO-8859-1'
).dropna(subset=['Review_Text', 'Rating'])

# Remove punctuation and convert to lowercase
def preprocess_text(text):
    """Return ``text`` with ASCII punctuation removed and letters lowercased.

    Every character listed in ``string.punctuation`` is deleted (not replaced
    by a space); the remaining text is then lowercased.
    """
    # Map each punctuation code point to None so translate() deletes it
    punctuation_to_none = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(punctuation_to_none).lower()

# Replace the raw review text with its normalized form
# (punctuation stripped, lowercased)
normalized_reviews = reviews_df['Review_Text'].apply(preprocess_text)
reviews_df['Review_Text'] = normalized_reviews

# Split each normalized review into a list of word tokens
reviews_df['Tokenized'] = normalized_reviews.apply(word_tokenize)

# Remove stop words
# NLTK's English list contains contractions such as "don't". The review tokens
# have already had their punctuation stripped by preprocess_text (-> "dont"),
# so also register the stripped form of every stop word; otherwise those
# tokens would never match and would slip through the filter.
_nltk_stop_words = stopwords.words('english')
stop_words = set(_nltk_stop_words) | {preprocess_text(word) for word in _nltk_stop_words}
def remove_stop_words(tokens, exclude=None):
    """Return the tokens that are not stop words, preserving their order.

    Args:
        tokens: Iterable of word tokens.
        exclude: Optional collection of words to drop. When omitted, the
            module-level ``stop_words`` set is used.

    Returns:
        A new list holding every token not found in the exclusion collection.
    """
    blocked = stop_words if exclude is None else exclude
    return [word for word in tokens if word not in blocked]

# Filter the stop words out of every tokenized review
filtered_tokens = reviews_df['Tokenized'].apply(remove_stop_words)
reviews_df['Tokenized'] = filtered_tokens

# Convert ratings to sentiment labels
def convert_rating_to_label(rating):
    """Map a numeric rating to a sentiment label.

    Ratings of 4 or more are 'positive', ratings of 2 or less are 'negative',
    and anything else is 'neutral'.
    """
    if rating <= 2:
        return 'negative'
    return 'positive' if rating >= 4 else 'neutral'

# Derive a sentiment label for every review from its rating
reviews_df['Sentiment'] = reviews_df['Rating'].apply(convert_rating_to_label)

# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable
review_tokens = reviews_df['Tokenized']
review_labels = reviews_df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    review_tokens, review_labels, test_size=0.2, random_state=42)

# Fit the vocabulary on the training reviews only, then encode both splits
# as sequences of integer word indices
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

# Pad or truncate every sequence (both at the end) to one fixed length
max_seq_length = 200
pad_options = dict(maxlen=max_seq_length, padding='post', truncating='post')
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

# Assemble the classifier: embedding -> spatial dropout -> LSTM -> 3-way softmax
embedding_size = 100
model = Sequential([
    Embedding(max_words, embedding_size, input_length=max_seq_length),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Convert sentiment labels to one-hot encoding.
# The column set and order are pinned so the train and test encodings always
# line up with each other and with the model's three outputs, even if one
# split happens to contain no example of a class. The dtype is set explicitly
# because the default dtype of get_dummies differs between pandas versions.
SENTIMENT_CLASSES = ['negative', 'neutral', 'positive']
y_train_enc = (
    pd.get_dummies(y_train)
    .reindex(columns=SENTIMENT_CLASSES, fill_value=0)
    .astype('float32')
)
y_test_enc = (
    pd.get_dummies(y_test)
    .reindex(columns=SENTIMENT_CLASSES, fill_value=0)
    .astype('float32')
)

# Fit the network, reporting loss/accuracy on the held-out split after each epoch.
# NOTE(review): the same held-out split is used for validation here and for the
# final evaluation below.
batch_size = 64
epochs = 20
model.fit(
    X_train_pad,
    y_train_enc,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test_pad, y_test_enc),
)

# Score the trained model on the held-out split
test_metrics = model.evaluate(X_test_pad, y_test_enc)
loss, accuracy = test_metrics
print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aniruh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aniruh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Loss: 0.5155259370803833
Accuracy: 0.8198546767234802


In [2]:
# Classify new reviews
def classify_review(review_text):
    """Predict the sentiment label of a single raw review string.

    The text is run through preprocess_text, word_tokenize and
    remove_stop_words, encoded with the fitted tokenizer, post-padded to
    max_seq_length and then scored by the model.

    Args:
        review_text: The review as a plain string.

    Returns:
        'negative', 'neutral' or 'positive'.
    """
    # Clean, tokenize and encode the review
    cleaned = preprocess_text(review_text)
    words = remove_stop_words(word_tokenize(cleaned))
    encoded = tokenizer.texts_to_sequences([words])[0]
    model_input = pad_sequences(
        [encoded], maxlen=max_seq_length, padding='post', truncating='post')

    # Score the single-row batch and pick the highest-probability class
    class_index = int(np.argmax(model.predict(model_input)[0]))

    # Index 0 is negative and 1 is neutral; any other index maps to positive
    return {0: 'negative', 1: 'neutral'}.get(class_index, 'positive')

In [3]:
# Define a new review
new_review = "I had an amazing time at Disneyland! The rides were so much fun and the atmosphere was magical."

# Reuse classify_review so this demo runs the same preprocessing, encoding and
# label mapping as any other prediction instead of repeating those steps here.
# The raw review string is also left untouched, so the cell can be re-run safely.
predicted_label = classify_review(new_review)

# Print the predicted label
print(f"Predicted sentiment label: {predicted_label}")


Predicted sentiment label: positive
