In [1]:
# Download sentiment dataset and extract it for use in training and testing
!wget "https://www.cs.jhu.edu/~mdredze/datasets/sentiment/domain_sentiment_data.tar.gz"
!tar -xzf "/content/domain_sentiment_data.tar.gz"

import os
import numpy as np
import matplotlib.pyplot as plt
import re
from pickle import dump, load
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import gensim
import gensim.downloader as api
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Input, Dense, Embedding, Dropout, BatchNormalization, GRU, Attention
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, confusion_matrix
from flask import Flask, request, render_template


# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt_tab')

--2024-12-03 15:40:11--  https://www.cs.jhu.edu/~mdredze/datasets/sentiment/domain_sentiment_data.tar.gz
Resolving www.cs.jhu.edu (www.cs.jhu.edu)... 128.220.13.64
Connecting to www.cs.jhu.edu (www.cs.jhu.edu)|128.220.13.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30586147 (29M) [application/x-gzip]
Saving to: ‘domain_sentiment_data.tar.gz.2’


2024-12-03 15:40:13 (20.4 MB/s) - ‘domain_sentiment_data.tar.gz.2’ saved [30586147/30586147]



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Function to clean sentences
def clean_sentence(sentence: str) -> list:
    """
    Cleans the given sentence by removing HTML tags, URLs, emails, punctuation, and stopwords.

    Args:sentence (str): The sentence to be cleaned.

    Returns:list: A list of cleaned words from the input sentence.
    """
    # Remove HTML tags
    tags = re.compile(r"<.*?>")
    sentence = re.sub(tags, '', sentence)
    # Convert to lowercase
    sentence = sentence.lower()
    # Remove URLs and email addresses
    email_urls = re.compile(r"\bhttp\S+|\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\b")
    sentence = re.sub(email_urls, '', sentence)
    # Replace '@' with 'a'
    ats = re.compile(r'@')
    sentence = re.sub(ats, 'a', sentence)
    # Remove punctuation
    punc = re.compile(r"[^\w\s-]")
    sentence = re.sub(punc, '', sentence)
    # Tokenize the sentence
    sentence = word_tokenize(sentence)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    sentence = [word for word in sentence if word not in stop_words]
    return sentence

In [3]:
# Function to read training and testing data
def read_data(folders, path, regex_review):
    """
    Reads data from the specified folders and extracts sentences for sentiment analysis.

    Args:folders (list): List of folder names containing data.
        path (str): Base path to the data folders.
        regex_review (Pattern): Regular expression pattern to extract review text.

    Returns:tuple: A tuple containing lists of data and corresponding labels.
    """
    data, labels = [], []
    for folder in folders:
        for sentiment, label in [("negative", 0), ("positive", 1)]:
            file_path = os.path.join(path, folder, f"{sentiment}.review")
            with open(file_path, 'r') as file:
                sentences = re.findall(regex_review, file.read())
                cleaned_sentences = [clean_sentence(sentence) for sentence in sentences]
                data.extend(cleaned_sentences)
                labels.extend([label] * len(cleaned_sentences))
    return data, labels

# Read training data
folders_train = ["books", "dvd", "electronics"]
print('Reading Train Data')
x_train, y_train = read_data(folders_train, "/content/sorted_data_acl/", re.compile(r"<review_text>.*?</review_text>", flags=re.DOTALL))

# Read testing data
folders_test = ["kitchen_&_housewares"]
print('Reading Test Data')
x_test, y_test = read_data(folders_test, "/content/sorted_data_acl/", re.compile(r"<review_text>.*?</review_text>", flags=re.DOTALL))

# Remove short or meaningless reviews (outlier removal)
x_train, y_train = zip(*[(x, y) for x, y in zip(x_train, y_train) if len(x) > 3])
x_test, y_test = zip(*[(x, y) for x, y in zip(x_test, y_test) if len(x) > 3])

# Balance the dataset
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
x_train, y_train = ros.fit_resample(np.array(x_train, dtype=object).reshape(-1, 1), y_train)
x_train = x_train.ravel()

# Save the preprocessed training and testing data to disk for future use
with open('/content/x_train', 'wb') as file:
    dump(x_train, file)
with open('/content/y_train', 'wb') as file:
    dump(y_train, file)
with open('/content/x_test', 'wb') as file:
    dump(x_test, file)
with open('/content/y_test', 'wb') as file:
    dump(y_test, file)

Reading Train Data
Reading Test Data


In [4]:
# Create a vocabulary from the training data
vocab = set(word for sentence in x_train for word in sentence)
word2id = {word: idx for idx, word in enumerate(vocab)}
id2word = {idx: word for idx, word in enumerate(vocab)}
# Assign a dummy value for unknown words
dummy = len(word2id)

In [5]:
# Encode and pad sequences
def encode_sentence(old_sentence):
    """
    Encodes a sentence into a list of integers based on the word2id mapping.

    Args:old_sentence (list): List of words from a sentence to be encoded.

    Returns:list: A list of integers representing the encoded words.
    """
    return [word2id.get(word, dummy) for word in old_sentence]

# Encode the training and testing sentences
x_train_encoded = [encode_sentence(sentence) for sentence in x_train]
x_test_encoded = [encode_sentence(sentence) for sentence in x_test]
# Pad the sequences to a fixed length
MAX_SEQ_LEN = 125
x_train_padded = pad_sequences(x_train_encoded, maxlen=MAX_SEQ_LEN, dtype='int32', padding='post')
x_test_padded = pad_sequences(x_test_encoded, maxlen=MAX_SEQ_LEN, dtype='int32', padding='post')

# Load GloVe pre-trained embeddings from gensim
print('Loading pre-trained word embeddings')
glove = api.load('glove-twitter-200')
embedding_matrix = np.zeros((len(vocab) + 1, glove.vector_size))
for word, idx in word2id.items():
    try:
        embedding_matrix[idx] = glove[word]
    except KeyError:
        embedding_matrix[idx] = np.zeros(glove.vector_size)

Loading pre-trained word embeddings


In [6]:
# Build the LSTM model for sentiment analysis with optimizations
print('Defining LSTM model')
model = Sequential()
model.add(Embedding(input_dim=len(vocab) + 1, output_dim=glove.vector_size, input_length=MAX_SEQ_LEN, weights=[embedding_matrix], trainable=False))
model.add(GRU(128, return_sequences=True))
model.add(Dropout(0.4))
model.add(BatchNormalization())
model.add(GRU(128))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

Defining LSTM model




In [7]:
# Split the training data into training and validation sets
print('Splitting data into training and validation sets')
x_train_padded, x_val_padded, y_train, y_val = train_test_split(x_train_padded, y_train, test_size=0.2, random_state=42)

Splitting data into training and validation sets


In [8]:
# Train the LSTM model with EarlyStopping to prevent overfitting
print('Training the model')
y_train = np.array(y_train).reshape(-1, 1)
y_val = np.array(y_val).reshape(-1, 1)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-5)
model.fit(x_train_padded, y_train, validation_data=(x_val_padded, y_val), batch_size=16, epochs=100, callbacks=[early_stopping, reduce_lr])

Training the model
Epoch 1/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 293ms/step - accuracy: 0.5147 - loss: 0.7179 - val_accuracy: 0.5008 - val_loss: 0.7057 - learning_rate: 0.0010
Epoch 2/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 257ms/step - accuracy: 0.5533 - loss: 0.6742 - val_accuracy: 0.7083 - val_loss: 0.5408 - learning_rate: 0.0010
Epoch 3/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 246ms/step - accuracy: 0.7894 - loss: 0.4795 - val_accuracy: 0.7575 - val_loss: 0.5691 - learning_rate: 0.0010
Epoch 4/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 266ms/step - accuracy: 0.8225 - loss: 0.4175 - val_accuracy: 0.7567 - val_loss: 0.5314 - learning_rate: 0.0010
Epoch 5/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 248ms/step - accuracy: 0.8546 - loss: 0.3643 - val_accuracy: 0.7442 - val_loss: 0.5996 - learning_rate: 0.0010
Epoch 6/100
[1m300/300[0m [

<keras.src.callbacks.history.History at 0x792df7e236d0>

In [9]:
# Evaluate the model on the test data
y_test = np.array(y_test).reshape(-1, 1)
y_test = y_test.astype('float32')
loss, accuracy = model.evaluate(x_test_padded, y_test)
print(f'Test accuracy: {accuracy * 100:.2f}%')

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 128ms/step - accuracy: 0.8437 - loss: 0.3684
Test accuracy: 79.00%


In [10]:
# Print classification report and confusion matrix
print('Classification Report:')
y_pred = (model.predict(x_test_padded) > 0.5).astype('int32')
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

Classification Report:
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 99ms/step
              precision    recall  f1-score   support

    Negative       0.75      0.87      0.81      1000
    Positive       0.84      0.71      0.77      1000

    accuracy                           0.79      2000
   macro avg       0.80      0.79      0.79      2000
weighted avg       0.80      0.79      0.79      2000

Confusion Matrix:
[[867 133]
 [287 713]]


In [14]:
# Custom prediction function
def lstm_predict():
    """
    Takes user input, preprocesses it, and uses the trained LSTM model to predict the sentiment.

    Returns:
        tuple: The sentiment result and confidence score.
    """
    sentence = input("Enter a sentence to assess its sentiment: ")
    ready_sentence = pad_sequences([encode_sentence(clean_sentence(sentence))], maxlen=MAX_SEQ_LEN, dtype='int32', padding='post')
    score = model.predict(ready_sentence)[0][0]
    confidence = score if score >= 0.5 else 1 - score
    result = "Positive Review" if score >= 0.5 else "Negative Review"
    confidence_str = "High" if confidence > 0.75 else ("Medium" if confidence > 0.5 else "Low")
    print(f"{result} (Score: {score:.2f}, Confidence: {confidence:.2f}, Confidence Level: {confidence_str})")
    return result, confidence

# Prompt the user to enter a sentence and predict its sentiment
lstm_predict()

Enter a sentence to assess its sentiment: it is a bad product
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
Negative Review (Score: 0.01, Confidence: 0.99, Confidence Level: High)


('Negative Review', 0.9867006251588464)

In [15]:
lstm_predict()

Enter a sentence to assess its sentiment: it is a good product
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
Positive Review (Score: 0.91, Confidence: 0.91, Confidence Level: High)


('Positive Review', 0.9146273)

In [12]:
# Flask web application for sentiment analysis
app = Flask(__name__)

@app.route('/')
def home():
    """Renders the homepage with a form to input text."""
    return render_template('index.html')

@app.route('/predict', methods=['POST'])
def predict():
    """
    Handles the form submission, processes the input text, and returns the sentiment prediction.

    Returns:
        str: Rendered HTML template with prediction result.
    """
    if request.method == 'POST':
        sentence = request.form['sentence']
        ready_sentence = pad_sequences([encode_sentence(clean_sentence(sentence))], maxlen=MAX_SEQ_LEN, dtype='int32', padding='post')
        score = model.predict(ready_sentence)[0][0]
        confidence = score if score >= 0.5 else 1 - score
        result = "Positive Review" if score >= 0.5 else "Negative Review"
        return render_template('result.html', result=result, score=f"{score:.2f}", confidence=f"{confidence:.2f}")

if __name__ == '__main__':
    # Run the Flask web server
    app.run(debug=True)

# Note: To run the web server, Flask templates (HTML files) should be created, such as 'index.html' for the form input and 'result.html' to display the prediction results.


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
