<a href="https://colab.research.google.com/github/szyrek/sentiment_anal/blob/main/sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install tensorflow
!pip install nltk
!pip install pandas
!pip install numpy
!pip install tensorflow-datasets

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2

import nltk
# Fetch the NLTK resources used by the preprocessing cells:
#   stopwords - English stop-word list
#   wordnet   - data for WordNetLemmatizer
#   omw-1.4   - Open Multilingual WordNet
#   punkt     - tokenizer models read by word_tokenize on older NLTK releases
#   punkt_tab - tokenizer tables read by word_tokenize on newer NLTK releases;
#               without it word_tokenize raises LookupError there
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

import json

from google.colab import drive

In [84]:
# --- Run configuration: everything a reader might want to tune ---

# Name used for the saved model and tokenizer files
model_name = '10epochs'

# Training schedule
num_epochs = 10       # upper bound on training epochs
early_patience = 5    # patience handed to the EarlyStopping callback

# Network size
lstm_units = 64

# Batch sizes; the device-check cell picks one depending on GPU availability
cpu_batch, gpu_batch = 64, 1024

# Shared text-preprocessing helpers
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [None]:
# Report the TensorFlow version and visible hardware, then choose a batch size
print("TensorFlow version:", tf.__version__)

# Count the GPUs TensorFlow can see
gpus = len(tf.config.experimental.list_physical_devices('GPU'))
print("Num GPUs Available: ", gpus)

# List every physical device TensorFlow reports, not just GPUs
print("Device List:", tf.config.experimental.list_physical_devices())

# Use the GPU batch size when at least one GPU is present, else the CPU one
reasonable_batch = gpu_batch if gpus > 0 else cpu_batch

print("Setting training batch_size to:", reasonable_batch)

In [74]:
# Load the IMDb reviews dataset as (text, label) pairs, along with its metadata
dataset, info = tfds.load('imdb_reviews', as_supervised=True, with_info=True)
train_dataset = dataset['train']
test_dataset = dataset['test']

# The splits arrive as TensorFlow datasets; wrap them so that iterating yields
# NumPy values, which the next cell turns into Python strings and label arrays.
train_data, test_data = tfds.as_numpy(train_dataset), tfds.as_numpy(test_dataset)

In [75]:
def _texts_and_labels(examples):
    """Split an iterable of (utf-8 bytes, label) pairs into texts and labels.

    Returns a list of decoded strings and a NumPy array holding the labels,
    both in iteration order.
    """
    texts, labels = [], []
    for raw_text, label in examples:
        texts.append(raw_text.decode('utf-8'))  # bytes -> str
        labels.append(label)
    return texts, np.array(labels)


# Pull the reviews and their labels out of each split
train_texts, train_labels = _texts_and_labels(train_data)
test_texts, test_labels = _texts_and_labels(test_data)

In [67]:
# Lemmatizing text data
def preprocess_text_with_lemmatization(text):
    """Lower-case and lemmatize the alphabetic tokens of `text`.

    The text is split with `word_tokenize`. Tokens for which `str.isalpha()`
    is false (punctuation, numbers, mixed tokens) are dropped; the remaining
    ones are lower-cased, lemmatized and joined back with single spaces.
    """
    kept_lemmas = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue  # drop punctuation and other non-alphabetic tokens
        kept_lemmas.append(lemmatizer.lemmatize(token.lower()))
    return " ".join(kept_lemmas)

# Lemmatize every review in both splits; the results replace the raw texts
train_texts = list(map(preprocess_text_with_lemmatization, train_texts))
test_texts = list(map(preprocess_text_with_lemmatization, test_texts))

In [68]:
# Filtering out stopwords
def remove_stopwords(text):
    """Return `text` without the whitespace-separated words found in `stop_words`.

    Each word is compared in lower case; the surviving words keep their
    original form and are re-joined with single spaces.
    """
    return " ".join(word for word in text.split() if word.lower() not in stop_words)


# The lemmatization cell stores its output back into `train_texts` / `test_texts`,
# so those are the inputs here (no `lemmatized_*` variables are ever defined).
filtered_train_texts = [remove_stopwords(text) for text in train_texts]
filtered_test_texts = [remove_stopwords(text) for text in test_texts]

train_texts = filtered_train_texts
test_texts = filtered_test_texts

In [76]:
# Final tokenization and padding
# Fit the word index on the training texts only, with num_words=5000
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_texts)

# Convert both splits to sequences of word indices
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(corpus) for corpus in (train_texts, test_texts)
)

# Pad both splits with maxlen=200
train_padded, test_padded = (
    pad_sequences(sequences, maxlen=200) for sequences in (train_sequences, test_sequences)
)

In [None]:
# Building simplest model: embedding -> LSTM -> single sigmoid unit
model = Sequential()
model.add(Embedding(5000, 128, input_length=200))
model.add(LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Binary classification: cross-entropy loss, accuracy reported during training
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [53]:
# Building model with dropout layer
model = Sequential()
model.add(Embedding(5000, 128, input_length=200))
model.add(LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2))
model.add(Dropout(0.5))  # dropout layer to prevent overfitting
model.add(Dense(1, activation='sigmoid'))

# Binary classification: cross-entropy loss, accuracy reported during training
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [None]:
# Building model with dropout and l2 regularization
model = Sequential([
    Embedding(5000, 128, input_length=200),
    LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2),
    Dropout(0.5),  # dropout layer to prevent overfitting
    Dense(64, activation='relu', kernel_regularizer=l2(0.01)),  # L2-regularized hidden layer
    # Single sigmoid unit: binary_crossentropy is computed against one 0/1
    # label per review, so the network must end in one probability, not 64 ReLUs.
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model; loss and accuracy on the test split are reported after each epoch
model.fit(
    train_padded,
    train_labels,
    epochs=num_epochs,
    batch_size=reasonable_batch,
    validation_data=(test_padded, test_labels),
)

In [None]:
# Train with early stopping

# Stop once val_loss has not improved for `early_patience` consecutive epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=early_patience)

# The stopping decision is driven by the validation loss, so validation data is
# held out from the training arrays (validation_split) rather than taken from
# the test split. The test split is what the evaluation cell reports accuracy
# on; using it to pick the stopping epoch would bias that number.
model.fit(
    train_padded,
    train_labels,
    batch_size=reasonable_batch,
    epochs=num_epochs,
    validation_split=0.2,
    callbacks=[early_stopping],
)

In [None]:
# Mount Google Drive; the save and load cells use paths under this mount point
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [92]:
# Save the model
drive_dir = "/content/drive/My Drive/"
model_save_path = drive_dir + model_name + ".h5"
model.save(model_save_path)

# Save tokenizer
# The value returned by to_json() is JSON-encoded once more when written. The
# load cell undoes that layer with json.load before calling tokenizer_from_json,
# so the two cells must stay in step.
tokenizer_json = tokenizer.to_json()
tokenizer_save_path = drive_dir + model_name + "_tokenizer.json"
with open(tokenizer_save_path, 'w', encoding='utf-8') as f:
    json.dump(tokenizer_json, f, ensure_ascii=False)

In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Load the model
model = load_model("/content/drive/My Drive/" + model_name + ".h5")

# Load the tokenizer
# The save cell writes this file as UTF-8 with ensure_ascii=False, so read it
# back with the same encoding instead of relying on the platform default.
with open("/content/drive/My Drive/" + model_name + "_tokenizer.json", encoding='utf-8') as f:
    data = json.load(f)  # undoes the outer JSON layer added by the save cell

# The file is fully read at this point, so rebuild the tokenizer outside the `with`
tokenizer = tokenizer_from_json(data)

In [None]:
# Evaluate the model on the padded test split
scores = model.evaluate(test_padded, test_labels)

# scores[0] is the loss; scores[1] is the accuracy metric set in model.compile
test_accuracy = scores[1]
print(f"Test Accuracy: {test_accuracy*100}%")

In [None]:
# Evaluate single text
text = "I wanted to pee almost since the beginning, but it was too captivating to miss any minute of it. Great flick, loved it!"

def preprocess_text(text):
    """Turn one raw review into a padded index sequence for `model.predict`.

    Applies the same steps as the training pipeline: tokenize, keep only
    alphabetic tokens, lower-case and lemmatize them, drop stop words, map the
    remaining words to indices with the current `tokenizer`, and pad to
    maxlen=200. Returns the padded array for a batch of one text.
    """
    kept_words = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue  # remove punctuation and other non-alphabetic tokens
        lemma = lemmatizer.lemmatize(token.lower())
        if lemma.lower() in stop_words:
            continue  # filter stopwords
        kept_words.append(lemma)

    # Map words to indices using the current tokenizer (one text -> batch of one)
    sequences = tokenizer.texts_to_sequences([" ".join(kept_words)])
    # Use the same maxlen as during training
    return pad_sequences(sequences, maxlen=200)

# Run the sample review through the preprocessing pipeline
processed_text = preprocess_text(text)

# Predict (use the loaded model if you're in a new session)
prediction = model.predict(processed_text)

# One input and one output unit, so the score sits at [0][0]
positive_score = prediction[0][0]
print("Prediction (closer to 1 means positive sentiment):", positive_score)