In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import string
import nltk
import tensorflow as tf
import contractions

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K
from tensorflow.python.keras.metrics import Recall, Precision

# Download NLTK resources
# word_tokenize() needs the Punkt sentence-tokenizer data in addition to the
# stop-word list. Older NLTK releases ship it as 'punkt', newer ones as
# 'punkt_tab', so both are requested; quiet=True keeps the cell output short.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource, quiet=True)
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to C:\Users\Yujin
[nltk_data]     Chen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Translation table that deletes every ASCII punctuation character. It never
# changes, so it is built once instead of once per review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use by _english_stop_words(); None means "not loaded yet".
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """Normalise one raw review into a space-separated string of content words.

    Steps, in order:
      1. expand contractions with ``contractions.fix``;
      2. tokenise with NLTK's ``word_tokenize``;
      3. lower-case each token and delete ASCII punctuation from it;
      4. drop tokens that are not purely alphabetic (numbers, empty strings, ...);
      5. drop English stop words.

    Args:
        text: The raw review text.

    Returns:
        The surviving words joined by single spaces (an empty string when
        nothing survives).
    """
    expanded_text = contractions.fix(text)
    stop_words = _english_stop_words()

    words = []
    for token in word_tokenize(expanded_text):
        word = token.lower().translate(_PUNCTUATION_TABLE)
        # isalpha() is False for '' as well, so tokens that were pure
        # punctuation disappear here.
        if word.isalpha() and word not in stop_words:
            words.append(word)
    return ' '.join(words)

In [3]:
# Load dataset with UTF-8 encoding to avoid UnicodeDecodeError
def load_reviews(directory):
    """Read and preprocess every ``.txt`` review under ``directory/pos`` and ``directory/neg``.

    Args:
        directory: Folder that contains the ``pos`` and ``neg`` sub-folders.

    Returns:
        A ``(reviews, labels)`` pair of equal-length lists. ``reviews`` holds the
        output of ``preprocess_text`` for each file and ``labels`` holds 1 for
        files from ``pos`` and 0 for files from ``neg``. All positive reviews
        come first, and within a class the files are in sorted file-name order.

    Raises:
        OSError: If either sub-folder is missing or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    reviews = []
    labels = []
    for label_type, label in (('pos', 1), ('neg', 0)):  # 1 for positive, 0 for negative
        dir_name = os.path.join(directory, label_type)
        # os.listdir() returns entries in arbitrary order. Sorting makes the
        # sample order identical on every machine, which a seeded split or
        # shuffle further down needs in order to be reproducible.
        for fname in sorted(os.listdir(dir_name)):
            if not fname.endswith('.txt'):
                continue
            with open(os.path.join(dir_name, fname), 'r', encoding='utf-8') as file:  # Specify UTF-8 encoding
                raw_text = file.read()
            reviews.append(preprocess_text(raw_text))  # Preprocess the text
            labels.append(label)
    return reviews, labels

In [4]:
# Read and preprocess both corpus splits; each call returns (reviews, labels)
(train_reviews, train_labels), (test_reviews, test_labels) = (
    load_reviews(split_dir) for split_dir in ('train/', 'test/')
)

In [5]:
# Hold out 20% of the training reviews for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    train_reviews,
    train_labels,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Tokenization: the word index is learned from the training split only, so no
# vocabulary information from the validation or test reviews is used.
tokenizer = Tokenizer(
    num_words=10000,  # cap the vocabulary at the most frequent words
)
tokenizer.fit_on_texts(X_train)

In [7]:
# Encode every split as lists of integer word ids using the fitted tokenizer
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_val, test_reviews)
)


In [8]:
# Bring every encoded review to one fixed length so the splits can be batched
maxlen = 300  # tune against the typical review length
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (X_train_seq, X_val_seq, X_test_seq)
)

# Embedding layer dimensions
vocab_size = 10000   # must equal the tokenizer's num_words
embedding_dim = 100  # size of each learned word vector; adjust as needed


In [9]:
# Precision and recall trackers that are handed to model.compile().
# They are built from the public tf.keras.metrics namespace (not the private
# tensorflow.python.keras copy) so they are the same Metric type that the
# tf.keras model used in this notebook works with.
precision_metric = tf.keras.metrics.Precision(name='precision')
recall_metric = tf.keras.metrics.Recall(name='recall')


class F1Score(tf.keras.metrics.Metric):
    """Streaming F1 score: harmonic mean of precision and recall.

    The metric owns private Precision and Recall instances, so its running
    counts are independent of ``precision_metric`` / ``recall_metric``. The
    value returned by ``result()`` is computed from the counts accumulated
    since the last reset, not from a single batch.
    """

    def __init__(self, name='f1_score', **kwargs):
        super().__init__(name=name, **kwargs)
        self._precision = tf.keras.metrics.Precision()
        self._recall = tf.keras.metrics.Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Fold one batch of labels and predicted probabilities into the running counts."""
        self._precision.update_state(y_true, y_pred, sample_weight=sample_weight)
        self._recall.update_state(y_true, y_pred, sample_weight=sample_weight)

    def result(self):
        """Return F1 for everything seen since the last reset."""
        precision = self._precision.result()
        recall = self._recall.result()
        # K.epsilon() keeps the division defined when precision + recall is 0.
        return 2 * (precision * recall) / (precision + recall + K.epsilon())

    def reset_state(self):
        """Clear the accumulated counts."""
        self._precision.reset_state()
        self._recall.reset_state()


# Custom F1 score metric. It keeps the name `f1_score` and stays callable as
# f1_score(y_true, y_pred), so it can still be listed in model.compile(metrics=[...])
# and is reported under the 'f1_score' key.
f1_score = F1Score(name='f1_score')


In [10]:
# Keras is given the labels as NumPy arrays rather than Python lists
y_train, y_val = (np.array(split_labels) for split_labels in (y_train, y_val))


In [None]:
# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, dropout masks and batch shuffling start from the same state on
# every run. (Some GPU kernels may still be non-deterministic.)
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Build the model: embedding -> two stacked bidirectional LSTMs -> small dense head
model = Sequential([
    # mask_zero=True makes the recurrent layers skip the zero padding added by pad_sequences
    Embedding(vocab_size, embedding_dim, mask_zero=True, input_length=maxlen),
    # First Bidirectional LSTM returns the full sequence so the second one can consume it
    Bidirectional(LSTM(64, return_sequences=True)),
    # Second Bidirectional LSTM returns only its final output
    Bidirectional(LSTM(32)),
    Dense(32, kernel_regularizer=regularizers.l2(0.00001), activation='relu'),
    Dropout(0.5),
    # Single sigmoid unit: probability that the review is positive
    Dense(1, activation='sigmoid'),
])

# Compile the model
model.compile(
    # from_logits=False because the last layer already applies a sigmoid
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer=Adam(learning_rate=0.0001),
    metrics=['accuracy', precision_metric, recall_metric, f1_score]
)

# Train the model, validating on the held-out split after every epoch
training_history = model.fit(
    X_train_pad, y_train,
    epochs=10,
    batch_size=512,
    validation_data=(X_val_pad, y_val),
    verbose=1
)


In [None]:
# Score the trained model on the held-out test reviews
test_labels = np.array(test_labels)
score = model.evaluate(X_test_pad, test_labels, verbose=1)

# Print the evaluation results: `score` is read by position, loss first and
# then the metrics in the order they were passed to compile()
result_names = (
    'Test score or loss',  # Loss
    'Test accuracy',       # Accuracy
    'Test Precision',      # Precision
    'Test Recall',         # Recall
    'Test F1 Score',       # F1 Score
)
for position, result_name in enumerate(result_names):
    print(f'{result_name}: {score[position]:.4f}')


In [None]:
# Accuracy per epoch, training vs. validation
fig, ax = plt.subplots()
ax.plot(training_history.history['accuracy'], label='Train')
ax.plot(training_history.history['val_accuracy'], label='Validation')
ax.set(title='Model accuracy', xlabel='Epoch', ylabel='Accuracy')
ax.legend(loc='upper left')
plt.show()

# Loss per epoch, training vs. validation (y-axis fixed to [0, 1])
fig, ax = plt.subplots()
for history_key, curve_label in (('loss', 'Train'), ('val_loss', 'Validation')):
    ax.plot(training_history.history[history_key], label=curve_label)
ax.set(title='Model loss', xlabel='Epoch', ylabel='Loss', ylim=(0, 1))
ax.legend()
plt.show()

In [None]:
# Training F1 score per epoch (y-axis fixed to [0.5, 1])
fig, ax = plt.subplots()
ax.plot(training_history.history['f1_score'], label='F1 Score')
ax.set(
    title='Training F1 Score',
    xlabel='Epoch',
    ylabel='F1 Score',
    ylim=(0.5, 1),
)
ax.legend(loc='upper left')
plt.show()

# Training precision and recall per epoch on shared axes (y-axis fixed to [0.7, 1])
fig, ax = plt.subplots()
for history_key, curve_label in (('precision', 'Precision'), ('recall', 'Recall')):
    ax.plot(training_history.history[history_key], label=curve_label)
ax.set(
    title='Training Precision and Recall',
    xlabel='Epoch',
    ylabel='Metrics',
    ylim=(0.7, 1),
)
ax.legend(loc='upper left')
plt.show()
