In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import recall_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.svm import SVC
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional, Dropout, SimpleRNN, InputLayer
from tensorflow.keras.regularizers import L2
import string

In [2]:
df_movie_details = pd.read_json("../data/IMDB_movie_details.json", lines = True)
df_reviews = pd.read_json("../data/IMDB_reviews.json", lines = True)

## 1. Data Processing

### 1.1 Data Cleaning

In [50]:
# Tokenization, lowercase, remove stop words, lemmatization
def tokenize(text):
    return word_tokenize(text)
def lower_and_remove_stop_words(tokenized_text):
    stop_words_and_punctuations = set(stopwords.words('english') + list(string.punctuation))
    return [word.lower() for word in tokenized_text if word.lower() not in stop_words_and_punctuations]
def lemmatize(word):
    lemmatizer = WordNetLemmatizer()
    lemmatized = lemmatizer.lemmatize(lemmatizer.lemmatize(lemmatizer.lemmatize(lemmatizer.lemmatize(word, pos = "v"), pos = "a"), pos = "s"))
    return lemmatized

def clean(text):
    tokenized = tokenize(text)
    no_stop_words = lower_and_remove_stop_words(tokenized)
    lemmatized = [lemmatize(word) for word in no_stop_words]
    return lemmatized

df_reviews["tokenized_reviews"] = df_reviews["review_text"].apply(clean)
df_reviews["text_tokenized"] = df_reviews['tokenized_reviews'].apply(lambda x: ' '.join(x))

In [51]:
# Save cleaned dataset
df_reviews.to_pickle("../data/cleaned_dataset.pkl.gz", compression = 'gzip')

In [3]:
# Load cleaned dataset
df_reviews = pd.read_pickle("../data/cleaned_dataset.pkl.gz", compression = 'gzip')

### 1.2 Train Test Split

In [4]:
# Split into training and testing data before further process
df_movie_details["release_date"] = pd.to_datetime(df_movie_details["release_date"])

train_movies = df_movie_details["movie_id"].loc[df_movie_details["release_date"] < pd.Timestamp("2015-01-01")]
test_movies = df_movie_details["movie_id"].loc[df_movie_details["release_date"] >= pd.Timestamp("2015-01-01")]

df_train = df_reviews.loc[df_reviews["movie_id"].isin(train_movies)]
df_test = df_reviews.loc[df_reviews["movie_id"].isin(test_movies)]

### 1.3 Imbalanced Data Mitigation

In [5]:
# Undersample both training and testing dataset separately
df_train_spoiler = df_train.loc[df_reviews['is_spoiler'] == True]
df_train_non_spoiler = df_train.loc[df_reviews['is_spoiler'] == False]

df_train_non_spoiler = df_train_non_spoiler.sample(n = len(df_train_spoiler), random_state = 42)
df_train = pd.concat([df_train_spoiler, df_train_non_spoiler])

df_test_spoiler = df_test.loc[df_reviews['is_spoiler'] == True]
df_test_non_spoiler = df_test.loc[df_reviews['is_spoiler'] == False]

df_test_non_spoiler = df_test_non_spoiler.sample(n = df_test_spoiler["is_spoiler"].count(), random_state = 42)
df_test = pd.concat([df_test_spoiler, df_test_non_spoiler])

## 2. Feature Engineering

### 2.1 Bag of Words

In [6]:
bow_vectorizer = CountVectorizer()
bow_train = bow_vectorizer.fit_transform(df_train["text_tokenized"])
bow_test = bow_vectorizer.transform(df_test["text_tokenized"])

### 2.2 TF-IDF

In [7]:
tfidfvectorizer = TfidfVectorizer()
tfidf_train = tfidfvectorizer.fit_transform(df_train['text_tokenized'])
tfidf_test = tfidfvectorizer.transform(df_test['text_tokenized'])

### 2.3 Word Embedding

In [18]:
tokenizer = Tokenizer(num_words = 10000, oov_token = "<OOV>")
tokenizer.fit_on_texts(df_train["text_tokenized"])

train_sequences = tokenizer.texts_to_sequences(df_train["text_tokenized"])
test_sequences = tokenizer.texts_to_sequences(df_test["text_tokenized"])

train_padded = pad_sequences(train_sequences, maxlen = 50, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen = 50, padding='post', truncating='post')

In [None]:
embedding_train = Embedding(input_dim = 10000, output_dim = 128, input_length = 50)(train_padded)
embedding_test = Embedding(input_dim = 10000, output_dim = 128, input_length = 50)(test_padded)

In [9]:
embedding_train = embedding_train.numpy().mean(axis = 2)
embedding_test = embedding_test.numpy().mean(axis = 2)

## 3. Model Building

In [8]:
# logistic regression model
bow_logistic_model = LogisticRegression(max_iter = 1e5)
tfidf_logistic_model = LogisticRegression(max_iter = 1e5)
embedding_logistic_model = LogisticRegression(max_iter = 1e5)

In [9]:
bow_logistic_model.fit(bow_train, df_train["is_spoiler"])
y_test = df_test["is_spoiler"]
y_pred = bow_logistic_model.predict(bow_test)

print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))

print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix
[[13657  7178]
 [ 7769 13066]]
Classification Report
              precision    recall  f1-score   support

       False       0.64      0.66      0.65     20835
        True       0.65      0.63      0.64     20835

    accuracy                           0.64     41670
   macro avg       0.64      0.64      0.64     41670
weighted avg       0.64      0.64      0.64     41670



In [10]:
tfidf_logistic_model.fit(tfidf_train, df_train["is_spoiler"])
y_test = df_test["is_spoiler"]
y_pred = tfidf_logistic_model.predict(tfidf_test)

print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))

print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix
[[13920  6915]
 [ 6883 13952]]
Classification Report
              precision    recall  f1-score   support

       False       0.67      0.67      0.67     20835
        True       0.67      0.67      0.67     20835

    accuracy                           0.67     41670
   macro avg       0.67      0.67      0.67     41670
weighted avg       0.67      0.67      0.67     41670



In [14]:
embedding_logistic_model.fit(embedding_train, df_train["is_spoiler"])
y_test = df_test["is_spoiler"]
y_pred = embedding_logistic_model.predict(embedding_test)

print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))

print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix
[[11294  9541]
 [10611 10224]]
Classification Report
              precision    recall  f1-score   support

       False       0.52      0.54      0.53     20835
        True       0.52      0.49      0.50     20835

    accuracy                           0.52     41670
   macro avg       0.52      0.52      0.52     41670
weighted avg       0.52      0.52      0.52     41670



In [30]:
# svm_model
bow_svm_model = SVC(kernel = "sigmoid", max_iter = 1e4)
tfidf_svm_model = SVC(kernel = "sigmoid", max_iter = 1e4)

In [None]:
bow_svm_model.fit(bow_train, df_train["is_spoiler"])
y_test = df_test["is_spoiler"]
y_pred = bow_svm_model.predict(bow_test)

print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))

print("Classification Report")
print(classification_report(y_test, y_pred))

In [None]:
tfidf_svm_model.fit(tfidf_train, df_train["is_spoiler"])
y_test = df_test["is_spoiler"]
y_pred = tfidf_svm_model.predict(tfidf_test)

print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))

print("Classification Report")
print(classification_report(y_test, y_pred))

In [12]:
tfidf_coo_train = tfidf_train.tocoo()
indices = np.mat([tfidf_coo_train.row, tfidf_coo_train.col]).transpose()
tfidf_sparse_train = tf.sparse.reorder(tf.SparseTensor(indices, tfidf_coo_train.data, tfidf_coo_train.shape))

tfidf_coo_test = tfidf_test.tocoo()
indices = np.mat([tfidf_coo_test.row, tfidf_coo_test.col]).transpose()
tfidf_sparse_test = tf.sparse.reorder(tf.SparseTensor(indices, tfidf_coo_test.data, tfidf_coo_test.shape))

In [13]:
model = Sequential()
model.add(InputLayer(input_shape = (tfidf_train.shape[1],), sparse = True))
model.add(Dense(128, activation = "relu", kernel_regularizer = L2(1e-3)))
model.add(Dropout(0.2))
model.add(Dense(1, activation = "sigmoid"))

model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])

In [15]:
model.fit(tfidf_sparse_train, df_train["is_spoiler"].to_numpy(), validation_data = (tfidf_sparse_test, df_test["is_spoiler"].to_numpy()), epochs = 1)

  28/8131 [..............................] - ETA: 38:52 - loss: 0.7304 - accuracy: 0.5513

KeyboardInterrupt: 

In [16]:
# RNN model
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim = 10000, output_dim = 128, input_length = 50))
rnn_model.add(SimpleRNN(units = 128, kernel_regularizer = L2(1e-3)))
rnn_model.add(Dropout(0.2))
rnn_model.add(Dense(128, activation = "relu"))
rnn_model.add(Dense(1, activation='sigmoid'))

rnn_model.compile(optimizer = "adam", loss='binary_crossentropy', metrics=['accuracy'])

In [21]:
rnn_model.fit(train_padded, df_train["is_spoiler"], validation_data = (test_padded, df_test["is_spoiler"]), epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2bea9fa3a90>

In [27]:
y_test = df_test["is_spoiler"]
y_pred = rnn_model.predict(test_padded, verbose = False) > 0.5

print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))

print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix
[[19761  1074]
 [19245  1590]]
Classification Report
              precision    recall  f1-score   support

       False       0.51      0.95      0.66     20835
        True       0.60      0.08      0.14     20835

    accuracy                           0.51     41670
   macro avg       0.55      0.51      0.40     41670
weighted avg       0.55      0.51      0.40     41670



In [19]:
# LSTM model
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim = 10000, output_dim = 128, input_length = 50))
lstm_model.add(Bidirectional(LSTM(units = 128, kernel_regularizer = L2(1e-3))))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(128, activation = "relu"))
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(optimizer = "adam", loss='binary_crossentropy', metrics=['accuracy'])

In [22]:
lstm_model.fit(train_padded, df_train["is_spoiler"], validation_data = (test_padded, df_test["is_spoiler"]), epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2beb1a52640>

In [26]:
y_test = df_test["is_spoiler"]
y_pred = lstm_model.predict(test_padded, verbose = False) > 0.5

print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))

print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix
[[11812  9023]
 [ 8032 12803]]
Classification Report
              precision    recall  f1-score   support

       False       0.60      0.57      0.58     20835
        True       0.59      0.61      0.60     20835

    accuracy                           0.59     41670
   macro avg       0.59      0.59      0.59     41670
weighted avg       0.59      0.59      0.59     41670

