In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
import string

## 1. Load the dataset

In [2]:
# load datasets given
df_movie_details = pd.read_json("../data/IMDB_movie_details.json", lines = True)
df_reviews = pd.read_json("../data/IMDB_reviews.json", lines = True)

In [3]:
df_movie_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1572 entries, 0 to 1571
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   movie_id       1572 non-null   object 
 1   plot_summary   1572 non-null   object 
 2   duration       1572 non-null   object 
 3   genre          1572 non-null   object 
 4   rating         1572 non-null   float64
 5   release_date   1572 non-null   object 
 6   plot_synopsis  1572 non-null   object 
dtypes: float64(1), object(6)
memory usage: 86.1+ KB


In [4]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573913 entries, 0 to 573912
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   review_date     573913 non-null  object
 1   movie_id        573913 non-null  object
 2   user_id         573913 non-null  object
 3   is_spoiler      573913 non-null  bool  
 4   review_text     573913 non-null  object
 5   rating          573913 non-null  int64 
 6   review_summary  573913 non-null  object
dtypes: bool(1), int64(1), object(5)
memory usage: 26.8+ MB


In [5]:
df_reviews = pd.read_pickle("../data/tokenized_reviews.pkl.gz", compression = 'gzip')

## 2. Text Preprocessing

### Tokenize review texts

In [3]:
# TOKENIZING WILL TAKE ~15 MINUTES
# If you want to save after tokenizing feel free to do so to save time in tokenizing again
df_reviews['tokenized_summary'] = list(map(word_tokenize, df_reviews['review_summary']))
df_reviews['tokenized_reviews'] = list(map(word_tokenize, df_reviews['review_text']))

### Removing stop words and punctuations from the list

In [6]:
# get stop words
stop_words_and_punctuations = set(stopwords.words('english') + list(string.punctuation))

In [7]:
# remove stop words and punctuations from the tokenized list
df_reviews['tokenized_summary'] = list(map(lambda x: [word.lower() for word in x if word.lower() not in stop_words_and_punctuations], df_reviews['tokenized_summary']))
df_reviews['tokenized_reviews'] = list(map(lambda x: [word.lower() for word in x if word.lower() not in stop_words_and_punctuations], df_reviews['tokenized_reviews']))

### Stemming or Lemmatisation -- (to be implemented)

In [None]:
# stem or lemmatise words
stemmer = PorterStemmer()
df_reviews['tokenized_summary'] = list(map(lambda x: [stemmer.stem(word) for word in x], df_reviews['tokenized_summary']))
df_reviews['tokenized_reviews'] = list(map(lambda x: [stemmer.stem(word) for word in x], df_reviews['tokenized_reviews']))

### Save the new dataset

In [8]:
# save changes made to original dataset to save time tokenizing etc
df_reviews.to_pickle("../data/cleaned_reviews.pkl.gz", compression = 'gzip')

## 3. Model Building

In [None]:
# Load the previously saved dataset
df_reviews = pd.read_pickle("../data/cleaned_reviews.pkl.gz", compression = 'gzip')

### Transform texts to numbers

In [None]:
df_reviews["text_tokenized"] = list(map(lambda x: ' '.join(x), df_reviews['tokenized_reviews']))

In [11]:
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(df_reviews["text_tokenized"])

### Create Model

In [12]:
X_train, X_test, y_train, y_test = train_test_split(bow, df_reviews.loc[:, 'is_spoiler'], test_size = 0.2)

In [13]:
logistic_model = LogisticRegression(max_iter = 1e3)
logistic_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=1000.0)

In [14]:
logistic_model.score(X_test, y_test)

0.7638761837554342