In [1]:
import json

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

In [2]:
# Read the two IMDB JSON-lines files (one JSON record per line) into DataFrames:
# movie-level details and the individual user reviews.
df_movie_details = pd.read_json(
    path_or_buf="../data/IMDB_movie_details.json",
    lines=True,
)
df_reviews = pd.read_json(
    path_or_buf="../data/IMDB_reviews.json",
    lines=True,
)

In [3]:
# Summarise the reviews frame: per-column non-null counts, dtypes and memory usage.
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573913 entries, 0 to 573912
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   review_date     573913 non-null  object
 1   movie_id        573913 non-null  object
 2   user_id         573913 non-null  object
 3   is_spoiler      573913 non-null  bool  
 4   review_text     573913 non-null  object
 5   rating          573913 non-null  int64 
 6   review_summary  573913 non-null  object
dtypes: bool(1), int64(1), object(5)
memory usage: 26.8+ MB


In [4]:
# Preview the first five reviews to see what each column looks like.
df_reviews.head(n=5)

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"


In [5]:
# Count missing values per column (isna is the canonical spelling of isnull).
df_reviews.isna().sum()

review_date       0
movie_id          0
user_id           0
is_spoiler        0
review_text       0
rating            0
review_summary    0
dtype: int64

In [6]:
# Class balance of the target: how many reviews are / are not flagged as spoilers.
df_reviews["is_spoiler"].value_counts()

False    422989
True     150924
Name: is_spoiler, dtype: int64

In [7]:
# Build the word index from the review texts; num_words=1000 caps the
# vocabulary that is used later when texts are converted to id sequences.
# NOTE(review): the tokenizer is fitted on every review, including the ones
# that end up in the held-out split — consider fitting on training texts only.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(texts=df_reviews["review_text"])

In [8]:
# Hold out 20% of the reviews for evaluation.
# - random_state pins the shuffle so the split (and every downstream metric)
#   is reproducible across kernel restarts.
# - stratify keeps the spoiler / non-spoiler ratio identical in both splits;
#   the classes are imbalanced (roughly 26% spoilers), so an unstratified
#   split can drift between runs.
# The labels are cast from bool to int (0/1) for the binary classifier.
X_train, X_test, y_train, y_test = train_test_split(
    df_reviews["review_text"],
    df_reviews["is_spoiler"].astype("int"),
    test_size=0.2,
    random_state=42,
    stratify=df_reviews["is_spoiler"],
)

In [9]:
# Encode each review as a list of integer word ids using the fitted tokenizer.
train_token = tokenizer.texts_to_sequences(texts=X_train)
test_token = tokenizer.texts_to_sequences(texts=X_test)

In [13]:
# Sanity check: show the integer-encoded form of the first training review.
print(train_token[0])

[11, 12, 643, 17, 48, 81, 2, 3, 50, 234, 923, 10, 8, 13, 7, 44, 39, 9, 57, 17, 31, 243, 10, 5, 416, 57, 5, 80, 8, 63, 32, 5, 323, 42, 596, 326, 1, 4, 482, 3, 221, 353, 29, 104, 339, 79, 330, 17, 140, 2, 36, 786, 889, 30, 1, 119, 17, 3, 243, 43, 14, 7, 11, 453, 28, 10, 225, 35, 1, 407, 4, 1, 18, 8, 564, 278, 1, 167, 29, 8, 93, 116, 47, 1, 18, 243, 13, 1, 182, 16, 11, 12, 492, 32, 9, 71, 31, 243, 10, 17, 29, 1, 961, 31, 243, 1, 18, 3, 50, 12, 8, 63, 372, 236, 35, 11, 28]


In [14]:
# Force every sequence to exactly 20 ids. The defaults are spelled out here:
# shorter reviews are zero-padded at the front, longer ones are cut from the
# front, so only the LAST 20 tokens of a long review survive.
padded_train = pad_sequences(
    sequences=train_token, maxlen=20, padding="pre", truncating="pre"
)
padded_test = pad_sequences(
    sequences=test_token, maxlen=20, padding="pre", truncating="pre"
)

In [15]:
# Inspect the first padded training sequence (a fixed-length row of 20 token ids).
padded_train[0]

array([243,  10,  17,  29,   1, 961,  31, 243,   1,  18,   3,  50,  12,
         8,  63, 372, 236,  35,  11,  28])

In [17]:
# Binary spoiler classifier: embedding -> LSTM -> dense hidden layer -> sigmoid.
model = tf.keras.models.Sequential([
    # input_dim is tied to the tokenizer's vocabulary cap instead of a
    # hard-coded 2000: the tokenizer only emits ids below num_words (0 is the
    # padding id), so a larger table just allocates rows that are never used.
    # input_length matches the maxlen used when padding the sequences.
    tf.keras.layers.Embedding(
        input_dim=tokenizer.num_words, output_dim=128, input_length=20
    ),
    tf.keras.layers.LSTM(128),
    tf.keras.layers.Dense(128, activation='relu'),
    # Single sigmoid unit -> probability that the review is a spoiler.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [18]:
# Configure training: Adam optimizer, binary cross-entropy loss (matches the
# single sigmoid output), and accuracy reported as the monitoring metric.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"],
)

In [19]:
# Train for 10 epochs on the padded training sequences; the held-out split is
# passed as validation data so its loss/accuracy are reported after each epoch.
model.fit(
    x=padded_train,
    y=y_train,
    validation_data=(padded_test, y_test),
    epochs=10,
)

Epoch 1/10

KeyboardInterrupt: 

In [None]:
# Score the model on the TRAINING data (silently) and display the accuracy.
# NOTE(review): this measures fit to data the model has already seen; use
# padded_test / y_test to estimate generalisation.
loss, accuracy = model.evaluate(
    x=padded_train,
    y=y_train,
    verbose=False,
)
accuracy