In [44]:
import json
import numpy as np
import pandas as pd
import csv
import gluonnlp as nlp
import multiprocessing as mp

## Load and merge the data
The source is Kaggle, [IMDB Spoiler Dataset](https://www.kaggle.com/rmisra/imdb-spoiler-dataset). The downloaded zip includes two files, we will demonstrate some of their contents below.

In [21]:
df_reviews = pd.read_json('./data/IMDB_reviews.json', lines=True).\
             drop_duplicates('review_text').sample(frac=1)
df_reviews.review_date = pd.to_datetime(df_reviews.review_date, infer_datetime_format=True)
df_reviews.user_id = df_reviews.user_id.astype('category')
print(df_reviews.shape)
review_fields = [field for field in df_reviews]
df_reviews.tail()

(573385, 7)


Unnamed: 0,is_spoiler,movie_id,rating,review_date,review_summary,review_text,user_id
366001,False,tt0308644,9,2005-03-30,Believe the hype-it's true!,I'm not surprised. Why you ask? Johnny Depp pr...,ur5058511
41019,True,tt0095327,10,2014-08-03,A Moving Experience,"Beware, this is not a movie for people with we...",ur42928079
115137,False,tt1475582,10,2017-03-11,Sherlock!!!!!,What a TV series!!! What can I say about it???...,ur50846362
94000,True,tt0093779,9,2013-05-09,Whimsical and Charming,"The book was long and boring, but the film was...",ur42538340
499575,True,tt0105629,8,2004-05-18,A movie with a message and no way to say it (P...,I love this movie. I don't know what it is abo...,ur3460521


In [11]:
# the total number of spoilers in dataset
num_spoilers = sum(df_reviews.is_spoiler)
print('num of spoilers in all: %d\nration of spoilers in all: %.4f\na dummy classifier can achieve acc of %.4f' \
      % (num_spoilers, num_spoilers / df_reviews.shape[0], \
         1 - num_spoilers / df_reviews.shape[0]))
# only less than 20% of the user comments' explicitly say that they contain spoilers
num_notice = df_reviews.review_text.str.contains('spoiler', case=False).sum() / \
             df_reviews.is_spoiler.sum()
print('a rough estimation of how many users declare their reviews as spoilers: %.4f' % num_notice)

num of spoilers in all: 150856
ration of spoilers in all: 0.2631
a dummy classifier can achieve acc of 0.7369
a rough estimation of how many users declare their reviews as spoilers: 0.1752


In [25]:
tokenizer = nlp.data.SpacyTokenizer('en')
def get_word_count(x):
    return len(tokenizer(x))

with mp.Pool() as mp:
    length_list = mp.map(get_word_count, df_reviews.review_text)

[[61, 289, 654, 166, 157, 681, 201, 348, 357, 236, 232, 280, 89, 431, 200, 302, 149, 335, 90, 253, 224, 236, 138, 121, 73, 278, 138, 208, 106, 1044, 209, 179, 1130, 437, 225, 293, 218, 284, 274, 66, 127, 212, 216, 82, 310, 448, 342, 417, 313, 320, 374, 161, 45, 243, 139, 208, 100, 362, 283, 627, 68, 138, 639, 168, 423, 235, 251, 80, 300, 31, 246, 81, 236, 148, 297, 964, 191, 483, 134, 129, 767, 575, 553, 238, 168, 343, 163, 420, 193, 159, 593, 464, 170, 169, 298, 718, 134, 1002, 72, 135, 162, 264, 429, 388, 218, 495, 59, 183, 86, 267, 169, 263, 581, 192, 84, 71, 244, 262, 106, 301, 498, 151, 117, 681, 228, 866, 353, 208, 92, 255, 222, 257, 286, 162, 114, 138, 284, 254, 585, 189, 254, 156, 115, 205, 472, 471, 158, 431, 180, 307, 144, 244, 734, 231, 1129, 311, 749, 318, 560, 604, 660, 190, 171, 86, 158, 295, 213, 212, 187, 168, 340, 307, 1117, 228, 94, 383, 122, 203, 249, 103, 195, 164, 329, 139, 375, 1009, 470, 144, 528, 145, 162, 155, 540, 554, 144, 189, 155, 489, 345, 129, 461, 285, 2

In [33]:
print('the average number of words in a review is %d' % \
      round(sum(length_list[0]) / df_reviews.shape[0]))

the average number of words in a review is 303


In [39]:
# imdb information about the movies
df_movies = pd.read_json('./data/IMDB_movie_details.json', lines=True)
df_movies.release_date = pd.to_datetime(df_movies.release_date, infer_datetime_format=True)
print(df_movies.shape)
df_movies.tail()

(1572, 7)


Unnamed: 0,duration,genre,movie_id,plot_summary,plot_synopsis,rating,release_date
1567,1h 53min,"[Sci-Fi, Thriller]",tt0289879,Evan Treborn grows up in a small town with his...,"In the year 1998, Evan Treborn (Ashton Kutcher...",7.7,2004-01-23
1568,1h 41min,[Drama],tt1723811,Brandon is a 30-something man living in New Yo...,"Brandon (Michael Fassbender) is a successful, ...",7.2,2012-01-13
1569,1h 46min,"[Action, Drama, History]",tt5013056,Evacuation of Allied soldiers from the British...,The film alternates between three different pe...,8.1,2017-07-21
1570,1h 33min,"[Comedy, Drama]",tt0104014/,"For a while now, beautiful 24-year-old Diana B...",,5.3,1992-02-21
1571,1h 32min,"[Drama, Thriller]",tt0114142/,"The marriage of David Burgess, a senior execut...",,4.0,1999-01-29


In [45]:
with mp.Pool() as mp:
    length_list = mp.map(get_word_count, df_movies.plot_summary)
print('the average number of words in a plot summary is %d' % \
      round(sum(length_list) / df_movies.shape[0]))

the average number of words in a plot summary is 120


In [5]:
# join the two dataframe according to movie id
df_reviews = df_reviews.merge(df_movies, on="movie_id",how="left",suffixes=('_review','_movie'))

In [6]:
columns_keep = ['is_spoiler', 'movie_id', 'plot_summary', 'review_summary', 'review_text',]
all_columns = [review for review in df_reviews]
for x in columns_keep:
    all_columns.remove(x)
drop_columns = all_columns
df_train = df_reviews.drop(columns=drop_columns)
df_train['review_text'] = df_reviews['review_summary']+ ' ' + df_reviews['review_text']
df_train = df_train.drop(columns=['review_summary'])
df_train.tail()

Unnamed: 0,is_spoiler,movie_id,review_text,plot_summary
573380,True,tt0472043,Spectacle vs. Story -- and story loses From te...,"In the Maya civilization, a peaceful tribe is ..."
573381,False,tt0227538,Great! Spy Kids is a wonderful kids movie. I ...,Gregorio and Ingrid are the two greatest secre...
573382,False,tt0903624,An abject disaster. An unmitigated mess of a ...,Bilbo Baggins is swept into a quest to reclaim...
573383,True,tt0122690,Ronin From director John Frankenheimer (Birdma...,Ronin is the Japanese word used for Samurai wi...
573384,False,tt0208092,better than pulp fiction This film is unquesti...,Turkish and his close friend/accomplice Tommy ...


## Save the processed data for future use

In [8]:
df_train.to_csv('./data/train.csv', index=False)