In [1]:
import json
import numpy as np
import pandas as pd
import csv
import gluonnlp as nlp
import multiprocessing as mp

## Load and merge the data
The source is Kaggle, [IMDB Spoiler Dataset](https://www.kaggle.com/rmisra/imdb-spoiler-dataset). The downloaded zip includes two files, we will demonstrate some of their contents below.

In [2]:
df_reviews = pd.read_json('./data/imdb/IMDB_reviews.json', lines=True).\
             drop_duplicates('review_text').sample(frac=1)
df_reviews.review_date = pd.to_datetime(df_reviews.review_date, infer_datetime_format=True)
df_reviews.user_id = df_reviews.user_id.astype('category')
print(df_reviews.shape)
review_fields = [field for field in df_reviews]
df_reviews.tail()

(573385, 7)


Unnamed: 0,is_spoiler,movie_id,rating,review_date,review_summary,review_text,user_id
503432,False,tt0107362,9,2016-09-04,Waay underrated delightful entertainment...,"I'm giving it a 9, since the movie tickles you...",ur20597848
14719,False,tt0110912,10,1999-02-19,"Appropriately named, excellently crafted.",This movie is excellent. Why? Most important...,ur0220236
333343,False,tt0349205,10,2004-02-16,HILARIOUS!,This movie is great. There are some really fu...,ur3110774
557576,False,tt0169547,10,2000-03-04,The best movie I've ever seen. It opened my ey...,American Beauty is absolutely beautiful. I'm s...,ur0471080
265667,False,tt1396484,8,2017-09-20,Wait. This was good?? (L)IT!,"So, I went and seen IT with my girlfriend and ...",ur53564317


In [3]:
# the total number of spoilers in dataset
num_spoilers = sum(df_reviews.is_spoiler)
print('num of spoilers in all: %d\nration of spoilers in all: %.4f\na dummy classifier can achieve acc of %.4f' \
      % (num_spoilers, num_spoilers / df_reviews.shape[0], \
         1 - num_spoilers / df_reviews.shape[0]))
# only less than 20% of the user comments' explicitly say that they contain spoilers
num_notice = df_reviews.review_text.str.contains('spoiler', case=False).sum() / \
             df_reviews.is_spoiler.sum()
print('a rough estimation of how many users declare their reviews as spoilers: %.4f' % num_notice)

num of spoilers in all: 150856
ration of spoilers in all: 0.2631
a dummy classifier can achieve acc of 0.7369
a rough estimation of how many users declare their reviews as spoilers: 0.1752


In [4]:
tokenizer = nlp.data.SpacyTokenizer('en')
def get_word_count(x):
    return len(tokenizer(x))

with mp.Pool() as pool:
    length_list = pool.map(get_word_count, df_reviews.review_text)

In [5]:
print('the average number of words in a review is:', round(sum(length_list) / df_reviews.shape[0]))

the average number of words in a review is: 303


In [6]:
# imdb information about the movies
df_movies = pd.read_json('./data/imdb/IMDB_movie_details.json', lines=True)
df_movies.release_date = pd.to_datetime(df_movies.release_date, infer_datetime_format=True)
print(df_movies.shape)
df_movies.tail()

(1572, 7)


Unnamed: 0,duration,genre,movie_id,plot_summary,plot_synopsis,rating,release_date
1567,1h 53min,"[Sci-Fi, Thriller]",tt0289879,Evan Treborn grows up in a small town with his...,"In the year 1998, Evan Treborn (Ashton Kutcher...",7.7,2004-01-23
1568,1h 41min,[Drama],tt1723811,Brandon is a 30-something man living in New Yo...,"Brandon (Michael Fassbender) is a successful, ...",7.2,2012-01-13
1569,1h 46min,"[Action, Drama, History]",tt5013056,Evacuation of Allied soldiers from the British...,The film alternates between three different pe...,8.1,2017-07-21
1570,1h 33min,"[Comedy, Drama]",tt0104014/,"For a while now, beautiful 24-year-old Diana B...",,5.3,1992-02-21
1571,1h 32min,"[Drama, Thriller]",tt0114142/,"The marriage of David Burgess, a senior execut...",,4.0,1999-01-29


In [7]:
with mp.Pool() as mp:
    length_list = mp.map(get_word_count, df_movies.plot_summary)
print('the average number of words in a plot summary is %d' % \
      round(sum(length_list) / df_movies.shape[0]))

the average number of words in a plot summary is 120


In [8]:
# join the two dataframe according to movie id
df_reviews = df_reviews.merge(df_movies, on="movie_id",how="left",suffixes=('_review','_movie'))

In [9]:
columns_keep = ['is_spoiler', 'movie_id', 'plot_summary', 'review_summary', 'review_text',]
all_columns = [review for review in df_reviews]
for x in columns_keep:
    all_columns.remove(x)
drop_columns = all_columns
df_train = df_reviews.drop(columns=drop_columns)
df_train['review_text'] = df_reviews['review_summary']+ ' ' + df_reviews['review_text']
df_train = df_train.drop(columns=['review_summary'])
df_train.tail()

Unnamed: 0,is_spoiler,movie_id,review_text,plot_summary
573380,False,tt0107362,Waay underrated delightful entertainment... I'...,Young Danny Madigan is a big fan of Jack Slate...
573381,False,tt0110912,"Appropriately named, excellently crafted. This...",Jules Winnfield (Samuel L. Jackson) and Vincen...
573382,False,tt0349205,HILARIOUS! This movie is great. There are som...,"The Bakers, a family of 14, move from small-to..."
573383,False,tt0169547,The best movie I've ever seen. It opened my ey...,After his death sometime in his forty-third ye...
573384,False,tt1396484,"Wait. This was good?? (L)IT! So, I went and se...","In the Town of Derry, the local kids are disap..."


## Save the processed data for future use

In [10]:
print('# of rows before dropping incomplete entries:', df_train.shape[0])
dt_train=df_train.dropna()
print('# of rows after dropping incomplete entries:', df_train.shape[0])
df_train.to_csv('./data/train.csv', index=False)
df_train.head()

# of rows before dropping incomplete entries: 573385
# of rows after dropping incomplete entries: 573378


Unnamed: 0,is_spoiler,movie_id,review_text,plot_summary
0,False,tt1024648,Slow paced but interesting story based on true...,"In 1979, the American embassy in Iran was inva..."
1,False,tt0914798,"An ethnic propaganda This is a very sick, sci-...",Young Bruno lives a wealthy lifestyle in prewa...
2,False,tt0265208,The most underrated movie of all time! I first...,18-year-old Matthew Kidman is a straight 'A' o...
3,False,tt1951261,This is a fine ending to the series if it is t...,In the aftermath of the death of Alan's father...
4,False,tt0118617,One of the best! Anastasia (1997) is honestly ...,"The daughter of the last Russian Tsar, Nicolas..."
