In [1]:
import json
import numpy as np
import pandas as pd
import csv
import gluonnlp as nlp
import multiprocessing as mp
import re

## Load and merge the data
The source is Kaggle, [IMDB Spoiler Dataset](https://www.kaggle.com/rmisra/imdb-spoiler-dataset). The downloaded zip includes two files, we will demonstrate some of their contents below.

In [2]:
df_reviews = pd.read_json('./data/imdb/IMDB_reviews.json', lines=True).\
             drop_duplicates('review_text').sample(frac=1)
df_reviews.review_date = pd.to_datetime(df_reviews.review_date, infer_datetime_format=True)
df_reviews.user_id = df_reviews.user_id.astype('category')
print(df_reviews.shape)
review_fields = [field for field in df_reviews]
df_reviews.tail()

(573385, 7)


Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
168708,2012-05-20,tt1605630,ur4732215,False,I remember when i was just a little kid watchi...,8,The gang is back and im loving it!
511313,1999-11-09,tt0109707,ur0498475,False,The director of great visual masterpieces such...,9,"Funny, touching, inspired, well acted Tim Burt..."
362439,2005-05-02,tt0349683,ur5241654,False,"This King Arthur was the best, most realistic ...",9,Best Arthur I have ever scene.
303744,2006-08-06,tt0244244,ur4555708,False,My head is still shaking in disbelief after kn...,1,Godawful ridiculous no-brain movie
108624,2015-08-20,tt2193021,ur59669062,False,I haven't watched a series live since literall...,10,I hate TV...I love this....


In [3]:
# the total number of spoilers in dataset
num_spoilers = sum(df_reviews.is_spoiler)
print('num of spoilers in all: %d\nration of spoilers in all: %.4f\na dummy classifier can achieve acc of %.4f' \
      % (num_spoilers, num_spoilers / df_reviews.shape[0], \
         1 - num_spoilers / df_reviews.shape[0]))
# only less than 20% of the user comments' explicitly say that they contain spoilers
num_notice = df_reviews.review_text.str.contains('spoiler', case=False).sum() / \
             df_reviews.is_spoiler.sum()
print('a rough estimation of how many users declare their reviews as spoilers: %.4f' % num_notice)

num of spoilers in all: 150856
ration of spoilers in all: 0.2631
a dummy classifier can achieve acc of 0.7369
a rough estimation of how many users declare their reviews as spoilers: 0.1752


In [4]:
tokenizer = nlp.data.SpacyTokenizer('en')
def get_word_count(x):
    return len(tokenizer(x))

with mp.Pool() as pool:
    length_list = pool.map(get_word_count, df_reviews.review_text)

In [5]:
print('the average number of words in a review is:', round(sum(length_list) / df_reviews.shape[0]))

the average number of words in a review is: 303


In [6]:
# imdb information about the movies
df_movies = pd.read_json('./data/imdb/IMDB_movie_details.json', lines=True)
df_movies.release_date = pd.to_datetime(df_movies.release_date, infer_datetime_format=True)
print(df_movies.shape)
df_movies.tail()

(1572, 7)


Unnamed: 0,duration,genre,movie_id,plot_summary,plot_synopsis,rating,release_date
1567,1h 53min,"[Sci-Fi, Thriller]",tt0289879,Evan Treborn grows up in a small town with his...,"In the year 1998, Evan Treborn (Ashton Kutcher...",7.7,2004-01-23
1568,1h 41min,[Drama],tt1723811,Brandon is a 30-something man living in New Yo...,"Brandon (Michael Fassbender) is a successful, ...",7.2,2012-01-13
1569,1h 46min,"[Action, Drama, History]",tt5013056,Evacuation of Allied soldiers from the British...,The film alternates between three different pe...,8.1,2017-07-21
1570,1h 33min,"[Comedy, Drama]",tt0104014/,"For a while now, beautiful 24-year-old Diana B...",,5.3,1992-02-21
1571,1h 32min,"[Drama, Thriller]",tt0114142/,"The marriage of David Burgess, a senior execut...",,4.0,1999-01-29


In [7]:
with mp.Pool() as pool:
    length_list = pool.map(get_word_count, df_movies.plot_summary)
print('the average number of words in a plot summary is %d' % \
      round(sum(length_list) / df_movies.shape[0]))

the average number of words in a plot summary is 120


In [8]:
# join the two dataframe according to movie id
df_reviews = df_reviews.merge(df_movies, on="movie_id",how="left",suffixes=('_review','_movie'))

In [24]:
columns_keep = ['is_spoiler', 'movie_id', 'plot_summary', 'review_summary', 'review_text',]
all_columns = [review for review in df_reviews]
for x in columns_keep:
    all_columns.remove(x)
df_train = pd.DataFrame(columns=['sentence1', 'sentence2', 'label'])
df_train['sentence1'] = pd.Series([re.split('\s*Written by\s*\n', str(plot))[0] for plot in df_reviews['plot_summary']])
df_train['sentence2'] = df_reviews['review_summary'] + ' ' + df_reviews['review_text']
df_train['label'] = df_reviews['is_spoiler']
df_train.tail()

Unnamed: 0,sentence1,sentence2,label
573380,Once again we're plunged into the world of swo...,Don't watch it! Its a piece of sh*t made for m...,False
573381,Garland's novel centers on a young nicotine-ad...,"Read the novel, it's a lot better then the fil...",False
573382,"The story of Seita and Satsuko, two young Japa...",More moving than you could ever expect This fi...,False
573383,Epic adventure Exodus: Gods and Kings is the s...,A MOVIE! and a rarity-a thought provoking movi...,True
573384,"Incarcerated and charged with murder, David Aa...",less than meets the eye (spoilers) This film i...,False


In [31]:
df_train['sentence1'][0]

'Inside a snowflake exists the magical land of Whoville. In Whoville, live the Whos, an almost mutated sort of munchkinlike people. All the Whos love Christmas, yet just outside of their beloved Whoville lives the Grinch. The Grinch is a nasty creature that hates Christmas, and plots to steal it away from the Whos which he equally abhors. Yet a small child, Cindy Lou Who, decides to try befriend the Grinch.'

## Save the processed data for future use

In [32]:
print('# of rows before dropping incomplete entries:', df_train.shape[0])
dt_train=df_train.dropna()
print('# of rows after dropping incomplete entries:', df_train.shape[0])
df_train.to_csv('./data/imdb/train.csv', index=False)
df_train.head()

# of rows before dropping incomplete entries: 573385
# of rows after dropping incomplete entries: 573385


Unnamed: 0,sentence1,sentence2,label
0,Inside a snowflake exists the magical land of ...,Great one but doesn't set up there with the or...,False
1,"In an ancient time, predating the pyramids, th...",TSK: A Mildly Amusing Romp...But No People's E...,False
2,James Bond goes on his first ever mission as a...,Why all the accolades? I have to tell you that...,False
3,The Lost City of Z tells the incredible true s...,Movie fell down I read this book many years ag...,False
4,"This is the tale of Harry Potter, an ordinary ...","Gryffindor 150, Muggle Director 0 After readin...",False
