In [1]:
import json
import numpy as np
import pandas as pd
import csv
import gluonnlp as nlp
import multiprocessing as mp
import re

## Load and merge the data
The source is Kaggle, [IMDB Spoiler Dataset](https://www.kaggle.com/rmisra/imdb-spoiler-dataset). The downloaded zip includes two files, we will demonstrate some of their contents below.

In [2]:
df_reviews = pd.read_json('./data/imdb/IMDB_reviews.json', lines=True).\
             drop_duplicates('review_text').sample(frac=1)
df_reviews.review_date = pd.to_datetime(df_reviews.review_date, infer_datetime_format=True)
df_reviews.user_id = df_reviews.user_id.astype('category')
print(df_reviews.shape)
review_fields = [field for field in df_reviews]
df_reviews.tail()

(573385, 7)


Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
215749,2014-09-22,tt2872732,ur55379996,False,Went to this movie with several family members...,8,Brilliant
186531,2015-09-22,tt1454468,ur10930023,False,I know this flick is classified as science fic...,2,As vacuous as space itself
542919,2002-05-18,tt0118571,ur1639009,False,A1 acting and casting. Story-line excellent b...,10,Excellently acted & portrayed. Slightly futur...
420259,2007-12-29,tt0467406,ur17896607,False,I've come to expect good things from Fox Searc...,10,a memorable piece of cinema
529592,2007-03-04,tt0117731,ur1054286,True,"It's a mystery to many people, particularly ""S...",8,One of the Best Trek Films.


In [3]:
# the total number of spoilers in dataset
num_spoilers = sum(df_reviews.is_spoiler)
print('num of spoilers in all: %d\nration of spoilers in all: %.4f\na dummy classifier can achieve acc of %.4f' \
      % (num_spoilers, num_spoilers / df_reviews.shape[0], \
         1 - num_spoilers / df_reviews.shape[0]))
# only less than 20% of the user comments' explicitly say that they contain spoilers
num_notice = df_reviews.review_text.str.contains('spoiler', case=False).sum() / \
             df_reviews.is_spoiler.sum()
print('a rough estimation of how many users declare their reviews as spoilers: %.4f' % num_notice)

num of spoilers in all: 150856
ration of spoilers in all: 0.2631
a dummy classifier can achieve acc of 0.7369
a rough estimation of how many users declare their reviews as spoilers: 0.1752


In [None]:
tokenizer = nlp.data.SpacyTokenizer('en')
def get_word_count(x):
    return len(tokenizer(x))

with mp.Pool() as pool:
    length_list = pool.map(get_word_count, df_reviews.review_text)

In [5]:
print('the average number of words in a review is:', round(sum(length_list) / df_reviews.shape[0]))

the average number of words in a review is: 303


In [3]:
# imdb information about the movies
df_movies = pd.read_json('./data/imdb/IMDB_movie_details.json', lines=True)
df_movies.release_date = pd.to_datetime(df_movies.release_date, infer_datetime_format=True)
print(df_movies.shape)
df_movies.tail()

(1572, 7)


Unnamed: 0,movie_id,plot_summary,duration,genre,rating,release_date,plot_synopsis
1567,tt0289879,Evan Treborn grows up in a small town with his...,1h 53min,"[Sci-Fi, Thriller]",7.7,2004-01-23,"In the year 1998, Evan Treborn (Ashton Kutcher..."
1568,tt1723811,Brandon is a 30-something man living in New Yo...,1h 41min,[Drama],7.2,2012-01-13,"Brandon (Michael Fassbender) is a successful, ..."
1569,tt5013056,Evacuation of Allied soldiers from the British...,1h 46min,"[Action, Drama, History]",8.1,2017-07-21,The film alternates between three different pe...
1570,tt0104014/,"For a while now, beautiful 24-year-old Diana B...",1h 33min,"[Comedy, Drama]",5.3,1992-02-21,
1571,tt0114142/,"The marriage of David Burgess, a senior execut...",1h 32min,"[Drama, Thriller]",4.0,1999-01-29,


In [4]:
with mp.Pool() as pool:
    length_list = pool.map(get_word_count, df_movies.plot_synopsis)
print('the average number of words in a plot synopsis is %d' % \
      round(sum(length_list) / df_movies.shape[0]))

NameError: name 'get_word_count' is not defined

In [5]:
# join the two dataframe according to movie id
df_reviews = df_reviews.merge(df_movies, on="movie_id", how="left", suffixes=('_review','_movie'))

In [6]:
df_train = pd.DataFrame(columns=['movie_id', 'sentence1', 'sentence2', 'label'])

df_train['movie_id'] = df_reviews['movie_id']

summaries = pd.Series([re.split('\s*Written by\s*\n', str(plot))[0] for plot in df_reviews['plot_summary']])
synopses = pd.Series([str(synopsis) for synopsis in df_reviews['plot_synopsis']])
'''
num=0
for synopsis in synopses:
    if len(synopsis)>10:
        num+=1
print(num)
-> output: 538318, most of them has synopses
'''
df_train['sentence1'] = pd.Series([synopses[i] if len(synopses[i])>len(summaries[i]) else summaries[i] \
                                  for i in range(len(synopses))])
df_train['sentence2'] = df_reviews['review_summary'] + ' ' + df_reviews['review_text']
df_train['label'] = df_reviews['is_spoiler']
df_train.tail()

Unnamed: 0,movie_id,sentence1,sentence2,label
573380,tt2872732,"In the opening shot, we see a cell split up in...",Brilliant Went to this movie with several fami...,False
573381,tt1454468,"In 2014/2015, bio-medical engineer Dr. Ryan St...",As vacuous as space itself I know this flick i...,False
573382,tt0118571,The film begins with an American Special Force...,Excellently acted & portrayed. Slightly futur...,False
573383,tt0467406,The film opens with Juno (Ellen Page) staring ...,a memorable piece of cinema I've come to expec...,False
573384,tt0117731,Captain Jean-Luc Picard awakens from a nightma...,One of the Best Trek Films. It's a mystery to ...,True


In [7]:
df_train['movie_id'][233], df_train['sentence1'][233]

('tt0245844',
 "In the turbulent days in which France was transitioning away from Napoleonic rule, Edmond Dantes (Caviezel) and his closest friend, Fernand Mondego (Pearce), aspire to gain the same two things: the next captaincy of a ship in Morel's (Godfrey) Marseille-based shipping business and the hands of the lovely Mercedes Iguanada (Dominczyk).Dantes and Mondego are diverted to Elba on a shipping mission because their captain requires medical attention. Assistance comes, unexpectedly, in the form of the personal physician of the exiled Napoleon (Norton). In return for the use of his doctor, Napoleon demands that Dantes deliver a letter for him and that the mission and the letter be kept a secret. Unknown to the illiterate Dantes, the letter will provide Bonapartists in Marseille information of pertinence to a possible rescue of Napoleon. Also unknown to him, Fernand has discovered and read the letter and has full knowledge of its contents.On his return to France, Dantes's fortune

## Save the processed data for future use

In [8]:
print('# of rows before dropping incomplete entries:', df_train.shape[0])
dt_train=df_train.dropna()
print('# of rows after dropping incomplete entries:', df_train.shape[0])
df_train.to_csv('./data/imdb/train.csv', index=False)
df_train.head()

# of rows before dropping incomplete entries: 573385
# of rows after dropping incomplete entries: 573385


Unnamed: 0,movie_id,sentence1,sentence2,label
0,tt0097576,"The story opens in Monument Valley, Utah, in 1...",Chemistry between Ford and Connery puts this f...,False
1,tt2488496,"Luke Skywalker has vanished. In his absence, t...","You did it, J.J. ... You have ruined Star Wars...",True
2,tt0407887,"In voiceover, Irish-American mobster Frank Cos...","No respect! This remake of ""Internal Affairs"",...",False
3,tt0209144,This is a complex story about Leonard Shelby (...,Scientific and psychological thriller. Who kne...,True
4,tt1285016,"In October 2003, Harvard University student Ma...",A Grossly Overrated But Still Very Good Film A...,False
