In [1]:
import json
import numpy as np
import pandas as pd
import csv
import gluonnlp as nlp
import multiprocessing as mp

## Load and merge the data
The data comes from Kaggle's [IMDB Spoiler Dataset](https://www.kaggle.com/rmisra/imdb-spoiler-dataset). The downloaded zip contains two files; we show a sample of each below.

In [2]:
# Load the reviews (one JSON object per line), drop rows whose review text is
# duplicated, and shuffle the rows.
# The shuffle is seeded so that the row order -- and everything derived from
# it further down -- is the same on every Restart & Run All.
SHUFFLE_SEED = 42
df_reviews = (
    pd.read_json('./data/IMDB_reviews.json', lines=True)
    .drop_duplicates('review_text')
    .sample(frac=1, random_state=SHUFFLE_SEED)
)
# NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0 (it only
# triggers a warning there); kept so older pandas versions parse at the same speed.
df_reviews.review_date = pd.to_datetime(df_reviews.review_date, infer_datetime_format=True)
# user ids repeat across reviews, so store them as a categorical
df_reviews.user_id = df_reviews.user_id.astype('category')
print(df_reviews.shape)
# column names of the review table, in their original order
review_fields = [field for field in df_reviews]
df_reviews.tail()

(573385, 7)


Unnamed: 0,is_spoiler,movie_id,rating,review_date,review_summary,review_text,user_id
151931,False,tt0478304,1,2011-11-05,"Someone owes me 139 minutes of my life back, I...","Very seldom, and by seldom I mean never, do I ...",ur29000275
167297,True,tt0948470,4,2012-07-20,Didn't really do it for me,"I didn't really like this version, don't get m...",ur23522411
208761,False,tt1843866,9,2015-08-31,Till the end of the road.,"Stonking! Now this is more like it, after the ...",ur16161013
365587,False,tt0369339,10,2004-12-30,Simply amazing,"For a ""hit-man"" movie I give this film a 10 ou...",ur4187577
179746,False,tt1170358,9,2013-12-28,As Awesome as the first part.,I was satisfied with the first entry in the tr...,ur34875353


In [3]:
# Class balance: number of reviews flagged as spoilers, their share of the
# dataset, and the accuracy of a baseline that always predicts "not a spoiler".
total_reviews = df_reviews.shape[0]
num_spoilers = df_reviews['is_spoiler'].sum()
spoiler_ratio = num_spoilers / total_reviews
balance_report = 'num of spoilers in all: %d\nration of spoilers in all: %.4f\na dummy classifier can achieve acc of %.4f'
print(balance_report % (num_spoilers, spoiler_ratio, 1 - spoiler_ratio))

# Rough proxy for how often reviewers announce spoilers themselves: the number
# of reviews whose text mentions "spoiler" (case-insensitive), relative to the
# number of reviews flagged as spoilers. The numerator is counted over ALL
# reviews, not only the flagged ones, so this is an approximation.
mentions_spoiler = df_reviews['review_text'].str.contains('spoiler', case=False)
num_notice = mentions_spoiler.sum() / df_reviews['is_spoiler'].sum()
print('a rough estimation of how many users declare their reviews as spoilers: %.4f' % num_notice)

num of spoilers in all: 150856
ration of spoilers in all: 0.2631
a dummy classifier can achieve acc of 0.7369
a rough estimation of how many users declare their reviews as spoilers: 0.1752


In [4]:
# One shared GluonNLP spaCy tokenizer, created once and reused by every call below.
tokenizer = nlp.data.SpacyTokenizer('en')
def get_word_count(x):
    """Return how many tokens ``tokenizer`` produces for the text ``x``."""
    tokens = tokenizer(x)
    return len(tokens)

# Token count of every review text, computed by a pool of worker processes;
# `length_list` keeps the same order as the rows of `df_reviews`.
with mp.Pool() as pool:
    length_list = pool.map(get_word_count, df_reviews['review_text'])

In [8]:
# Average review length in tokens: total token count over all reviews divided
# by the number of reviews. `length_list` holds one int per review, so it is
# summed as a whole (indexing it with [0] yields a single int, which cannot be summed).
print('the average number of words in a review is %d' % \
      round(sum(length_list) / df_reviews.shape[0]))

In [9]:
# IMDB information about the movies, stored as one JSON object per line.
df_movies = pd.read_json('./data/IMDB_movie_details.json', lines=True)
# Parse the release dates into datetime values.
df_movies['release_date'] = pd.to_datetime(df_movies['release_date'], infer_datetime_format=True)
print(df_movies.shape)
df_movies.tail()

(1572, 7)


Unnamed: 0,duration,genre,movie_id,plot_summary,plot_synopsis,rating,release_date
1567,1h 53min,"[Sci-Fi, Thriller]",tt0289879,Evan Treborn grows up in a small town with his...,"In the year 1998, Evan Treborn (Ashton Kutcher...",7.7,2004-01-23
1568,1h 41min,[Drama],tt1723811,Brandon is a 30-something man living in New Yo...,"Brandon (Michael Fassbender) is a successful, ...",7.2,2012-01-13
1569,1h 46min,"[Action, Drama, History]",tt5013056,Evacuation of Allied soldiers from the British...,The film alternates between three different pe...,8.1,2017-07-21
1570,1h 33min,"[Comedy, Drama]",tt0104014/,"For a while now, beautiful 24-year-old Diana B...",,5.3,1992-02-21
1571,1h 32min,"[Drama, Thriller]",tt0114142/,"The marriage of David Burgess, a senior execut...",,4.0,1999-01-29


In [45]:
# Token count of every plot summary, computed by a pool of worker processes.
# The pool is bound to `pool`, not `mp`: binding it to `mp` would replace the
# `multiprocessing` module alias with a Pool instance and break every later
# `mp.Pool()` call. The result gets its own name so the per-review counts in
# `length_list` are not overwritten.
with mp.Pool() as pool:
    summary_length_list = pool.map(get_word_count, df_movies.plot_summary)
print('the average number of words in a plot summary is %d' % \
      round(sum(summary_length_list) / df_movies.shape[0]))

the average number of words in a plot summary is 120


In [10]:
# Join the two dataframes on movie id (left join: every review is kept).
# Some movie ids in the movie table carry a trailing '/' (e.g. 'tt0104014/'),
# which cannot match a review id without that suffix and would leave the
# movie columns empty for those reviews. Strip it on a copy of the movie
# table so `df_movies` itself is left untouched.
# NOTE: this rebinds `df_reviews`, so the cell is not idempotent -- reload the
# reviews before running it a second time.
df_reviews = df_reviews.merge(
    df_movies.assign(movie_id=df_movies['movie_id'].str.rstrip('/')),
    on="movie_id", how="left", suffixes=('_review', '_movie'),
)

In [11]:
# Columns needed for training; every other column of the merged frame is dropped.
columns_keep = ['is_spoiler', 'movie_id', 'plot_summary', 'review_summary', 'review_text',]
all_columns = list(df_reviews.columns)
for kept in columns_keep:
    # list.remove raises ValueError if an expected column is missing
    all_columns.remove(kept)
drop_columns = all_columns
# Prepend the review summary to the review body, then discard the
# now-redundant summary column.
df_train = (
    df_reviews
    .drop(columns=drop_columns)
    .assign(review_text=lambda frame: frame['review_summary'] + ' ' + frame['review_text'])
    .drop(columns=['review_summary'])
)
df_train.tail()

Unnamed: 0,is_spoiler,movie_id,review_text,plot_summary
573380,False,tt0478304,"Someone owes me 139 minutes of my life back, I...",The impressionistic story of a Texas family in...
573381,True,tt0948470,Didn't really do it for me I didn't really lik...,Peter Parker (Garfield) is an outcast high sch...
573382,False,tt1843866,Till the end of the road. Stonking! Now this i...,"For Steve Rogers, awakening after decades of s..."
573383,False,tt0369339,"Simply amazing For a ""hit-man"" movie I give th...",LA cabbie Max Durocher is the type of person w...
573384,False,tt1170358,As Awesome as the first part. I was satisfied ...,After successfully crossing over (and under) t...


## Save the processed data for future use

In [8]:
# Write the training frame to CSV without the row index.
df_train.to_csv(path_or_buf='./data/train.csv', index=False)