- The file loads movie data from data files stored on drive
- The movie details and reviews data-frames are formatted
- Required fields are combined in single data frame
- The strings are tokenized to form training data

In [0]:
!pip install -q gluonnlp
!pip install -q mxnet
!pip install PyDrive

[?25l[K     |█▎                              | 10kB 23.2MB/s eta 0:00:01[K     |██▋                             | 20kB 4.0MB/s eta 0:00:01[K     |███▉                            | 30kB 5.3MB/s eta 0:00:01[K     |█████▏                          | 40kB 5.4MB/s eta 0:00:01[K     |██████▌                         | 51kB 4.7MB/s eta 0:00:01[K     |███████▊                        | 61kB 5.1MB/s eta 0:00:01[K     |█████████                       | 71kB 5.6MB/s eta 0:00:01[K     |██████████▍                     | 81kB 6.0MB/s eta 0:00:01[K     |███████████▋                    | 92kB 6.4MB/s eta 0:00:01[K     |█████████████                   | 102kB 6.4MB/s eta 0:00:01[K     |██████████████▎                 | 112kB 6.4MB/s eta 0:00:01[K     |███████████████▌                | 122kB 6.4MB/s eta 0:00:01[K     |████████████████▉               | 133kB 6.4MB/s eta 0:00:01[K     |██████████████████▏             | 143kB 6.4MB/s eta 0:00:01[K     |███████████████████▍      

In [0]:
import json, csv, re, nltk

import pandas as pd
import numpy as np
import gluonnlp as nlp

from keras.preprocessing.text import Tokenizer

from google.colab import auth, drive
from oauth2client.client import GoogleCredentials

Accessing data files from google drive

- Open the link printed by the cell below, sign in, and paste the verification code into the prompt

In [0]:
# Mount Google Drive under /content/gdrive so the data files can be read by path.
# The call is interactive: it asks for an authorization code.
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


Parse the reviews data from json file. Remove duplicate review text and update review date format.

In [0]:
def parse_reviews_data(file_loc, random_state=None):
  """Load the reviews file into a shuffled, de-duplicated DataFrame.

  Args:
    file_loc: Path of a JSON-lines file (one review object per line) that
      contains at least the fields review_text, review_date and user_id.
    random_state: Optional seed forwarded to DataFrame.sample so the shuffle
      can be reproduced. The default (None) keeps the shuffle random.

  Returns:
    DataFrame with one row per distinct review_text (first occurrence kept),
    rows in shuffled order, review_date parsed to datetime and user_id
    stored as a pandas category.
  """
  reviews = (
      pd.read_json(file_loc, lines=True)
      .drop_duplicates('review_text')
      .sample(frac=1, random_state=random_state)
  )

  # pandas infers the date format itself; the infer_datetime_format keyword
  # is deprecated in pandas 2.x and is therefore not passed.
  reviews['review_date'] = pd.to_datetime(reviews['review_date'])
  reviews['user_id'] = reviews['user_id'].astype('category')

  return reviews

Parse the movie details from the JSON file and convert the release date to datetime.

In [0]:
def parse_movie_details_data(file_loc):
  """Load the movie details file into a DataFrame.

  Args:
    file_loc: Path of a JSON-lines file (one movie object per line) that
      contains at least the field release_date.

  Returns:
    DataFrame with one row per line of the file and release_date parsed to
    datetime.
  """
  details = pd.read_json(file_loc, lines=True)

  # pandas infers the date format itself; the infer_datetime_format keyword
  # is deprecated in pandas 2.x and is therefore not passed.
  details['release_date'] = pd.to_datetime(details['release_date'])

  return details

Cleaning the punctuations and tokenizing the sentences

In [0]:
def tokenize_text(sentences):
  """Fit a Keras Tokenizer on the given sentences and return it.

  The original version built and fitted the tokenizer but discarded it, so
  the call had no usable result. The fitted tokenizer is now returned.

  Args:
    sentences: Iterable of text strings passed to Tokenizer.fit_on_texts.

  Returns:
    The fitted Tokenizer (no vocabulary size limit, lower-casing enabled,
    splitting on single spaces, the listed punctuation filtered out).
  """
  tokenizer = Tokenizer(num_words=None,
                        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                        lower=True,
                        split=' ')
  tokenizer.fit_on_texts(sentences)

  return tokenizer

Join data sets to extract required fields.
Extracting field to prepare training data set

Data_type field:
- 1: only movie synopses
- 2: only movie summary
- Otherwise: Larger of synopses or summary

In [0]:
def merge_datasets(movie_details, user_reviews, id, suffixes, data_type=0):
  """Join reviews with movie details and build the sentence-pair training frame.

  Args:
    movie_details: DataFrame holding the join key plus the columns
      plot_synopsis and plot_summary.
    user_reviews: DataFrame holding the join key plus the columns movie_id,
      review_summary, review_text and is_spoiler.
    id: Column name(s) to join on. The name shadows the built-in `id`; it is
      kept so existing keyword callers continue to work.
    suffixes: Pair of suffixes for column names present in both frames
      (first for the reviews side, second for the movie details side).
    data_type: 1 -> sentence_1 is the plot synopsis; 2 -> sentence_1 is the
      plot summary; any other value -> the longer of the two (the summary
      wins ties).

  Returns:
    DataFrame with the columns movie_id, sentence_1, sentence_2 and label,
    sorted so that rows with label True come first. sentence_2 is the review
    summary and the review text joined by a single space.
  """
  # Left join: every review is kept, even when no movie details match.
  merged_ds = user_reviews.merge(movie_details, on=id, how="left", suffixes=suffixes)

  # str() is applied to every value, so a missing plot becomes the text 'nan'.
  synopses = pd.Series([str(synopsis) for synopsis in merged_ds['plot_synopsis']],
                       index=merged_ds.index, dtype=object)

  # Keep only the part of the summary that precedes the "Written by" credit.
  credit_line = re.compile(r'\s*Written by\s*\n')
  summaries = pd.Series([credit_line.split(str(plot), maxsplit=1)[0]
                         for plot in merged_ds['plot_summary']],
                        index=merged_ds.index, dtype=object)

  if data_type == 1:
    sentence_1 = synopses
  elif data_type == 2:
    sentence_1 = summaries
  else:
    # Vectorised "longer of the two": take the synopsis only where it is
    # strictly longer than the summary.
    synopsis_is_longer = synopses.str.len() > summaries.str.len()
    sentence_1 = synopses.where(synopsis_is_longer, summaries)

  train = pd.DataFrame({
      'movie_id': merged_ds['movie_id'],
      'sentence_1': sentence_1,
      'sentence_2': merged_ds['review_summary'] + ' ' + merged_ds['review_text'],
      'label': merged_ds['is_spoiler'],
  })

  return train.sort_values(by=['label'], ascending=False)

Data Preprocessing:
- Loading Reviews data
- Loading Movie details data
- Performing a left join on movie ID (every review is kept) to extract the user's review summary (or title), review detail and the plot synopsis

Note: This cell takes some time to execute because the text of every review row is processed.

In [0]:
details_file = '/content/gdrive/My Drive/Colab Notebooks/imdb-spoiler-dataset/IMDB_movie_details.json'
reviews_file = '/content/gdrive/My Drive/Colab Notebooks/imdb-spoiler-dataset/IMDB_reviews.json'

df_details = parse_movie_details_data(details_file)
df_reviews = parse_reviews_data(reviews_file)

# Build all three variants of the training set. Later cells preview and save
# each of them, so leaving any of these out makes those cells fail with a
# NameError on a fresh kernel.
df_training_data_synps = merge_datasets(df_details, df_reviews, "movie_id", ('_review','_movie'), 1)
df_training_data_sumry = merge_datasets(df_details, df_reviews, "movie_id", ('_review','_movie'), 2)
df_training_data_combo = merge_datasets(df_details, df_reviews, "movie_id", ('_review','_movie'))

Training Data:
- Movie ID
- Sentence 1: The movie plot synopsis, the plot summary, or the longer of the two (depending on `data_type`)
- Sentence 2: The review summary followed by the review text
- Label: Indicates if the review contains a spoiler or not

In [0]:
# Show how many rows carry each label value, then preview the first rows
# of the synopsis-based training set.
print(df_training_data_synps['label'].value_counts())
df_training_data_synps.head()

False    422529
True     150856
Name: label, dtype: int64


Unnamed: 0,movie_id,sentence_1,sentence_2,label
0,tt1568346,Note: this is an English-language adaptation o...,"Confusing, unless you have read the book first...",True
441542,tt2404435,In this remake of the 1960 film of the same na...,There Were Seven - They Weren't Magnificent Th...,True
179031,tt0408236,"Benjamin Barker (Johnny Depp), a skilled barbe...",this movie gave me chills... This movie made m...,True
441577,tt3498820,"In 1991, Bucky Barnes (Sebastian Stan), brainw...",One of the best Marvel movies This movie helps...,True
179040,tt0109830,The film begins with a feather falling to the ...,Life through a different view! This is a lovel...,True


In [0]:
# Preview the first rows of the summary-based training set
# (requires df_training_data_sumry to have been built by the preprocessing cell).
df_training_data_sumry.head()

In [0]:
# Preview the first rows of the combined (longer of synopsis/summary) training set
# (requires df_training_data_combo to have been built by the preprocessing cell).
df_training_data_combo.head()

Save each training data set to a CSV file

In [0]:
# Write the synopsis-based training set to Drive as CSV; index=False leaves
# the DataFrame index out of the file.
training_file_1_loc = '/content/gdrive/My Drive/Colab Notebooks/imdb-spoiler-dataset/movie_training_data_synps.csv'
df_training_data_synps.to_csv(training_file_1_loc, index=False)

In [0]:
# Write the summary-based training set to Drive as CSV; index=False leaves
# the DataFrame index out of the file.
# The file name previously ended in 'sumry..csv' (doubled dot); it now follows
# the naming of the other two training files. Any reader of this file must
# use the corrected name.
training_file_2_loc = '/content/gdrive/My Drive/Colab Notebooks/imdb-spoiler-dataset/movie_training_data_sumry.csv'
df_training_data_sumry.to_csv(training_file_2_loc, index=False)

In [0]:
# Write the combined training set to Drive as CSV; index=False leaves
# the DataFrame index out of the file.
training_file_3_loc = '/content/gdrive/My Drive/Colab Notebooks/imdb-spoiler-dataset/movie_training_data_combo.csv'
df_training_data_combo.to_csv(training_file_3_loc, index=False)

Converting text to words list

In [0]:
# Text columns that are passed to convert_sentence_to_vector for conversion.
target_cols = ['sentence_1','sentence_2']

In [0]:
# Ordered (pattern, replacement) rules used by text_to_word_list. The order is
# significant: contractions are expanded before the bare apostrophe is removed,
# and "-" is padded with spaces before the "e - mail" rule runs.
# Note on the first rule: inside the character class, "+-=" is a range that
# also keeps ":", ";" and "<"; the later ":" rule relies on ":" surviving.
_CLEANUP_RULES = [(re.compile(pattern), replacement) for pattern, replacement in (
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # Was r"\0s", which matches a NUL character followed by "s" and therefore
    # never fired; the rule is meant to turn e.g. "1990s" into "1990".
    (r"0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
)]


def text_to_word_list(text):
    """Normalise a piece of text and split it into a list of words.

    The input is converted with str() and lower-cased, the cleanup rules are
    applied in order (character filtering, contraction expansion, spacing
    around selected symbols, a few abbreviation fixes), and the result is
    split on whitespace.

    Args:
        text: Value to tokenize; any object is accepted because str() is
            applied first (None becomes the word "none").

    Returns:
        List of word strings; empty when the text contains no tokens.
    """
    cleaned = str(text).lower()

    for pattern, replacement in _CLEANUP_RULES:
        cleaned = pattern.sub(replacement, cleaned)

    return cleaned.split()

In [0]:
def convert_sentence_to_vector(data_set, target_cols):
  """Replace the text in the target columns by lists of vocabulary indices.

  The frame is modified in place and nothing is returned. The function reads
  and updates names that must already exist at module level and are not
  defined in this block: `stops`, `word_to_vec` (its `.vocab` is consulted),
  `vocab` (word -> index mapping) and `inverse_vocab` (index -> word list).

  Args:
    data_set: DataFrame whose cells in target_cols hold the text to convert.
    target_cols: Names of the columns to convert.
  """
  for row_label, record in data_set.iterrows():
    for column in target_cols:
      word_ids = []

      for token in text_to_word_list(record[column]):
        # Tokens that are stop words without an entry in word_to_vec.vocab
        # are skipped; everything else is kept.
        if token not in stops or token in word_to_vec.vocab:
          # First sighting of a token: give it the next free index.
          if token not in vocab:
            vocab[token] = len(inverse_vocab)
            inverse_vocab.append(token)

          word_ids.append(vocab[token])

      # Store the index list in place of the original text.
      data_set.at[row_label, column] = word_ids

In [0]:
# Convert the text columns of the training frame to word-index lists (in place).
# NOTE(review): df_training_data is not defined in any visible cell — only
# df_training_data_synps / _sumry / _combo are; confirm which frame is meant.
# The function also needs stops, word_to_vec, vocab and inverse_vocab to exist.
convert_sentence_to_vector(df_training_data, target_cols)

Save training data to drive

In [0]:
# Write the converted training frame to Drive as CSV; index=False leaves the
# DataFrame index out of the file.
# NOTE(review): df_training_data is not defined in any visible cell — confirm its source.
training_file_loc = '/content/gdrive/My Drive/Colab Notebooks/imdb-spoiler-dataset/movie_training_data.csv'
df_training_data.to_csv(training_file_loc, index=False)