In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
import re

In [2]:
# Load the raw IMDB reviews; lines=True reads the file as JSON Lines
# (one JSON object per line).
df = pd.read_json(
    path_or_buf='../data/IMDB_reviews.json',
    lines=True,
)

In [5]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"


In [None]:
# Balance the two classes: keep every spoiler review and draw an equally
# sized subset of the non-spoiler reviews.
spoilers = df.loc[df['is_spoiler'].eq(True)]
non_spoilers = df.loc[df['is_spoiler'].eq(False)]

# Sample without replacement; the fixed random_state keeps the subset
# reproducible between runs.
non_spoilers_downsampled = resample(
    non_spoilers,
    n_samples=len(spoilers),
    replace=False,
    random_state=42,
)

# Spoiler rows come first, followed by the sampled non-spoiler rows.
df_balanced = pd.concat([spoilers, non_spoilers_downsampled])

# Class counts after balancing.
df_balanced['is_spoiler'].value_counts()

is_spoiler
True     150924
False    150924
Name: count, dtype: int64

In [7]:
# Remove the identifier and date columns; they are not used from here on.
df_balanced = df_balanced.drop(
    ['movie_id', 'user_id', 'review_date'],
    axis='columns',
)

In [9]:
# Clean the text
def clean_text(text):
    """Normalize a review string: keep ASCII letters only, lower-cased.

    Every character that is not an ASCII letter (a-z, A-Z) is replaced by a
    space, one-for-one, and the result is lower-cased. Runs of spaces are
    not collapsed and the string is not stripped.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. None or NaN for a missing
        review) are returned unchanged instead of raising a TypeError
        from ``re.sub``, so missing entries can still be removed with
        ``dropna`` afterwards.

    Returns
    -------
    str
        The cleaned text, or the input itself when it is not a string.
    """
    # Guard: re.sub only accepts str/bytes; missing values pass through.
    if not isinstance(text, str):
        return text
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    return letters_only.lower()

# Run clean_text over every review, element by element.
cleaned_reviews = df_balanced['review_text'].map(clean_text)
df_balanced['review_text'] = cleaned_reviews


In [10]:
# Discard every row that has a missing value in any remaining column.
# NOTE(review): rows are removed after balancing, so the class counts may
# no longer be exactly equal afterwards — confirm with value_counts().
df_balanced = df_balanced.dropna(axis=0, how='any')

In [11]:
# Preview the first five rows of the preprocessed frame.
df_balanced.head(n=5)

Unnamed: 0,is_spoiler,review_text,rating,review_summary
0,True,in its oscar year shawshank redemption writt...,10,A classic piece of unforgettable film-making.
1,True,the shawshank redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,True,i believe that this film is the best story eve...,8,The best story ever told on film
3,True,yes there are spoilers here this film has ...,10,Busy dying or busy living?
4,True,at the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"


In [12]:
# Write the preprocessed reviews to CSV; index=False leaves the row index
# out of the file.
df_balanced.to_csv(
    path_or_buf='../data/IMDB_reviews_preprocessed.csv',
    index=False,
)