We import all the packages that we needs to use.

In [1]:
import pandas as pd
import re
import numpy as np   

In [3]:
# Import data
data = pd.read_csv("movie_review_info.csv")
data.head(10)

Unnamed: 0,userId,reviewDate,reviewScore,reviewTitle,userReview,movieId
0,2509775,26 November 2003,[<span>10</span>],"[<a class=""title"" href=""/review/rw0349418/?ref...","[<div class=""text show-more__control clickable...",111161
1,1898687,10 February 2006,[<span>10</span>],"[<a class=""title"" href=""/review/rw1288098/?ref...","[<div class=""text show-more__control"">In its O...",111161
2,16161013,24 July 2010,[<span>10</span>],"[<a class=""title"" href=""/review/rw2284594/?ref...","[<div class=""text show-more__control clickable...",111161
3,1005460,8 February 2001,[],"[<a class=""title"" href=""/review/rw0348718/?ref...","[<div class=""text show-more__control"">I have n...",111161
4,997166,27 August 2002,[<span>10</span>],"[<a class=""title"" href=""/review/rw0349147/?ref...","[<div class=""text show-more__control clickable...",111161
5,131017397,2 April 2021,[<span>9</span>],"[<a class=""title"" href=""/review/rw6770639/?ref...","[<div class=""text show-more__control"">If you l...",111161
6,265899,21 November 2005,[<span>10</span>],"[<a class=""title"" href=""/review/rw1221355/?ref...","[<div class=""text show-more__control clickable...",111161
7,16117882,18 February 2008,[<span>10</span>],"[<a class=""title"" href=""/review/rw1822343/?ref...","[<div class=""text show-more__control clickable...",111161
8,129753872,23 February 2021,[<span>9</span>],"[<a class=""title"" href=""/review/rw6627363/?ref...","[<div class=""text show-more__control"">You have...",111161
9,146338622,4 December 2021,[<span>9</span>],"[<a class=""title"" href=""/review/rw7613597/?ref...","[<div class=""text show-more__control"">This is ...",111161


In [4]:
# Assign each column
score = data['reviewScore']
title = data['reviewTitle']
review = data['userReview']
movieid = data['movieId']

# Initialize the movie info list, first fill in all blanks with 0.
temp_list = [[0] * 6 for j in range(24188)]
data_new = pd.DataFrame(temp_list, columns = ('userId', 'reviewDate', 'reviewScore', 'reviewTitle', 'userReview', 'movieId'))

data_new['userId'] = data['userId'] 
data_new['reviewDate'] = data['reviewDate']

# Extract scores using regular expressions
for idx, s in enumerate(score):
    if len(re.findall('[0-9]+', s)) != 0:
        data_new['reviewScore'].iloc[idx] = re.findall('[0-9]+', s)[0]
    else:
        data_new['reviewScore'].iloc[idx] = np.nan
        
# Extract titles using regular expressions    
for idx, t in enumerate(title):
    if len(re.findall('<[\w\W]+> ([\w\W]+)',t.split('\n')[0])) != 0:
        data_new['reviewTitle'].iloc[idx] = re.findall('<[\w\W]+> ([\w\W]+)',t.split('\n')[0])[0]
    else:
        data_new['reviewTitle'].iloc[idx] = np.nan

# Extract reviews using regular expressions 
for idx, r in enumerate(review):
    if len(re.findall('<[\w\W]+>([\w\W]+)</div>',r.replace('<br/>', ''))) != 0:
        data_new['userReview'].iloc[idx] = re.findall('<[\w\W]+>([\w\W]+)</div>',r.replace('<br/>', ''))[0]
    else:
        data_new['userReview'].iloc[idx] = np.nan

# Set movieId into 7 digits
for idx, mid in enumerate(movieid):
    data_new['movieId'].iloc[idx] = str(mid).zfill(7)
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [5]:
# Preview data
data_new 

Unnamed: 0,userId,reviewDate,reviewScore,reviewTitle,userReview,movieId
0,2509775,26 November 2003,10,Tied for the best movie I have ever seen,Why do I want to write the 234th comment on Th...,0111161
1,1898687,10 February 2006,10,A classic piece of unforgettable film-making.,"In its Oscar year, Shawshank Redemption (writt...",0111161
2,16161013,24 July 2010,10,Some birds aren't meant to be caged.,The Shawshank Redemption is written and direct...,0111161
3,1005460,8 February 2001,,Prepare to be moved,I have never seen such an amazing film since I...,0111161
4,997166,27 August 2002,10,Shawshank Redeems Hollywood,"Can Hollywood, usually creating things for ent...",0111161
...,...,...,...,...,...,...
24183,6729,17 June 2003,9,Changed my mind ...,This post refers to the extended DVD special e...,0099348
24184,2349843,9 May 2003,10,Read The Book,This is specifically for Tom Hull...did you ha...,0099348
24185,1945054,4 December 2002,10,One of the best films in the 90's,Dances with wolves is a very epic film.I must ...,0099348
24186,1651782,29 November 2002,10,Exquisite,They don't make 'em like this anymore. Marvelo...,0099348


In [6]:
# Data preprocess

# Drop rows or columns with NaN value more than 30%
data_new.dropna(thresh = int(data.shape[0]*0.7), axis = 1, inplace = True)
data_new.dropna(thresh = int(data.shape[1]*0.7), axis = 0, inplace = True)

# Drop duplicate data
data_new.drop_duplicates() 

# Propagate last valid observation forward to next valid backfill
dl = data_new.columns.tolist()
for col in range(data_new.shape[1]):
    data_new[dl[col]].fillna(method = 'pad', inplace = True)

In [7]:
# Export data into csv file
data_new.to_csv("preprocessed_data.csv")

data_new

Unnamed: 0,userId,reviewDate,reviewScore,reviewTitle,userReview,movieId
0,2509775,26 November 2003,10,Tied for the best movie I have ever seen,Why do I want to write the 234th comment on Th...,0111161
1,1898687,10 February 2006,10,A classic piece of unforgettable film-making.,"In its Oscar year, Shawshank Redemption (writt...",0111161
2,16161013,24 July 2010,10,Some birds aren't meant to be caged.,The Shawshank Redemption is written and direct...,0111161
3,1005460,8 February 2001,10,Prepare to be moved,I have never seen such an amazing film since I...,0111161
4,997166,27 August 2002,10,Shawshank Redeems Hollywood,"Can Hollywood, usually creating things for ent...",0111161
...,...,...,...,...,...,...
24183,6729,17 June 2003,9,Changed my mind ...,This post refers to the extended DVD special e...,0099348
24184,2349843,9 May 2003,10,Read The Book,This is specifically for Tom Hull...did you ha...,0099348
24185,1945054,4 December 2002,10,One of the best films in the 90's,Dances with wolves is a very epic film.I must ...,0099348
24186,1651782,29 November 2002,10,Exquisite,They don't make 'em like this anymore. Marvelo...,0099348
