In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("./Reviews.csv")
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
# convert str into ID
df['Product_ID'] = pd.Categorical(df['ProductId'], categories=df['ProductId'].unique()).codes
df['User_ID'] = pd.Categorical(df['UserId'], categories=df['UserId'].unique()).codes
df['Time_ID'] = pd.Categorical(df['Time'], categories=df['Time'].unique()).codes
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Product_ID,User_ID,Time_ID
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,0,0,0
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,1,1,1
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,2,2,2
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,3,3,3
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,4,4,4


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

stemmer=SnowballStemmer("english")
stop_words=stopwords.words('english')

In [5]:
# Clean Text
df['CleanedText'] = df['Text'].replace(to_replace=r'@\S+',value="",regex=True)
df["CleanedText"] = df['CleanedText'].replace(to_replace=r'[^A-Za-z0-9]+',value=" ",regex=True)
df["CleanedText"] = df["CleanedText"].apply(lambda x: x.split())
df["CleanedText"] = df['CleanedText'].apply(lambda x: [item for item in x if item not in stop_words])
df['CleanedText'] = df['CleanedText'].apply(lambda x: [stemmer.stem(w) for w in x])
df['CleanedText'] = df['CleanedText'].apply(' '.join)
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Product_ID,User_ID,Time_ID,CleanedText
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,0,0,0,i bought sever vital can dog food product foun...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,1,1,1,product arriv label jumbo salt peanut peanut a...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,2,2,2,this confect around centuri it light pillowi c...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,3,3,3,if look secret ingredi robitussin i believ i f...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,4,4,4,great taffi great price there wide assort yumm...


In [6]:
# select specifical columns and drop the duplicates rows
processed_df = df[['Product_ID', 'User_ID', 'Time_ID', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'CleanedText', 'Score']].copy()
processed_df = processed_df.drop_duplicates(subset={"Product_ID","User_ID","Time_ID","CleanedText"}, keep='first', inplace=False)
processed_df.head()

Unnamed: 0,Product_ID,User_ID,Time_ID,HelpfulnessNumerator,HelpfulnessDenominator,CleanedText,Score
0,0,0,0,1,1,i bought sever vital can dog food product foun...,5
1,1,1,1,0,0,product arriv label jumbo salt peanut peanut a...,1
2,2,2,2,1,1,this confect around centuri it light pillowi c...,4
3,3,3,3,3,3,if look secret ingredi robitussin i believ i f...,2
4,4,4,4,0,0,great taffi great price there wide assort yumm...,5


In [7]:
# See how many rows have been dropped
print(processed_df.shape)
print(df.shape)

(567107, 7)
(568454, 14)


In [8]:
# split rows according to their 'Score'
from sklearn.utils import shuffle

df_diff_score = [0] * 5
for i in range(5):
    df_diff_score[i] = processed_df[processed_df['Score'] == i+1].copy()
    df_diff_score[i] = shuffle(df_diff_score[i])
    print(df_diff_score[i].shape)

(51948, 7)
(29751, 7)
(42546, 7)
(80536, 7)
(362326, 7)


In [9]:
# generate train_set and test_set
train_set_number = 20000
test_set_number = 5000

train_set = pd.concat([df_diff_score[i][:train_set_number] for i in range(5)])
test_set = pd.concat([df_diff_score[i][train_set_number:train_set_number+test_set_number] for i in range(5)])

In [12]:
# Save them
train_set.to_csv(r'data/train_set.csv', index=False)
test_set.to_csv(r'data/test_set.csv', index=False)

In [13]:
# Local dataset
local_train_set_number = 500
local_test_set_number = 100

local_train_set = pd.concat([df_diff_score[i][:local_train_set_number] for i in range(5)])
local_test_set = pd.concat([df_diff_score[i][local_train_set_number:local_train_set_number+local_test_set_number] for i in range(5)])

local_train_set.to_csv(r'data/local_train_set.csv', index=False)
local_test_set.to_csv(r'data/local_test_set.csv', index=False)