In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


[nltk_data] Downloading package stopwords to C:\Users\M4
[nltk_data]     Tech\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [3]:
df = pd.read_csv('../data/Reviews.csv')
print(f"Original shape: {df.shape}")
df.head()


Original shape: (568454, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
df = df[df['Score'] != 3]


In [5]:
df['Sentiment'] = df['Score'].apply(lambda x: 1 if x > 3 else 0)
df[['Score', 'Sentiment']].head()


Unnamed: 0,Score,Sentiment
0,5,1
1,1,0
2,4,1
3,2,0
4,5,1


In [6]:
df.dropna(subset=['Text'], inplace=True)
df.drop_duplicates(subset=['UserId', 'Time', 'Text'], inplace=True)


In [7]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)              # Remove punctuation/numbers
    text = re.sub(r'\s+', ' ', text).strip()             # Remove extra spaces
    tokens = text.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)


In [8]:
df['clean_text'] = df['Text'].apply(clean_text)
df[['Text', 'clean_text']].sample(3)


Unnamed: 0,Text,clean_text
104052,I ordered 2 Reverse Variegated Spider Plants o...,order revers varieg spider plant may arriv fou...
64591,Nothing short of HEAVENLY. I've even made fri...,noth short heavenli ive even made fri rice use...
281865,Jamaica Me Crazy is my new favorite. The aroma...,jamaica crazi new favorit aroma alon make smil...


In [9]:
print(f"Processed shape: {df.shape}")


Processed shape: (364133, 12)


In [11]:
df.to_csv('../data/processed/reviews_clean.csv', index=False)


In [12]:
X = df['clean_text']
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_df = pd.DataFrame({'text': X_train, 'label': y_train})
test_df = pd.DataFrame({'text': X_test, 'label': y_test})


In [13]:
train_df.to_csv('../data/processed/train.csv', index=False)
test_df.to_csv('../data/processed/test.csv', index=False)
