In [1]:
import pandas as pd
df = pd.read_csv('../data/extended_dataset/Reviews.csv')
df.tail()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...
568453,568454,B001LR2CU2,A3LGQPJCZVL9UC,srfell17,0,0,5,1338422400,Great Honey,"I am very satisfied ,product is as advertised,..."


In [13]:
# Check available scores
print("Unique Score values:", df['Score'].unique())
print(df['Score'].value_counts())


Unique Score values: [5 1 4 2 3]
Score
5    363122
4     80655
1     52268
3     42640
2     29769
Name: count, dtype: int64


In [14]:
print(df.columns.tolist())


['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text']


View & Inspect the Data

In [15]:


# Check last rows
print(df.tail())
# Summary statistics for numeric columns
print(df.describe())

# Data summary (column names, types, missing values)
print(df.info())




            Id   ProductId          UserId              ProfileName  \
568449  568450  B001EO7N10  A28KG5XORO54AY         Lettie D. Carter   
568450  568451  B003S1WTCU  A3I8AFVPEE8KI5                R. Sawyer   
568451  568452  B004I613EE  A121AA1GQV751Z            pksd "pk_007"   
568452  568453  B004I613EE   A3IBEVCTXKNOH  Kathy A. Welch "katwel"   
568453  568454  B001LR2CU2  A3LGQPJCZVL9UC                 srfell17   

        HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
568449                     0                       0      5  1299628800   
568450                     0                       0      2  1331251200   
568451                     2                       2      5  1329782400   
568452                     1                       1      5  1331596800   
568453                     0                       0      5  1338422400   

                                   Summary  \
568449                 Will not do without   
568450                        

Handle Missing Data

In [16]:
print(df.isnull().sum())


Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64


View datas

In [17]:
print(df.columns.tolist())



['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text']


Drop Unnecessary Columns/Rows

In [18]:
df = df.dropna(subset=['Score', 'Text'])

Remove Duplicates

In [19]:
df = df.drop_duplicates()

View missing datas

In [20]:
print(df.isnull().sum())


Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64


In [21]:
df=df[['Score', 'Text']]

In [22]:
import re
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources (only first time)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean and normalize text
def clean_text(text):
    # 1. Lowercase
    text = text.lower()
    
    # 2. Remove URLs and HTML tags
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)   # URLs
    text = re.sub(r'<.*?>', '', text)                    # HTML
    
    # 3. Remove emojis and non-alphabetic characters
    text = re.sub(r'[^\w\s]', '', text)                  # punctuation
    text = re.sub(r'[\d_]', '', text)                    # numbers and underscores
    text = re.sub(r'\s+', ' ', text).strip()             # extra spaces
    
    # 4. Tokenize
    tokens = nltk.word_tokenize(text)
    
    # 5. Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # 6. Lemmatize (convert words to base form)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # 7. Rejoin tokens
    cleaned_text = ' '.join(tokens)
    return cleaned_text


# Apply cleaning to your dataset
df['Cleaned_text'] = df['Text'].astype(str).apply(clean_text)

# ✅ Remove any remaining numbers (like "2" in "2 good product")
df['Cleaned_text'] = df['Cleaned_text'].apply(lambda x: re.sub(r'\d+', '', x))

# 8. Filter out reviews with <3 words or extremely long text (>150 words)
df = df[df['Cleaned_text'].apply(lambda x: len(x.split()) >= 3)]
df = df[df['Cleaned_text'].apply(lambda x: len(x.split()) <= 150)]

print("⭐ Remaining samples after cleaning:", len(df))
df[['Cleaned_text', 'Score']].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91940\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91940\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\91940\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91940\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\91940\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


⭐ Remaining samples after cleaning: 556354


Unnamed: 0,Cleaned_text,Score
0,bought several vitality canned dog food produc...,5
1,product arrived labeled jumbo salted peanutsth...,1
2,confection around century light pillowy citrus...,4
3,looking secret ingredient robitussin believe f...,2
4,great taffy great price wide assortment yummy ...,5


In [24]:
df=df[['Score', 'Cleaned_text']]

In [25]:
print(df.head())


   Score                                       Cleaned_text
0      5  bought several vitality canned dog food produc...
1      1  product arrived labeled jumbo salted peanutsth...
2      4  confection around century light pillowy citrus...
3      2  looking secret ingredient robitussin believe f...
4      5  great taffy great price wide assortment yummy ...


In [26]:
df.to_csv("../data/cleaned_dataset/cleaned_data.csv", index=False)

In [35]:
# Check available scores
print("Unique Score values:", df['Score'].unique())
print(df['Score'].value_counts())


Unique Score values: [5 1 4 2 3]
Score
5    363122
4     80655
1     52268
3     42640
2     29769
Name: count, dtype: int64


In [None]:
import pandas as pd
df = pd.read_csv('../data/cleaned_dataset/cleaned_data.csv')
df.head()