In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import Dependencies

In [25]:
import os
import pandas as pd
import numpy as np
import sklearn
import matplotlib as plt
import seaborn as sns
from IPython import display

## Data Collection & Preprocessing

Combine reviews 0-end

In [35]:
import pandas as pd

file_paths = [
    '/content/drive/MyDrive/reviews_0-250.csv',
    '/content/drive/MyDrive/reviews_250-500.csv',
    '/content/drive/MyDrive/reviews_500-750.csv',
    '/content/drive/MyDrive/reviews_750-1250.csv',
    '/content/drive/MyDrive/reviews_1250-end.csv'
]

columns_to_keep = [
    "rating",
    "is_recommended",
    "helpfulness",
    "total_feedback_count",
    "total_neg_feedback_count",
    "total_pos_feedback_count",
    "review_text",
    "review_title",
    "product_id",
    "product_name",
    "brand_name",
    "price_usd"
]

dataframes = []

for path in file_paths:
    try:
        df = pd.read_csv(path, usecols=columns_to_keep, na_values='?')
        dataframes.append(df)
    except Exception as e:
        print(f"Error reading {path}: {e}")

if dataframes:
    combined_data = pd.concat(dataframes, axis=0, ignore_index=True)

    print("Combined dataset shape:", combined_data.shape)

    output_path = '/content/drive/MyDrive/combined_reviews_filtered.csv'
    try:
        combined_data.to_csv(output_path, index=False)
        print(f"Filtered dataset saved to {output_path}")
    except Exception as e:
        print(f"Error saving the filtered dataset: {e}")
else:
    print("No data was loaded. Please check the file paths.")


Combined dataset shape: (1094411, 12)
Filtered dataset saved to /content/drive/MyDrive/combined_reviews_filtered.csv


In [36]:
combined_data.describe()

Unnamed: 0,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,price_usd
count,1094411.0,926423.0,532819.0,1094411.0,1094411.0,1094411.0,1094411.0
mean,4.299158,0.839962,0.767782,4.177126,0.8948695,3.282257,49.00838
std,1.149444,0.366642,0.317164,22.71524,5.288943,19.67482,40.04338
min,1.0,0.0,0.0,0.0,0.0,0.0,3.0
25%,4.0,1.0,0.652174,0.0,0.0,0.0,25.0
50%,5.0,1.0,0.928571,0.0,0.0,0.0,39.0
75%,5.0,1.0,1.0,3.0,1.0,3.0,62.0
max,5.0,1.0,1.0,5464.0,1159.0,5050.0,1900.0


In [38]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094411 entries, 0 to 1094410
Data columns (total 12 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   rating                    1094411 non-null  int64  
 1   is_recommended            926423 non-null   float64
 2   helpfulness               532819 non-null   float64
 3   total_feedback_count      1094411 non-null  int64  
 4   total_neg_feedback_count  1094411 non-null  int64  
 5   total_pos_feedback_count  1094411 non-null  int64  
 6   review_text               1092967 non-null  object 
 7   review_title              783753 non-null   object 
 8   product_id                1094411 non-null  object 
 9   product_name              1094411 non-null  object 
 10  brand_name                1094411 non-null  object 
 11  price_usd                 1094411 non-null  float64
dtypes: float64(3), int64(4), object(5)
memory usage: 100.2+ MB


In [39]:
combined_data.nunique()

Unnamed: 0,0
rating,5
is_recommended,2
helpfulness,3767
total_feedback_count,676
total_neg_feedback_count,259
total_pos_feedback_count,590
review_text,969419
review_title,364104
product_id,2351
product_name,2334


In [40]:
combined_data.isna().sum()

Unnamed: 0,0
rating,0
is_recommended,167988
helpfulness,561592
total_feedback_count,0
total_neg_feedback_count,0
total_pos_feedback_count,0
review_text,1444
review_title,310658
product_id,0
product_name,0


## Handling the Missing Values

In [41]:
# Impute missing values for numerical columns (e.g., mean, median)
df['is_recommended'].fillna(df['is_recommended'].mode()[0], inplace=True)  # Impute with mode
df['helpfulness'].fillna(df['helpfulness'].median(), inplace=True)  # Impute with median

# Impute missing values for categorical columns (e.g., "No Review", "No Title")
df['review_text'].fillna('No Review', inplace=True)
df['review_title'].fillna('No Title', inplace=True)

# Drop rows with missing values in critical columns
df.dropna(subset=['review_text', 'review_title'], inplace=True)  # Drop rows where review_text or review_title is missing

# Alternatively, drop rows with missing values in all columns
# df.dropna(inplace=True)  # This will drop any row with at least one missing value


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['is_recommended'].fillna(df['is_recommended'].mode()[0], inplace=True)  # Impute with mode
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['helpfulness'].fillna(df['helpfulness'].median(), inplace=True)  # Impute with median
The behavior will change in pandas 3.0. This inpl

Check the number of missing values in each column

In [42]:
df.isna().sum()

Unnamed: 0,0
rating,0
is_recommended,0
helpfulness,0
total_feedback_count,0
total_neg_feedback_count,0
total_pos_feedback_count,0
review_text,0
review_title,0
product_id,0
product_name,0


Export cleaned and processed data to a new CSV file

In [43]:
output_path = '/content/drive/MyDrive/cleaned_reviews.csv'

try:
    df.to_csv(output_path, index=False)  # index=False to avoid writing row indices
    print(f"Cleaned dataset saved to {output_path}")
except Exception as e:
    print(f"Error saving the cleaned dataset: {e}")

Cleaned dataset saved to /content/drive/MyDrive/cleaned_reviews.csv
