# Data Preparation

In [4]:
import pandas as pd # load and manipulate data
import numpy as np # calculation
import matplotlib.pyplot as plt # visualize data
from sklearn.preprocessing import OneHotEncoder # To one-hot encode variables
import category_encoders as ce # To target encode high cardinality variables

## Load raw dataset

In [27]:
# Read the raw export into `demand`. product_id is forced to str so the long
# numeric identifiers stay as text.
# NOTE(review): product_name renders as mojibake in the preview output of this
# notebook -- confirm that latin1 is really the file's encoding.
demand = pd.read_csv(
    'shopee_raw_data.csv',
    dtype={'product_id': str},
    encoding='latin1',
)

## Overview

In [28]:
# Preview the first five rows of the raw frame.
demand.head(n=5)

Unnamed: 0,date,product_id,product_name,product_category,brand,traffic,impressions,payment,revenue,product_ad_spend,...,comment_received,product_rating,high_rating,high_discount,high_comment,avg_category_rate,avg_category_comment,week_of_month,wday,conversion_rate
0,5/1/2024,10070408774,BÃÂ¬nh ÃÂa D?ng 0.75L - Nh?p kh?u chÃÂ­nh ...,6. Other Melitta products,Melitta,0,0,0,0,0,...,0,0.0,0,0,0,4.9,8,1,3,0.0
1,5/2/2024,10070408774,BÃÂ¬nh ÃÂa D?ng 0.75L - Nh?p kh?u chÃÂ­nh ...,6. Other Melitta products,Melitta,0,0,0,0,0,...,0,0.0,0,0,0,4.9,8,1,4,0.0
2,5/3/2024,10070408774,BÃÂ¬nh ÃÂa D?ng 0.75L - Nh?p kh?u chÃÂ­nh ...,6. Other Melitta products,Melitta,0,0,0,0,0,...,0,0.0,0,0,0,4.9,8,1,5,0.0
3,5/4/2024,10070408774,BÃÂ¬nh ÃÂa D?ng 0.75L - Nh?p kh?u chÃÂ­nh ...,6. Other Melitta products,Melitta,0,0,0,0,0,...,0,0.0,0,0,0,4.9,8,1,6,0.0
4,5/5/2024,10070408774,BÃÂ¬nh ÃÂa D?ng 0.75L - Nh?p kh?u chÃÂ­nh ...,6. Other Melitta products,Melitta,0,0,0,0,0,...,0,0.0,0,0,0,4.9,8,1,7,0.0


## Standardize data types

In [29]:
# Print the per-column summary (non-null counts and dtypes) to see which
# columns need a type conversion.
demand.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28034 entries, 0 to 28033
Data columns (total 40 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   date                       28034 non-null  object 
 1   product_id                 28034 non-null  object 
 2   product_name               28034 non-null  object 
 3   product_category           28034 non-null  object 
 4   brand                      28034 non-null  object 
 5   traffic                    28034 non-null  int64  
 6   impressions                28034 non-null  int64  
 7   payment                    28034 non-null  int64  
 8   revenue                    28034 non-null  int64  
 9   product_ad_spend           28034 non-null  int64  
 10  shop_ad_spend              28034 non-null  int64  
 11  auto_ad_spend              28034 non-null  int64  
 12  run_shop_ad                28034 non-null  int64  
 13  run_product_ad             28034 non-null  int

In [30]:
# Convert 'date' to datetime with an explicit month/day/year format. The raw
# values look like "5/1/2024", which is ambiguous (May 1 vs. Jan 5) unless the
# format is pinned; pinning it also makes any row in another layout fail loudly
# instead of being parsed differently.
demand['date'] = pd.to_datetime(demand['date'], format='%m/%d/%Y')

# Sort by product_id first, then by date, so each product's rows are
# contiguous and in chronological order.
demand = demand.sort_values(by=['product_id', 'date'])

In [31]:
def check_unique_values(df):
    """Count the distinct non-null values in every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, in column order, with the columns
        "Column Name" and "Unique Value Count".
    """
    # DataFrame.nunique() works per column position, so a frame with repeated
    # column labels still yields one correct row per column (looking columns up
    # by label would return a sub-frame for a duplicated label).
    counts = df.nunique()
    return pd.DataFrame({
        "Column Name": list(counts.index),
        "Unique Value Count": counts.tolist(),
    })

# Show the per-column distinct-value counts for the demand frame.
check_unique_values(demand)

Unnamed: 0,Column Name,Unique Value Count
0,date,214
1,product_id,131
2,product_name,133
3,product_category,20
4,brand,13
5,traffic,56
6,impressions,97
7,payment,12
8,revenue,1185
9,product_ad_spend,567


In [32]:
def check_row_misalignment(df, col1, col2):
    """Find the ``col1`` values that are paired with more than one ``col2`` value.

    Groups ``df`` by ``col1``, counts the distinct ``col2`` values in each
    group, and keeps only the groups whose count exceeds one.

    Returns a DataFrame with the columns ``col1`` and ``col2``, where ``col2``
    holds the distinct-value count for that group.
    """
    distinct_per_key = df.groupby(col1)[col2].nunique()
    counts = distinct_per_key.reset_index()
    has_conflict = counts[col2] > 1
    return counts[has_conflict]

# List the product_ids that appear with more than one product_name.
misaligned_rows = check_row_misalignment(demand, 'product_id', 'product_name')
# A bare last expression uses the notebook's rich table display instead of
# print()'s plain-text dump.
misaligned_rows

     product_id  product_name
43  15765129329             2
76  21382736254             2
81  22150796991             2


In [None]:
# product_id is read as str (dtype passed to read_csv), so the IDs to look up
# must be strings as well: isin() does not match the string "15765129329"
# against the integer 15765129329, and integer literals would select no rows.
product_ids_to_filter = ['15765129329', '21382736254', '22150796991']
filtered_by_product_id = demand[demand['product_id'].isin(product_ids_to_filter)]