# 1) Import packages and data

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
%matplotlib inline

In [2]:
import warnings

warnings.filterwarnings('ignore')

In [3]:
# get data from drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd /content/drive/My Drive/Colab Notebooks/

/content/drive/My Drive/Colab Notebooks


In [5]:
# Load the raw data file. The path is relative, so it resolves against the
# working directory selected by the %cd cell above.
df = pd.read_csv('Cocoon_official_15042023.csv')

In [6]:
# Overview of the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3826 entries, 0 to 3825
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Name           3694 non-null   object
 1   Price          3619 non-null   object
 2   Sales          3661 non-null   object
 3   URL            3826 non-null   object
 4   Rating         3586 non-null   object
 5   Rating record  3574 non-null   object
 6   Date           3826 non-null   object
 7   Discount       563 non-null    object
dtypes: object(8)
memory usage: 239.2+ KB


# 2) Simple processing on null and unnecessary data

1) Eliminate 'URL' column as I will not be needing it in the future

In [7]:
# 'URL' is not used by any later step, so remove it up front.
df = df.drop(columns='URL')

2) Drop rows with a null value in "Name" or "Sales". The "Sales" filtering code used later (step 4) only works on non-null data.

In [8]:
# A row is discarded when either 'Name' or 'Sales' is missing; the string
# operations applied to 'Sales' in step 4 need non-null values.
df = df.dropna(subset=['Name', 'Sales'], how='any')

In [9]:
# NOTE(review): the previous cell already drops rows with a null 'Name'
# (together with null 'Sales'), so when run in order this call removes nothing.
df = df.dropna(subset=['Name'])

In [10]:
# Plain-text dump of the frame after the null filtering above.
# NOTE(review): print() wraps wide frames across several text blocks; ending the
# cell with a bare `df.head()` would give the rich table view instead.
print(df)

                                                   Name     Price  \
0     HÀNG TẶNG KHÔNG BÁN - Gel rửa mặt cà phê Đắk L...       195   
1     Gel rửa mặt cà phê Đắk Lắk Cocoon cho làn da t...       195   
2     [DEAL 1 TẶNG 1 ĐỘC QUYỀN FLASH SALE] Bơ dưỡng ...       340   
3     Combo Sáng Mịn Sạch Mụn: 1 Gel tắm bí đao Coco...       351   
4     [Mã COSCN12 giảm 8%] Bơ dưỡng thể cà phê Đắk L...       215   
...                                                 ...       ...   
3821  [Mã COSCN12 giảm 8%] Bigsize - Nước tẩy trang ...  ₫250.750   
3822  [Mã COSCN12 giảm 8%] Bigsize - Thạch hoa hồng ...  ₫327.250   
3823  [Mã COSCN12 giảm 8%] Nước hoa hồng (toner) Coc...  ₫165.750   
3824  [Mã COSCN12 giảm 8%] Tinh chất hoa hồng (serum...  ₫225.250   
3825  [Mã COSCN12 giảm 8%] Thạch hoa hồng dưỡng ẩm (...  ₫165.750   

             Sales Rating Rating record       Date  Discount  
0                     NaN           NaN   2/9/2023       NaN  
1       Đã bán 175    4.9            64   2/9

In [11]:
# Confirm that 'Name' and 'Sales' no longer contain nulls and inspect the
# column dtypes before the type conversions in the next steps.
df.info()
# Per-column count of remaining null values.
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3661 entries, 0 to 3825
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Name           3661 non-null   object
 1   Price          3586 non-null   object
 2   Sales          3661 non-null   object
 3   Rating         3583 non-null   object
 4   Rating record  3571 non-null   object
 5   Date           3661 non-null   object
 6   Discount       534 non-null    object
dtypes: object(7)
memory usage: 228.8+ KB


Name                0
Price              75
Sales               0
Rating             78
Rating record      90
Date                0
Discount         3127
dtype: int64

# 3) Date

In [12]:
from datetime import datetime
# Split the rows by whether 'Date' parses as mm/dd/yyyy. Rows that do not parse
# are treated as dd/mm/yyyy and rewritten as mm/dd/yyyy.
# NOTE(review): an ambiguous value such as "2/9/2023" parses successfully as
# mm/dd/yyyy and is therefore kept as February 9th — confirm this is intended.
parsed_mdy = pd.to_datetime(df['Date'], format='%m/%d/%Y', errors='coerce')  # parse once, reuse for both masks
e_dates = parsed_mdy.isna()
r_dates = ~e_dates

# .copy() yields independent frames, so the column assignments below do not
# write into a slice of df (which triggers SettingWithCopyWarning).
error_dates = df[e_dates].copy()
right_dates = df[r_dates].copy()

# Re-read the non-conforming rows as dd/mm/yyyy; this raises if a value fits
# neither format, so bad input is not silently kept.
error_dates['Date'] = pd.to_datetime(error_dates['Date'], format='%d/%m/%Y').dt.strftime('%m/%d/%Y')

# Re-format the already valid rows as well, so the whole column ends up in the
# same zero-padded mm/dd/yyyy form (e.g. "4/15/2023" -> "04/15/2023").
right_dates['Date'] = parsed_mdy[r_dates].dt.strftime('%m/%d/%Y')

# Combine the two subsets back into one frame (converted rows first).
df = pd.concat([error_dates, right_dates], ignore_index=True)

# 4) Sales

4.1) Remove the "Đã bán" prefix from "Sales" so that every "Sales" value has the same structure.

In [13]:
# Split the rows on whether 'Sales' starts with the "Đã bán" prefix.
# 'Sales' has no nulls at this point (they were dropped in step 2), so the mask
# is purely boolean.
has_prefix = df['Sales'].str.startswith('Đã bán')

# .copy() makes the prefixed subset an independent frame, so the assignment
# below does not write into a slice of df (SettingWithCopyWarning).
da_ban_data = df[has_prefix].copy()
non_da_ban_data = df[~has_prefix]

# Remove the "Đã bán" text; regex=False makes the literal match explicit.
da_ban_data['Sales'] = da_ban_data['Sales'].str.replace('Đã bán', '', regex=False)

# Merge the two subsets back into one frame (prefixed rows first).
df = pd.concat([da_ban_data, non_da_ban_data], ignore_index=True)

4.2) Convert "k" suffixes to thousands and change the data type (so that all "Sales" values share the same type and structure)

In [14]:
def convert_to_numeric(strings):
    """Expand a "k"-suffixed count such as "1,2k" into a float.

    Parameters
    ----------
    strings : object
        A single cell value, normally a string.

    Returns
    -------
    float or object
        ``float`` (value * 1000) when the input is a string containing "k";
        otherwise the input is returned unchanged so that a later
        ``pd.to_numeric`` call can coerce it.

    Raises
    ------
    ValueError
        If a string contains "k" but the remainder is not a valid number.
    """
    # Non-string cells (e.g. NaN for a missing value) cannot be searched for
    # "k"; hand them back untouched instead of raising TypeError.
    if not isinstance(strings, str):
        return strings
    if "k" in strings:
        # A decimal comma ("1,2k") is swapped for a dot before calling float().
        return float(strings.replace("k", "").replace(',', '.')) * 1000
    return strings
# Expand the "k" suffixes, then coerce the column to numbers. Values that are
# not numeric after the prefix removal (e.g. an empty string where there was no
# sales figure) become NaN under errors='coerce' and are stored as 0.
df['Sales'] = (
    df['Sales']
    .apply(convert_to_numeric)
    .pipe(pd.to_numeric, errors='coerce')
    .fillna(0)
)

In [15]:
# Re-check the dtypes: 'Sales' should now be numeric and contain no nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3661 entries, 0 to 3660
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           3661 non-null   object 
 1   Price          3586 non-null   object 
 2   Sales          3661 non-null   float64
 3   Rating         3583 non-null   object 
 4   Rating record  3571 non-null   object 
 5   Date           3661 non-null   object 
 6   Discount       534 non-null    object 
dtypes: float64(1), object(6)
memory usage: 200.3+ KB


In [16]:
# Select the rows whose 'Sales' value is exactly 0 (this includes values that
# were filled with 0 during the numeric conversion above) and summarise them.
zero_sales = df[df["Sales"] == 0.0]
zero_sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 356 to 2906
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           60 non-null     object 
 1   Price          60 non-null     object 
 2   Sales          60 non-null     float64
 3   Rating         25 non-null     object 
 4   Rating record  24 non-null     object 
 5   Date           60 non-null     object 
 6   Discount       6 non-null      object 
dtypes: float64(1), object(6)
memory usage: 3.8+ KB


# 5) Rating and Rating record


In [17]:
# Inspect the frame before cleaning 'Rating' and 'Rating record'.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3661 entries, 0 to 3660
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           3661 non-null   object 
 1   Price          3586 non-null   object 
 2   Sales          3661 non-null   float64
 3   Rating         3583 non-null   object 
 4   Rating record  3571 non-null   object 
 5   Date           3661 non-null   object 
 6   Discount       534 non-null    object 
dtypes: float64(1), object(6)
memory usage: 200.3+ KB


In [18]:
# Missing rating counts become the string '0' (not the number 0) so that the
# substring check in convert_to_numeric can run on every cell.
df['Rating record'] = df['Rating record'].fillna('0')

In [19]:
def convert_to_numeric(strings):
    """Expand a "k"-suffixed count such as "1,2k" into a float.

    Note: this repeats the helper of the same name defined in step 4.2; it is
    kept here so that this cell still defines the function it is named for.

    Parameters
    ----------
    strings : object
        A single cell value, normally a string.

    Returns
    -------
    float or object
        ``float`` (value * 1000) when the input is a string containing "k";
        otherwise the input is returned unchanged so that a later
        ``pd.to_numeric`` call can coerce it.

    Raises
    ------
    ValueError
        If a string contains "k" but the remainder is not a valid number.
    """
    # Non-string cells (e.g. NaN for a missing value) cannot be searched for
    # "k"; hand them back untouched instead of raising TypeError.
    if not isinstance(strings, str):
        return strings
    if "k" in strings:
        # A decimal comma ("1,2k") is swapped for a dot before calling float().
        return float(strings.replace("k", "").replace(',', '.')) * 1000
    return strings

In [20]:
# Expand the "k" suffixes in the rating counts, then coerce the column to
# numbers. Anything that still is not numeric becomes NaN under
# errors='coerce' and is stored as 0.
df['Rating record'] = (
    df['Rating record']
    .apply(convert_to_numeric)
    .pipe(pd.to_numeric, errors='coerce')
    .fillna(0)
)

In [21]:
# Display the frame after 'Rating record' has been converted to numbers.
df

Unnamed: 0,Name,Price,Sales,Rating,Rating record,Date,Discount
0,Gel rửa mặt cà phê Đắk Lắk Cocoon cho làn da t...,195,344.0,4.9,109.0,02/15/2023,
1,HÀNG TẶNG KHÔNG BÁN - Gel rửa mặt cà phê Đắk L...,195,38.0,5,10.0,02/15/2023,
2,[Mã COSCN12 giảm 8%] Tẩy da chết cơ thể cà phê...,125,120000.0,5,44400.0,02/15/2023,
3,[Mã COSCN12 giảm 8%] Tẩy da chết môi cà phê Đắ...,75,20200.0,5,8100.0,02/15/2023,
4,[Mã COSCN12 giảm 8%] Bơ dưỡng thể cà phê Đắk L...,215,10500.0,5,4300.0,02/15/2023,
...,...,...,...,...,...,...,...
3656,[Mã COSCN12 giảm 8%] Bigsize - Nước tẩy trang ...,₫250.750,8900.0,49,3200.0,4/15/2023,15% giảm
3657,[Mã COSCN12 giảm 8%] Bigsize - Thạch hoa hồng ...,₫327.250,6900.0,49,2500.0,4/15/2023,15% giảm
3658,[Mã COSCN12 giảm 8%] Nước hoa hồng (toner) Coc...,₫165.750,13100.0,5,4800.0,4/15/2023,15% giảm
3659,[Mã COSCN12 giảm 8%] Tinh chất hoa hồng (serum...,₫225.250,4000.0,5,1500.0,4/15/2023,15% giảm


In [22]:
# The placeholder text 'Chưa Có Đánh Giá' is mapped to '0'; regex=False makes
# the literal match explicit.
rating_text = df['Rating'].str.replace('Chưa Có Đánh Giá', '0', regex=False)

# pd.to_numeric cannot parse a decimal comma ("4,9"); such values would be
# coerced to NaN and then silently stored as 0. Normalise the comma to a dot so
# they keep their real value.
# NOTE(review): the recorded value counts show about 2000 zero ratings although
# only about 90 rows lack a rating count, which points at parse failures —
# presumably decimal commas; confirm against the raw CSV.
rating_text = rating_text.str.replace(',', '.', regex=False)

# Missing or still unparseable ratings are stored as 0. No dropna is needed
# afterwards because fillna(0) leaves no nulls in the column.
df['Rating'] = pd.to_numeric(rating_text, errors='coerce').fillna(0)

# 6) Price

In [23]:
# Keep the first number found in each price string (digits with an optional
# dot-separated part) and store it as float; strings without digits become NaN.
# NOTE(review): a value like "₫250.750" ends up as 250.75, i.e. the dot is read
# as a decimal point — confirm that this matches the intended unit.
price_pattern = r"(\d+\.?\d*)"
df["Price"] = df["Price"].str.extract(price_pattern, expand=False).astype(float)

In [24]:
# Keep only the number from discount strings such as "15% giảm"; rows without a
# discount stay NaN.
df["Discount"] = df["Discount"].str.extract(r"(\d+\.?\d*)", expand=False).astype(float)

# Rename exactly this one column. A substring replace over all column labels
# would also touch any other label containing 'Discount'.
df = df.rename(columns={'Discount': 'Discount(%)'})

# 7) Category and Product ID

In [25]:
# Keyword lookup that maps a product name to its category label.
def assign_category(name):
    """Return the category label for a product name.

    The lower-cased name is searched for each keyword in the order listed
    below; the first keyword found decides the label. Names that contain none
    of the keywords are labelled "Other".
    """
    lowered = name.lower()
    keyword_labels = (
        ("cà phê", "Coffee"),
        ("hoa hồng", "Rose"),
        ("bưởi", "Pomelo"),
        ("nghệ", "Ginger"),
        ("bí đao", "Squash"),
    )
    for keyword, label in keyword_labels:
        if keyword in lowered:
            return label
    return "Other"

# Keyword lookup that maps a product name to a one-letter category initial.
def assign_ini(name):
    """Return the one-letter category initial for a product name.

    Uses the same keywords and the same precedence as assign_category: the
    first keyword found in the lower-cased name decides the initial, and "O"
    is returned when no keyword is present.
    """
    lowered = name.lower()
    keyword_initials = {
        "cà phê": "C",
        "hoa hồng": "R",
        "bưởi": "P",
        "nghệ": "G",
        "bí đao": "S",
    }
    # dicts keep insertion order, so the first matching keyword wins.
    return next(
        (initial for keyword, initial in keyword_initials.items() if keyword in lowered),
        "O",
    )
# Derive the category label and its one-letter initial from each product name.
df["Category"] = df["Name"].apply(assign_category)
df["Ini_cat"] = df["Name"].apply(assign_ini)

In [26]:
# Create a separate frame for each category. .copy() makes each one independent
# of df, so adding a column below does not write into a slice
# (SettingWithCopyWarning).
# NOTE: rows labelled "Other" do not get a frame of their own here.
coffee_df = df[df["Category"] == "Coffee"].copy()
rose_df = df[df["Category"] == "Rose"].copy()
pomelo_df = df[df["Category"] == "Pomelo"].copy()
ginger_df = df[df["Category"] == "Ginger"].copy()
squash_df = df[df["Category"] == "Squash"].copy()

# Within each category, number the distinct product names starting at 1.
# ngroup() numbers groups from 0 in sorted key order, hence the + 1.
for category_frame in (coffee_df, rose_df, pomelo_df, ginger_df, squash_df):
    category_frame["Group_ID"] = category_frame.groupby("Name").ngroup() + 1

In [27]:
# Merge the per-category frames (now carrying Group_ID) back into one dataset.
# NOTE: only the five named categories are concatenated, so any row labelled
# "Other" is not part of the result.
df = pd.concat([coffee_df, rose_df, pomelo_df, ginger_df, squash_df], ignore_index=True)

# Product_ID = category initial followed by the per-category group number,
# e.g. "C4". Column-wise string concatenation avoids a Python-level call per
# row and also works when the frame is empty.
df['Product_ID'] = df['Ini_cat'].astype(str) + df['Group_ID'].astype(str)

In [28]:
# Rows that carry a discount value (non-null 'Discount(%)').
discount_program = df.dropna(subset=["Discount(%)"])
discount_program

Unnamed: 0,Name,Price,Sales,Rating,Rating record,Date,Discount(%),Category,Ini_cat,Group_ID,Product_ID
247,HÀNG TẶNG KHÔNG BÁN - Tẩy da chết mặt cà phê Đ...,165.00,3600.0,5.0,1300.0,02/28/2023,15.0,Coffee,C,8,C8
248,[Mã COSCN12 giảm 8%] Combo làm sạch da chết và...,323.00,1600.0,5.0,589.0,02/28/2023,15.0,Coffee,C,19,C19
249,[DEAL 1 TẶNG 1 ĐỘC QUYỀN FLASH SALE] Bơ dưỡng ...,340.00,734.0,0.0,226.0,02/28/2023,15.0,Coffee,C,16,C16
250,[Mã COSCN12 giảm 8%] Tẩy da chết môi cà phê Đắ...,75.00,20400.0,5.0,8200.0,02/28/2023,15.0,Coffee,C,21,C21
252,[Mã COSCN12 giảm 8%] Tẩy da chết cơ thể cà phê...,125.00,121200.0,5.0,44700.0,02/28/2023,15.0,Coffee,C,20,C20
...,...,...,...,...,...,...,...,...,...,...,...
3656,[COMBO MUA 1 TẶNG 1 ÁP DỤNG 01/04-30/04] Kem c...,395.00,621.0,0.0,172.0,4/15/2023,43.0,Squash,S,20,S20
3657,[Mã COSCN12 giảm 8%] Bigsize - Nước bí đao cân...,250.75,5500.0,0.0,2000.0,4/15/2023,15.0,Squash,S,24,S24
3658,Tinh chất bí đao (serum) 7% Niacinamide sạch m...,250.75,4000.0,0.0,1400.0,4/15/2023,15.0,Squash,S,18,S18
3659,[Mã COSCN12 giảm 8%] Nước bí đao cân bằng da (...,165.75,17600.0,5.0,7400.0,4/15/2023,15.0,Squash,S,33,S33


In [29]:
# Final check of the preprocessed frame: dtypes and non-null counts per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3661 entries, 0 to 3660
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           3661 non-null   object 
 1   Price          3585 non-null   float64
 2   Sales          3661 non-null   float64
 3   Rating         3661 non-null   float64
 4   Rating record  3661 non-null   float64
 5   Date           3661 non-null   object 
 6   Discount(%)    533 non-null    float64
 7   Category       3661 non-null   object 
 8   Ini_cat        3661 non-null   object 
 9   Group_ID       3661 non-null   int64  
 10  Product_ID     3661 non-null   object 
dtypes: float64(5), int64(1), object(5)
memory usage: 314.7+ KB


In [30]:
# Record information of the final data set as "final_data_info".
# DataFrame.info() prints its report and returns None, so the text is captured
# through the `buf` argument instead of assigning the call's return value.
import io

info_buffer = io.StringIO()
df.info(buf=info_buffer)
final_data_info = info_buffer.getvalue()
print(final_data_info)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3661 entries, 0 to 3660
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           3661 non-null   object 
 1   Price          3585 non-null   float64
 2   Sales          3661 non-null   float64
 3   Rating         3661 non-null   float64
 4   Rating record  3661 non-null   float64
 5   Date           3661 non-null   object 
 6   Discount(%)    533 non-null    float64
 7   Category       3661 non-null   object 
 8   Ini_cat        3661 non-null   object 
 9   Group_ID       3661 non-null   int64  
 10  Product_ID     3661 non-null   object 
dtypes: float64(5), int64(1), object(5)
memory usage: 314.7+ KB


In [31]:
# Write the combined frame and every per-category frame to CSV. The row index is
# written as the first column of each file (index=True).
# Note: the per-category frames were created before Product_ID was added to df,
# so only "Preprocessed_data.csv" contains that column.
csv_outputs = {
    "Preprocessed_data.csv": df,
    "Coffee_data.csv": coffee_df,
    "Rose_df.csv": rose_df,
    "Pomelo_df.csv": pomelo_df,
    "Ginger_df.csv": ginger_df,
    "Squash_df.csv": squash_df,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name, index=True)

In [32]:
# Display the final preprocessed frame.
df

Unnamed: 0,Name,Price,Sales,Rating,Rating record,Date,Discount(%),Category,Ini_cat,Group_ID,Product_ID
0,Gel rửa mặt cà phê Đắk Lắk Cocoon cho làn da t...,195.00,344.0,4.9,109.0,02/15/2023,,Coffee,C,4,C4
1,HÀNG TẶNG KHÔNG BÁN - Gel rửa mặt cà phê Đắk L...,195.00,38.0,5.0,10.0,02/15/2023,,Coffee,C,5,C5
2,[Mã COSCN12 giảm 8%] Tẩy da chết cơ thể cà phê...,125.00,120000.0,5.0,44400.0,02/15/2023,,Coffee,C,20,C20
3,[Mã COSCN12 giảm 8%] Tẩy da chết môi cà phê Đắ...,75.00,20200.0,5.0,8100.0,02/15/2023,,Coffee,C,21,C21
4,[Mã COSCN12 giảm 8%] Bơ dưỡng thể cà phê Đắk L...,215.00,10500.0,5.0,4300.0,02/15/2023,,Coffee,C,17,C17
...,...,...,...,...,...,...,...,...,...,...,...
3656,[COMBO MUA 1 TẶNG 1 ÁP DỤNG 01/04-30/04] Kem c...,395.00,621.0,0.0,172.0,4/15/2023,43.0,Squash,S,20,S20
3657,[Mã COSCN12 giảm 8%] Bigsize - Nước bí đao cân...,250.75,5500.0,0.0,2000.0,4/15/2023,15.0,Squash,S,24,S24
3658,Tinh chất bí đao (serum) 7% Niacinamide sạch m...,250.75,4000.0,0.0,1400.0,4/15/2023,15.0,Squash,S,18,S18
3659,[Mã COSCN12 giảm 8%] Nước bí đao cân bằng da (...,165.75,17600.0,5.0,7400.0,4/15/2023,15.0,Squash,S,33,S33


In [33]:
# Distribution of the cleaned rating values. A value of 0 covers both missing
# ratings and values that could not be parsed as numbers, since both were
# filled with 0 in the Rating step.
df['Rating'].value_counts()

0.0    2006
5.0    1312
4.9     330
4.0      11
4.7       2
Name: Rating, dtype: int64

# 8) Check data information for final result


In [34]:
# Reload the file written above to check what was actually saved. The index that
# was written with index=True comes back as an 'Unnamed: 0' column.
df1= pd.read_csv('Preprocessed_data.csv')

In [35]:
# Summary statistics of the numeric columns of the in-memory frame `df`.
# NOTE(review): the reloaded `df1` from the previous cell is not used here —
# confirm which frame was meant to be described.
df.describe()

Unnamed: 0,Price,Sales,Rating,Rating record,Discount(%),Group_ID
count,3585.0,3661.0,3661.0,3661.0,533.0,3661.0
mean,290.104017,8790.183556,2.248129,3336.063371,17.358349,14.312756
std,147.94029,18069.941718,2.476142,7423.129383,6.692853,8.033654
min,60.0,0.0,0.0,0.0,5.0,1.0
25%,195.0,763.0,0.0,240.0,15.0,8.0
50%,289.0,3900.0,0.0,1300.0,15.0,15.0
75%,345.0,9500.0,5.0,3400.0,15.0,20.0
max,955.0,126300.0,5.0,124800.0,48.0,36.0


In [36]:
# Column overview of the reloaded file, including the extra index column.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3661 entries, 0 to 3660
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     3661 non-null   int64  
 1   Name           3661 non-null   object 
 2   Price          3585 non-null   float64
 3   Sales          3661 non-null   float64
 4   Rating         3661 non-null   float64
 5   Rating record  3661 non-null   float64
 6   Date           3661 non-null   object 
 7   Discount(%)    533 non-null    float64
 8   Category       3661 non-null   object 
 9   Ini_cat        3661 non-null   object 
 10  Group_ID       3661 non-null   int64  
 11  Product_ID     3661 non-null   object 
dtypes: float64(5), int64(2), object(5)
memory usage: 343.3+ KB


In [37]:
# List the column labels of the reloaded frame to pick the ones to drop next.
print(df1.columns)

Index(['Unnamed: 0', 'Name', 'Price', 'Sales', 'Rating', 'Rating record',
       'Date', 'Discount(%)', 'Category', 'Ini_cat', 'Group_ID', 'Product_ID'],
      dtype='object')


In [38]:
# Drop the saved index column ('Unnamed: 0') and the 'Group_ID' helper column so
# that neither shows up in the summary statistics computed below.
df1=df1.drop(columns=['Unnamed: 0','Group_ID'])

In [39]:
# Show floats with two fixed decimals instead of scientific notation. This is a
# pandas display option, so it applies to every frame rendered after this cell.
pd.options.display.float_format = '{:.2f}'.format
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of the reloaded frame.
stats = df1.describe()
# Save the statistics table to a CSV file, keeping the statistic names as index.
stats.to_csv("Statiscal result.csv", index= True)
stats

Unnamed: 0,Price,Sales,Rating,Rating record,Discount(%)
count,3585.0,3661.0,3661.0,3661.0,533.0
mean,290.1,8790.18,2.25,3336.06,17.36
std,147.94,18069.94,2.48,7423.13,6.69
min,60.0,0.0,0.0,0.0,5.0
25%,195.0,763.0,0.0,240.0,15.0
50%,289.0,3900.0,0.0,1300.0,15.0
75%,345.0,9500.0,5.0,3400.0,15.0
max,955.0,126300.0,5.0,124800.0,48.0
