In [2]:
import pandas as pd
import numpy as np
import re

# Import the Data

In [26]:
pro = pd.read_csv("product_details.csv")
cus = pd.read_csv("customer.csv")
ord = pd.read_csv("order_details.csv")

# Clean the Data

### 1. Product Details Data

In [3]:
pro.head()

Unnamed: 0,Uniqe Id,Product Name,Brand Name,Asin,Category,Upc Ean Code,List Price,Selling Price,Quantity,Model Number,...,Product Url,Stock,Product Details,Dimensions,Color,Ingredients,Direction To Use,Is Amazon Seller,Size Quantity Variant,Product Description
0,4c69b61db1fc16e7013b43fc926e502d,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",,,Sports & Outdoors | Outdoor Recreation | Skate...,,,$237.68,,,...,https://www.amazon.com/DB-Longboards-CoreFlex-...,,,,,,,Y,,
1,66d49bbed043f5be260fa9f7fbff5957,"Electronic Snap Circuits Mini Kits Classpack, ...",,,Toys & Games | Learning & Education | Science ...,,,$99.95,,55324.0,...,https://www.amazon.com/Electronic-Circuits-Cla...,,,,,,,Y,,
2,2c55cae269aebf53838484b0d7dd931a,3Doodler Create Flexy 3D Printing Filament Ref...,,,Toys & Games | Arts & Crafts | Craft Kits,,,$34.99,,,...,https://www.amazon.com/3Doodler-Plastic-Innova...,,,,,,,Y,,
3,18018b6bc416dab347b1b7db79994afa,Guillow Airplane Design Studio with Travel Cas...,,,Toys & Games | Hobbies | Models & Model Kits |...,,,$28.91,,142.0,...,https://www.amazon.com/Guillow-Airplane-Design...,,,,,,,Y,,
4,e04b990e95bf73bbe6a3fa09785d7cd0,Woodstock- Collage 500 pc Puzzle,,,Toys & Games | Puzzles | Jigsaw Puzzles,,,$17.49,,62151.0,...,https://www.amazon.com/Woodstock-Collage-500-p...,,,,,,,Y,,


In [4]:
pro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Uniqe Id               10002 non-null  object 
 1   Product Name           10002 non-null  object 
 2   Brand Name             0 non-null      float64
 3   Asin                   0 non-null      float64
 4   Category               9172 non-null   object 
 5   Upc Ean Code           34 non-null     object 
 6   List Price             0 non-null      float64
 7   Selling Price          9895 non-null   object 
 8   Quantity               0 non-null      float64
 9   Model Number           8230 non-null   object 
 10  About Product          9729 non-null   object 
 11  Product Specification  8370 non-null   object 
 12  Technical Details      9212 non-null   object 
 13  Shipping Weight        8864 non-null   object 
 14  Product Dimensions     479 non-null    object 
 15  Im

#### 1.1. Check for Null Value 

In [7]:
pro.isnull().sum()                                  # Check for null value  

Uniqe Id                     0
Product Name                 0
Brand Name               10002
Asin                     10002
Category                   830
Upc Ean Code              9968
List Price               10002
Selling Price              107
Quantity                 10002
Model Number              1772
About Product              273
Product Specification     1632
Technical Details          790
Shipping Weight           1138
Product Dimensions        9523
Image                        0
Variants                  7524
Sku                      10002
Product Url                  0
Stock                    10002
Product Details          10002
Dimensions               10002
Color                    10002
Ingredients              10002
Direction To Use         10002
Is Amazon Seller             0
Size Quantity Variant    10002
Product Description      10002
dtype: int64

###### There are many columns with no value at all and also columns with huge amount of missing value, so i decided to drop most of the column
###### and keep those which i find necessary and not have so much missing values

In [27]:
pro = pro.iloc[:, [0,1,4,7,13,25]]                  # Drop columns with no value

#### 1.2. Deal with 'Category' column

###### There are multiple category in 1 cell, separated by a '|', so I decided to create a new Table named Category with mulitiple categories display in one column only

In [34]:
pro['Category'] = pro['Category'].ffill()  
cat = pro[['Uniqe Id','Category']].copy()                        # Create a new table with 'Unique ID' and 'Category' only
cat['Category']= cat['Category'].str.split('|')                  # There are multiple category in 1 cell, separated by a '|' , so I split them
cat = cat.explode('Category').reset_index(drop=True)             # Display each of the category in one row only
cat.head()

Unnamed: 0,Uniqe Id,Category
0,4c69b61db1fc16e7013b43fc926e502d,Sports & Outdoors
1,4c69b61db1fc16e7013b43fc926e502d,Outdoor Recreation
2,4c69b61db1fc16e7013b43fc926e502d,"Skates, Skateboards & Scooters"
3,4c69b61db1fc16e7013b43fc926e502d,Skateboarding
4,4c69b61db1fc16e7013b43fc926e502d,Standard Skateboards & Longboards


#### 1.3. Deal with 'Selling Price' column

In [29]:
pro['Selling Price'] = pro['Selling Price'].str.replace('$', '')                # Remove the $ symbol from Selling Price
pro['Selling Price'] = pro['Selling Price'].ffill()                             # Fill the 'Selling Price' null values with the above value

# Converts messily formatted cells to standard formatting (From '12.99 - 24.99','1,222.99','12 99', ... to 12.99)
pattern1 = r'^\d{1,3}\.\d{2}$'                                # 12.99 - standard formatting
pattern2 = r'^\d{1,3}\.\d{2} - \d{1,3}\.\d{2}$'               # 12.99 - 24.99
pattern3 = r'^\d{1,3}\,\d{1,3}\.\d{2}$'                       # 1,222.99
pattern4 = r'^\d{1,3}\.\d{2}  \d{1,3} \. \d{2}$'              # 12.99  12 . 99 
pattern5 = r'^Total price:$'                                  # Total Price
pattern6 = r'^\d{1,3}\.\d{2} - \d{1,3}\,\d{1,3}\.\d{2}$'      # 12.99 - 1,111.99
pattern7 = r'^\d{1,3}\.\d{2} \d{1,3}\.\d{2}$'                 # 12.99 12.99 
pattern8 = r'^ \d{1,3} \d{1,3}$'                              # 12 99 
pattern9 = r'^ \d{1,3} \d{1,3} \d{1,3}\.\d{2}.*'              # 19 99 39.95 #listPriceLegalMe ...
pattern10 = r'^\d{1,3}\.\d{1,3}  \d{1,3}$'                    # 8.99 8


for i, data in pro['Selling Price'].items():
    if re.match(pattern1, data):
        pro.loc[i, 'Selling Price'] = data
    elif re.match(pattern2, data):
        price_range = data.split('-')
        lower_bound = price_range[0]
        upper_bound = price_range[1]
        pro.loc[i, 'Selling Price'] = (float(lower_bound) + float(upper_bound)) / 2
    elif re.match(pattern3, data):
        pro.loc[i, 'Selling Price'] = data.replace(',', '')
    elif re.match(pattern4, data):
        values = data.split()
        pro.loc[i, 'Selling Price'] = values[0]
    elif data == 'Total price:':
        pro.loc[i, 'Selling Price'] = pro.loc[i-1, 'Selling Price']
    elif re.match(pattern6, data):
        price_range = data.split('-')
        lower_bound = price_range[0]
        upper_bound = price_range[1].replace(',', '')
        pro.loc[i, 'Selling Price'] = (float(lower_bound) + float(upper_bound)) / 2
    elif re.match(pattern7, data):
        pro.loc[i, 'Selling Price'] = data.split(' ')[0]
    elif re.match(pattern8, data):
        pro.loc[i, 'Selling Price'] = data.replace(' ', '',1).replace(' ', '.',2)
    elif re.match(pattern9, data):
        price_range = data.split()
        lower_bound = price_range[0] + '.' + price_range[1]
        upper_bound = price_range[2]
        pro.loc[i, 'Selling Price'] = (float(lower_bound) + float(upper_bound)) / 2
    elif re.match(pattern10, data):
        pro.loc[i, 'Selling Price'] = data.split()[0]
    else:
        pro.loc[i, 'Selling Price'] = pro.loc[i-1, 'Selling Price']

#### 1.4. Deal with 'Shipping Weight' column

In [30]:
pro['Shipping Weight'] = pro['Shipping Weight'].ffill()                       # Fill 'Shipping Weight' null value with the above value
pro.loc[1619, 'Shipping Weight'] = pro.loc[1618, 'Shipping Weight']           # There is a index that has '.pounds' only so i replace with the above value
pro['Shipping Weight'] = pro['Shipping Weight'].str.replace(',', '.')         # There are some index with '1,88' instead of '1.88', so i replace ',' with '.'

# Turn all value to pounds (the index with ounces will be devided by 16)
for i, data in pro['Shipping Weight'].items():
    if 'pound' in data:
        pro.loc[i, 'Shipping Weight'] = float(data.split()[0])
    elif 'ounce' in data:
        pro.loc[i, 'Shipping Weight'] = float(data.split()[0])/16
    else:
        pro.loc[i, 'Shipping Weight'] = pro.loc[i-1, 'Selling Price']

#### 1.5. Deal with 'Subscription Status' column

In [31]:
# Change value from 'Subscription Status' column to one format
pro['Is Amazon Seller'] = pro['Is Amazon Seller'].str.replace('Y', 'Yes')
pro['Is Amazon Seller'] = pro['Is Amazon Seller'].str.replace('N', 'No')

#### 1.6. Check for Duplicated Value

In [32]:
pro.duplicated().sum()

0

#### 1.7. Convert all columns to suitable type

In [35]:
pro['Selling Price'] = pro['Selling Price'].astype('float')
pro['Shipping Weight'] = pro['Shipping Weight'].astype('float')
pro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Uniqe Id          10002 non-null  object 
 1   Product Name      10002 non-null  object 
 2   Category          10002 non-null  object 
 3   Selling Price     10002 non-null  float64
 4   Shipping Weight   10002 non-null  float64
 5   Is Amazon Seller  10002 non-null  object 
dtypes: float64(2), object(4)
memory usage: 469.0+ KB


### 2. Customer Data

In [36]:
cus.head()

Unnamed: 0,ID,Age,Gender,Location
0,sdb5huosnxxfkffuy35wlz610rs8coc7,55,Male,Kentucky
1,aknjq1s2spcia7idfw5wepumiz6deeko,19,Male,Maine
2,pnoo0r4kekut1cd6ipwhhf2wrvcz2v2t,50,Male,Massachusetts
3,tmmqqu7wh0u0ahc6mop0topxirvhgoby,21,Male,Rhode Island
4,jmrgfqd25k8dbenexmbbygwzwbgv4abw,45,Male,Oregon


In [37]:
cus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        3900 non-null   object
 1   Age       3900 non-null   int64 
 2   Gender    3900 non-null   object
 3   Location  3900 non-null   object
dtypes: int64(1), object(3)
memory usage: 122.0+ KB


###### The Customer Data is pretty nice with no Null Value and the columns are at their suitable types, so I just gonna check for the Duplicated Value only

#### 2.1. Check for Duplcated Value

In [38]:
cus.duplicated().sum()

0

### 3. Order Details Data

In [39]:
ord.head()

Unnamed: 0,Customer ID,Product ID,Order ID,Order Quantity,Review Rating,Payment Method,Subscription Status,Shipping Type,Discount,Date
0,cmqva4sucpxicgbcuhuiuwmbcuamldsq,d25a70e80b554fdcc4d53a7939a1cd81,qibq5bx9bjf6xlzeu3o7s1poajjxrizy,2,4.8,PayPal,No,Store Pickup,38.21%,2022-01-05
1,7gvjwveptda4cquw00s5j6krp6kperzi,325be9d8fef8ba85153e9f74c42269ba,39flqqbibqqoat80qxzt5ple84f8mdtg,14,4.4,Bank Transfer,No,Next Day Air,4.18%,2022-01-04
2,el5ara0cqvunvbm0pczhj5noc0uulrtj,af11d5f530665e1500ec2dcfe72a6fe2,cgmo5mgpbqtdyj5iv6s0cubprzur2i9c,7,3.7,Cash,No,2-Day Shipping,20.07%,2022-01-09
3,roxarwthqkzvk3usslyutpfaxgu9awiy,a047a3c2aa91a3d30b71a97e14b0aacf,5gdnbfsfgyeoxmve2p06rrb4fd5hl5cu,9,4.9,PayPal,Yes,2-Day Shipping,5.23%,2022-01-25
4,cbkiyqnvlnibrfhdwupmdavvrwu7zyhj,d3bafe51896376b23424226386ca8820,cyf8z5ljftql2o1qz71gdyvoefmacmlm,7,4.8,Bank Transfer,No,Standard,24.4%,2022-01-31


In [40]:
ord.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12137 entries, 0 to 12136
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Customer ID          12137 non-null  object 
 1   Product ID           12137 non-null  object 
 2   Order ID             12137 non-null  object 
 3   Order Quantity       12137 non-null  int64  
 4   Review Rating        12137 non-null  float64
 5   Payment Method       12137 non-null  object 
 6   Subscription Status  11773 non-null  object 
 7   Shipping Type        11922 non-null  object 
 8   Discount             12087 non-null  object 
 9   Date                 12137 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 948.3+ KB


#### 3.1. Check for Null Value

In [42]:
ord.isnull().sum()

Customer ID              0
Product ID               0
Order ID                 0
Order Quantity           0
Review Rating            0
Payment Method           0
Subscription Status    364
Shipping Type          215
Discount                50
Date                     0
dtype: int64

###### The maximum Null Value of each column is 364, not  so mmuch in comparison with 12137 rows, so I decided to fill them with the value from the aboved rows

In [45]:
ord['Subscription Status'] = ord['Subscription Status'].ffill()
ord['Shipping Type'] = ord['Shipping Type'].ffill()
ord['Discount'] = ord['Discount'].ffill()

#### 3.2. Deal with 'Subscription Status' columns

In [47]:
ord['Subscription Status'].unique()

array(['No', 'Yes', 'Y', 'Ye', 'no', 'Yess'], dtype=object)

###### The format of the column is a little bit messy, so I convert them in 'Yes' and 'No' only

In [49]:
for i, data in ord['Subscription Status'].items():
    if data == 'Y':
        ord.loc[i, 'Subscription Status'] = 'Yes'
    elif data == 'Ye':
        ord.loc[i, 'Subscription Status'] = 'Yes'
    elif data == 'Yess':
        ord.loc[i, 'Subscription Status'] = 'Yes'
    elif data == 'no':
        ord.loc[i, 'Subscription Status'] = 'No'
    else: ord.loc[i, 'Subscription Status'] = data

#### 3.3. Deal with 'Discount' column

In [52]:
ord['Discount'] = ord['Discount'].str.replace('%', '')          # Remove % symbol from 'Discount' column

#### 3.3. Convert all columns to suitable types

In [54]:
ord['Date'] = ord['Date'].astype('datetime64[ns]')
ord['Discount'] = (ord['Discount'].astype('float'))/100

### 4. Save the Data for Power BI

In [89]:
# Rename the columns to show similarity across tables
pro = pro.rename(columns = {'Uniqe Id' : 'Product ID'})
cus = cus.rename(columns = {'ID' : 'Customer ID'})
cat = cat.rename(columns = {'Uniqe Id' : 'Product ID'})

ord.to_csv('D:\Porfolio\Project 1 - E-Commerce - Draft\Cleaned Data\Order.csv', index=False)
cus.to_csv('D:\Porfolio\Project 1 - E-Commerce - Draft\Cleaned Data\Cutomer.csv', index=False)
cat.to_csv('D:\Porfolio\Project 1 - E-Commerce - Draft\Cleaned Data\Category.csv', index=False)
pro.to_csv('D:\Porfolio\Project 1 - E-Commerce - Draft\Cleaned Data\Product.csv', index=False)

In [90]:
loca.to_csv('D:\Porfolio\Project 1 - E-Commerce - Draft\Cleaned Data\Location Group.csv', index=False)
ord_cat.to_csv('D:\Porfolio\Project 1 - E-Commerce - Draft\Cleaned Data\Order - Category.csv', index=False)
ord_pro.to_csv('D:\Porfolio\Project 1 - E-Commerce - Draft\Cleaned Data\Order - Product.csv', index=False)

### 5. Join the Data for Further Calculation

In [59]:
ord_pro = pd.merge(ord, pro, on='Product ID')
ord_cat = pd.merge(ord, cat, on='Product ID')
ord_cus = pd.merge(ord, cus, on='Customer ID')

### 6. Some overview KPIs and Analyses

#### 6.1 KPIs

In [60]:
# 1. Total Order
total_order = ord['Order ID'].nunique()

# 2. Unique Customer
unique_customer = ord['Customer ID'].nunique()

# 3. Unique Product
unique_product = ord['Product ID'].nunique()

# 4. Total Item Order
total_item_order = ord['Order Quantity'].sum()

# 5. Total Revenue
ord_pro['Payment Value'] = ord_pro['Order Quantity'] * ord_pro['Selling Price'] * ord_pro['Discount']
total_revenue = ord_pro['Payment Value'].sum().round(decimals=2)

# 6. Average Order Value
average_order_value = (total_revenue / total_order).round(decimals=2)

# 7. Item Ordered per Customer
item_ordered_pre_customer = (total_item_order / unique_customer).round(decimals=2)

# 8. Average Review Score
average_review_score = ord['Review Rating'].mean().round(decimals=2)

# 9. Average Discount Value
average_discount = ord['Discount'].mean().round(decimals=2)*100
average_discount_value = f"{average_discount}%"

In [65]:
KPIs = pd.DataFrame({'KPIs':[
    total_order,
    unique_customer,
    unique_product,
    total_item_order,
    total_revenue,
    average_order_value,
    item_ordered_pre_customer,
    average_review_score,
    average_discount_value
    ]},index=['Total Order', 
              'Unique Customers',
              'Unique Product',
              'Total Item Ordered',
              'Total Revenue',
              'Average Order Value',
              'Item Ordered per Customer',
              'Average Review Score',
              'Average Discount Value' ])
KPIs.round(decimals=2)

Unnamed: 0,KPIs
Total Order,12137
Unique Customers,3723
Unique Product,6983
Total Item Ordered,94885
Total Revenue,666578.16
Average Order Value,54.92
Item Ordered per Customer,25.49
Average Review Score,3.75
Average Discount Value,18.0%


#### 6.2. Number of Orders over Time

In [67]:
order_year = ord['Date'].dt.year
order_month = ord['Date'].dt.month

orders_by_time = ord.groupby([order_year,order_month])['Order ID'].count()
orders_by_time

Date  Date
2019  1        90
      2        92
      3       106
      4        97
      5        91
      6        85
      7        86
      8        92
      9       112
      10      129
      11      150
      12      133
2020  1       207
      2       225
      3       246
      4       237
      5       210
      6       207
      7       198
      8       225
      9       273
      10      300
      11      351
      12      321
2021  1       303
      2       310
      3       295
      4       269
      5       254
      6       228
      7       292
      8       292
      9       333
      10      363
      11      407
      12      393
2022  1       306
      2       357
      3       352
      4       286
      5       232
      6       261
      7       281
      8       364
      9       430
      10      459
      11      426
      12      381
Name: Order ID, dtype: int64

#### 6.3. Most Ordered Category

In [68]:
ord_cat['Category'].value_counts().head(10)

Category
Toys & Games                                  8855
 Hobbies                                      1197
Home & Kitchen                                 952
Clothing, Shoes & Jewelry                      829
 Costumes                                      726
 Games & Accessories                           725
 Costumes & Accessories                        708
Sports & Outdoors                              687
 Remote & App Controlled Vehicles & Parts      634
 Dress Up & Pretend Play                       601
Name: count, dtype: int64

#### 6.4. States with most Orders

In [87]:
loca = ord_cus['Location'].value_counts()
loca = loca.reset_index()
loca.columns = ['Location', 'Count']
loca['Order Range'] = pd.cut(loca['Count'], [0,190,220,250,280,319], labels = ['< 190', '190 - 220', '220 - 250', '250 - 280', '> 280'])
loca.head(10)

Unnamed: 0,Location,Count,Order Range
0,Montana,319,> 280
1,Illinois,296,> 280
2,California,285,> 280
3,Minnesota,284,> 280
4,Vermont,268,250 - 280
5,Georgia,267,250 - 280
6,Alabama,267,250 - 280
7,Idaho,266,250 - 280
8,Mississippi,266,250 - 280
9,Maryland,264,250 - 280
