# **Cleaning and Preparation of data**

This dataset is from the Amazon Sales Report file from https://www.kaggle.com/datasets/thedevastator/unlock-profits-with-e-commerce-sales-data/data. 

It contains data capturing the fulfilment and delivery status of different transactions. However, this dataset primarily focuses on textile products while our project focuses on electronics. Hence, we will be generating synthetic data based on the constraints in the columns we are interested in in the Amazon Sales Report dataset.

In [287]:
# Importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')

In [288]:
# Load datasets

df = pd.read_csv('../source/Amazon Sale Report.csv')
online_sales_df = pd.read_csv('../online_sales_edited.csv')

In [289]:
# Check the first few rows of df

df.head()

Unnamed: 0,index,Order ID,Date,Status,Fulfilment,Sales Channel,ship-service-level,Style,SKU,Category,...,currency,Amount,ship-city,ship-state,ship-postal-code,ship-country,promotion-ids,B2B,fulfilled-by,Unnamed: 22
0,0,405-8078784-5731545,04-30-22,Cancelled,Merchant,Amazon.in,Standard,SET389,SET389-KR-NP-S,Set,...,INR,647.62,MUMBAI,MAHARASHTRA,400081.0,IN,,False,Easy Ship,
1,1,171-9198151-1101146,04-30-22,Shipped - Delivered to Buyer,Merchant,Amazon.in,Standard,JNE3781,JNE3781-KR-XXXL,kurta,...,INR,406.0,BENGALURU,KARNATAKA,560085.0,IN,Amazon PLCC Free-Financing Universal Merchant ...,False,Easy Ship,
2,2,404-0687676-7273146,04-30-22,Shipped,Amazon,Amazon.in,Expedited,JNE3371,JNE3371-KR-XL,kurta,...,INR,329.0,NAVI MUMBAI,MAHARASHTRA,410210.0,IN,IN Core Free Shipping 2015/04/08 23-48-5-108,True,,
3,3,403-9615377-8133951,04-30-22,Cancelled,Merchant,Amazon.in,Standard,J0341,J0341-DR-L,Western Dress,...,INR,753.33,PUDUCHERRY,PUDUCHERRY,605008.0,IN,,False,Easy Ship,
4,4,407-1069790-7240320,04-30-22,Shipped,Amazon,Amazon.in,Expedited,JNE3671,JNE3671-TU-XXXL,Top,...,INR,574.0,CHENNAI,TAMIL NADU,600073.0,IN,,False,,


In [290]:
# Check column names of df

df.columns

Index(['index', 'Order ID', 'Date', 'Status', 'Fulfilment', 'Sales Channel ',
       'ship-service-level', 'Style', 'SKU', 'Category', 'Size', 'ASIN',
       'Courier Status', 'Qty', 'currency', 'Amount', 'ship-city',
       'ship-state', 'ship-postal-code', 'ship-country', 'promotion-ids',
       'B2B', 'fulfilled-by', 'Unnamed: 22'],
      dtype='object')

In [291]:
# Check column names of online_sales_df

online_sales_df.columns

Index(['user_id', 'transaction_id', 'date', 'product_id', 'Quantity',
       'Delivery_Charges', 'Coupon_Status', 'Coupon_Code', 'Discount_pct'],
      dtype='object')

## Shipping Status Table

We are interested in the Status, Fulfilment, ship-service-level, Courier Status, B2B and fulfilled-by columns. Hence, we create these columns in `online_sales_df` and drop the columns we are not interested in `online_sales_df`. Additionally, we create shipping_id and estimated_delivery_date columns.

In [292]:
# Drop uninterested columns, create column names of interest in online_sales_df

online_sales_df.drop(columns = ['Quantity',
                                'Delivery_Charges',
                                'Coupon_Status',
                                'Coupon_Code', 
                                'Discount_pct'], inplace = True)

column_names = ['shipping_id', 'status', 'fulfilment', 'ship_service_level', 'estimated_delivery_date', 'courier_status', 'b2b', 'fulfilled_by']

for i in column_names:
    online_sales_df[i] = np.nan

online_sales_df.head()

Unnamed: 0,user_id,transaction_id,date,product_id,shipping_id,status,fulfilment,ship_service_level,estimated_delivery_date,courier_status,b2b,fulfilled_by
0,17850,16679,2019-01-01,B09DL9978Y,,,,,,,,
1,17850,16680,2019-01-01,B09DL9978Y,,,,,,,,
2,17850,16681,2019-01-01,B07GXHC691,,,,,,,,
3,17850,16682,2019-01-01,B08NCKT9FG,,,,,,,,
4,17850,16682,2019-01-01,B08H21B6V7,,,,,,,,


In [293]:
# Generate synthetic shipping_id
online_sales_df['shipping_id'] = range(10001, 10001 + len(online_sales_df))

online_sales_df.head()

Unnamed: 0,user_id,transaction_id,date,product_id,shipping_id,status,fulfilment,ship_service_level,estimated_delivery_date,courier_status,b2b,fulfilled_by
0,17850,16679,2019-01-01,B09DL9978Y,10001,,,,,,,
1,17850,16680,2019-01-01,B09DL9978Y,10002,,,,,,,
2,17850,16681,2019-01-01,B07GXHC691,10003,,,,,,,
3,17850,16682,2019-01-01,B08NCKT9FG,10004,,,,,,,
4,17850,16682,2019-01-01,B08H21B6V7,10005,,,,,,,


Now, we populate the columns with synthetic data based on the values and probabilities we have derived from our original Amazon Sales Report dataset, starting with the status column.

In [294]:
# Check the distinct values in the status column

status_values = df['Status'].unique()

print(status_values)

['Cancelled' 'Shipped - Delivered to Buyer' 'Shipped'
 'Shipped - Returned to Seller' 'Shipped - Rejected by Buyer'
 'Shipped - Lost in Transit' 'Shipped - Out for Delivery'
 'Shipped - Returning to Seller' 'Shipped - Picked Up' 'Pending'
 'Pending - Waiting for Pick Up' 'Shipped - Damaged' 'Shipping']


In [295]:
# Check the relationship between status and courier status

for i in status_values:
    print(df[df['Status']== i]['Courier Status'].unique()) 

[nan 'Cancelled' 'Unshipped']
['Shipped' nan]
['Shipped' 'Unshipped' 'Cancelled']
['Shipped' nan]
['Shipped']
['Shipped']
['Shipped']
['Shipped']
['Shipped']
['Unshipped' 'Cancelled' 'Shipped']
['Unshipped']
['Shipped']
['Unshipped']


We note that there are NA values for courier status and hence we replace the NA values with suitable statements.

In [296]:
# Fill nan entries with suitable data

df.loc[(df['Status'] == 'Cancelled') & (df['Courier Status'].isna()), 'Courier Status'] = 'Courier not booked'
df.loc[(df['Status'] == 'Shipped - Delivered to Buyer') & (df['Courier Status'].isna()), 'Courier Status'] = 'Shipped'
df.loc[(df['Status'] == 'Shipped - Returned to Seller') & (df['Courier Status'].isna()), 'Courier Status'] = 'Shipped'
df.loc[(df['Status'] == 'Shipping') & (df['Courier Status'].isna()), 'Courier Status'] = 'Shipped'

for i in status_values:
    print(df[df['Status']== i]['Courier Status'].unique()) 


['Courier not booked' 'Cancelled' 'Unshipped']
['Shipped']
['Shipped' 'Unshipped' 'Cancelled']
['Shipped']
['Shipped']
['Shipped']
['Shipped']
['Shipped']
['Shipped']
['Unshipped' 'Cancelled' 'Shipped']
['Unshipped']
['Shipped']
['Unshipped']


In [297]:
# Tabulate the values and constraints of df for synthetic data

values = []
prob = []

data_generation_names = ["Status", "Fulfilment", "ship-service-level", "Courier Status", "B2B"]

for i in data_generation_names:
    values.append(df[i].value_counts(normalize=True).index.tolist())
    prob.append(df[i].value_counts(normalize=True).values)

In [298]:
# Generate synthetic data for status column

online_sales_df['status'] = np.random.choice(values[0], size = len(online_sales_df), p = prob[0])



In [299]:
# Check the unique entries in the status column

online_sales_df['status'].unique()

array(['Shipped - Delivered to Buyer', 'Shipped', 'Cancelled',
       'Shipped - Returned to Seller', 'Shipped - Picked Up',
       'Pending - Waiting for Pick Up', 'Pending',
       'Shipped - Returning to Seller', 'Shipped - Rejected by Buyer',
       'Shipping', 'Shipped - Out for Delivery',
       'Shipped - Lost in Transit'], dtype=object)

In [300]:
# Synthesise data for courier_status column

courier_status_cancelled = ["Courier not booked", "Cancelled", "Unshipped"]
courier_status_shipped = ["Shipped", "Unshipped", "Cancelled"]
courier_status_pending = ['Unshipped', 'Cancelled', 'Shipped']
courier_status_waiting_shipping = ['Unshipped']



conditions = [(online_sales_df["status"] == "Cancelled"), 
              (online_sales_df["status"] == "Shipped"),
              (online_sales_df["status"] == "Pending"),
              (online_sales_df["status"] == "Pending - Waiting for Pick Up") |  (online_sales_df["status"] == "Shipping")
              ]
choices = [np.random.choice(courier_status_cancelled, size = len(online_sales_df)), 
           np.random.choice(courier_status_shipped, size = len(online_sales_df)),
           np.random.choice(courier_status_pending, size = len(online_sales_df)),
           np.random.choice(courier_status_waiting_shipping, size = len(online_sales_df))
           ]

online_sales_df["courier_status"] = (
    np.select(
        conditions,
        choices,
        default = "Shipped"
    )
)

online_sales_df.head()

Unnamed: 0,user_id,transaction_id,date,product_id,shipping_id,status,fulfilment,ship_service_level,estimated_delivery_date,courier_status,b2b,fulfilled_by
0,17850,16679,2019-01-01,B09DL9978Y,10001,Shipped - Delivered to Buyer,,,,Shipped,,
1,17850,16680,2019-01-01,B09DL9978Y,10002,Shipped,,,,Shipped,,
2,17850,16681,2019-01-01,B07GXHC691,10003,Shipped,,,,Shipped,,
3,17850,16682,2019-01-01,B08NCKT9FG,10004,Cancelled,,,,Unshipped,,
4,17850,16682,2019-01-01,B08H21B6V7,10005,Shipped,,,,Cancelled,,


Here, we want to check if there is a relationship between the fulfilment and the ship_service_level columns. We note that if fulfilment == 'Merchant', then ship_service_level will only be 'Standard'.

In [301]:
# Check relationship between fulfilment and ship_service_level

df["ship-service-level"][df["Fulfilment"] == "Merchant"].unique()

array(['Standard'], dtype=object)

In [302]:
# Generate synthetic data based on the constraints tabulated for fulfilment, ship_service_level and b2b

online_sales_df["fulfilment"] = np.random.choice(values[1], size = len(online_sales_df), p = prob[1])
online_sales_df["ship_service_level"] = np.random.choice(values[2], size = len(online_sales_df), p = prob[2])
online_sales_df["b2b"] = np.random.choice(values[4], size = len(online_sales_df), p = prob[4])


# Handle the case when fulfilment == "Merchant"

online_sales_df.loc[online_sales_df['fulfilment'] == 'Merchant', 'ship_service_level'] = 'Standard'

online_sales_df.head()


Unnamed: 0,user_id,transaction_id,date,product_id,shipping_id,status,fulfilment,ship_service_level,estimated_delivery_date,courier_status,b2b,fulfilled_by
0,17850,16679,2019-01-01,B09DL9978Y,10001,Shipped - Delivered to Buyer,Amazon,Standard,,Shipped,False,
1,17850,16680,2019-01-01,B09DL9978Y,10002,Shipped,Amazon,Expedited,,Shipped,False,
2,17850,16681,2019-01-01,B07GXHC691,10003,Shipped,Merchant,Standard,,Shipped,False,
3,17850,16682,2019-01-01,B08NCKT9FG,10004,Cancelled,Amazon,Standard,,Unshipped,False,
4,17850,16682,2019-01-01,B08H21B6V7,10005,Shipped,Amazon,Expedited,,Cancelled,False,


The data in the `fulfilled_by` column is dependent on the value of the entry in the `fulfilment` and `b2b` column, and hence to generate synthetic data for the `fulfilled_by` column, we consider the entry in the `fulfilment` and `b2b` columns. The top 3rd party e-commerce fulfilment companies in India include: Quickshift Fulfillment, Shiprocket Fulfillment, Prozo and DHL. We include the company featured in the original dataset, Easy Ship. Prozo and DHL are companies that offer services for B2B business transactions.

In [303]:
# Generate synthetic data for fulfilled_by column

fulfilled_by = ["Easy Ship", "Quickshift Fulfillment", "Shiprocket Fulfillment"]
fulfilled_by_b2b = ["Prozo", "DHL"]

conditions = [(online_sales_df["fulfilment"] == "Merchant") & (online_sales_df["b2b"] == False), 
              (online_sales_df["fulfilment"] == "Merchant") & (online_sales_df["b2b"] == True)]
choices = [np.random.choice(fulfilled_by, size = len(online_sales_df)), np.random.choice(fulfilled_by_b2b, size = len(online_sales_df))]

online_sales_df["fulfilled_by"] = (
    np.select(
        conditions,
        choices,
        default = ""
    )
)

online_sales_df[online_sales_df["fulfilment"] == "Merchant"].head()

Unnamed: 0,user_id,transaction_id,date,product_id,shipping_id,status,fulfilment,ship_service_level,estimated_delivery_date,courier_status,b2b,fulfilled_by
2,17850,16681,2019-01-01,B07GXHC691,10003,Shipped,Merchant,Standard,,Shipped,False,Quickshift Fulfillment
7,17850,16682,2019-01-01,B09Y5FZK9N,10008,Shipped,Merchant,Standard,,Shipped,False,Shiprocket Fulfillment
9,13047,16682,2019-01-01,B08XMG618K,10010,Shipped,Merchant,Standard,,Unshipped,False,Shiprocket Fulfillment
11,13047,16682,2019-01-01,B07GXHC691,10012,Shipped,Merchant,Standard,,Shipped,False,Easy Ship
14,13047,16684,2019-01-01,B00Y4ORQ46,10015,Shipped - Delivered to Buyer,Merchant,Standard,,Shipped,False,Easy Ship


Now we wish to synthesise data for the estimated_delivery_date column. We gathered from sources online the delivery date estimations that Amazon gives to customers as parameters for our synthetic data generation.

In [304]:
# Generate data for estimated_delivery_date
online_sales_df['date'] = pd.to_datetime(online_sales_df['date'])

def generate_estimated_delivery_date(row):
    if row['fulfilment'] == 'Amazon':
        if row['ship_service_level'] == 'Expedited':
            return row['date'] + timedelta(days=np.random.randint(2, 5))  # 2 to 4 days
        elif row['ship_service_level'] == 'Standard':
            return row['date'] + timedelta(days=np.random.randint(5, 9))  # 5 to 8 days

    if row['fulfilment'] == 'Merchant':
        return row['date'] + timedelta(days=np.random.randint(5, 9))
        

online_sales_df['estimated_delivery_date'] = online_sales_df.apply(generate_estimated_delivery_date, axis=1)

online_sales_df.head()

Unnamed: 0,user_id,transaction_id,date,product_id,shipping_id,status,fulfilment,ship_service_level,estimated_delivery_date,courier_status,b2b,fulfilled_by
0,17850,16679,2019-01-01,B09DL9978Y,10001,Shipped - Delivered to Buyer,Amazon,Standard,2019-01-06,Shipped,False,
1,17850,16680,2019-01-01,B09DL9978Y,10002,Shipped,Amazon,Expedited,2019-01-03,Shipped,False,
2,17850,16681,2019-01-01,B07GXHC691,10003,Shipped,Merchant,Standard,2019-01-07,Shipped,False,Quickshift Fulfillment
3,17850,16682,2019-01-01,B08NCKT9FG,10004,Cancelled,Amazon,Standard,2019-01-06,Unshipped,False,
4,17850,16682,2019-01-01,B08H21B6V7,10005,Shipped,Amazon,Expedited,2019-01-04,Cancelled,False,


In [305]:
# Save the resulting df 

online_sales_df.to_csv('../shipping_status.csv', index = False)

## Shipping History table

In [306]:
# Load datasets
shipping_history_df = online_sales_df.drop(columns = 
                                           ['user_id',
                                            'transaction_id',
                                            'product_id',
                                            'fulfilment',
                                            'ship_service_level',
                                            'courier_status',
                                            'b2b'])

shipping_history_df.head()

Unnamed: 0,date,shipping_id,status,estimated_delivery_date,fulfilled_by
0,2019-01-01,10001,Shipped - Delivered to Buyer,2019-01-06,
1,2019-01-01,10002,Shipped,2019-01-03,
2,2019-01-01,10003,Shipped,2019-01-07,Quickshift Fulfillment
3,2019-01-01,10004,Cancelled,2019-01-06,
4,2019-01-01,10005,Shipped,2019-01-04,


In [307]:
# Create change_date which is when status was updated
shipping_history_df["change_date"] = np.nan

shipping_history_df["change_date"] = pd.to_datetime(shipping_history_df["change_date"])

shipping_history_df.head()

Unnamed: 0,date,shipping_id,status,estimated_delivery_date,fulfilled_by,change_date
0,2019-01-01,10001,Shipped - Delivered to Buyer,2019-01-06,,NaT
1,2019-01-01,10002,Shipped,2019-01-03,,NaT
2,2019-01-01,10003,Shipped,2019-01-07,Quickshift Fulfillment,NaT
3,2019-01-01,10004,Cancelled,2019-01-06,,NaT
4,2019-01-01,10005,Shipped,2019-01-04,,NaT


In [308]:
shipping_history_df.dtypes

date                       datetime64[ns]
shipping_id                         int64
status                             object
estimated_delivery_date    datetime64[ns]
fulfilled_by                       object
change_date                datetime64[ns]
dtype: object

In [309]:
# Define constraints based on 'status' and synthesise data for change_date column
conditions = [
    (shipping_history_df['status'] == 'Shipped - Delivered to Buyer') & (shipping_history_df['fulfilled_by'].isna()),
    (shipping_history_df['status'] == 'Shipped - Delivered to Buyer') & (shipping_history_df['fulfilled_by'] == 'Quickshift Fulfillment'),
    (shipping_history_df['status'] == 'Shipped - Delivered to Buyer') & (~shipping_history_df['fulfilled_by'].isna()) & (shipping_history_df['fulfilled_by'] != 'Quickshift Fulfillment'),
    (shipping_history_df['status'] == 'Shipped - Returned to Seller'),
    (shipping_history_df['status'] == 'Shipped') | (shipping_history_df['status'] == 'Cancelled'),
    (shipping_history_df['status'] == 'Shipped - Picked Up'),
    (shipping_history_df['status'] == 'Pending'),
    (shipping_history_df['status'] == 'Pending - Waiting for Pick Up'),
    (shipping_history_df['status'] == 'Shipped - Out for Delivery'),
    (shipping_history_df['status'] == 'Shipped - Returning to Seller'),
    (shipping_history_df['status'] == 'Shipping'),
    (shipping_history_df['status'] == 'Shipped - Rejected by Buyer')
]


choices = [
    shipping_history_df['estimated_delivery_date'] + pd.to_timedelta(np.random.randint(-2, 3, size=len(shipping_history_df)), unit='D'),
    shipping_history_df['estimated_delivery_date'] + pd.to_timedelta(np.random.randint(-2, 6, size=len(shipping_history_df)), unit='D'),
    shipping_history_df['estimated_delivery_date'] + pd.to_timedelta(np.random.randint(-3, 4, size=len(shipping_history_df)), unit='D'),
    shipping_history_df['estimated_delivery_date'] + pd.to_timedelta(np.random.randint(3, 9, size=len(shipping_history_df)), unit='D'),
    shipping_history_df['date'] + pd.to_timedelta(np.random.randint(1, 5, size=len(shipping_history_df)), unit='D'),
    shipping_history_df['estimated_delivery_date'] + pd.to_timedelta(np.random.randint(-2, 2, size=len(shipping_history_df)), unit='D'),
    shipping_history_df['date'] + pd.to_timedelta(np.random.randint(1, 3, size=len(shipping_history_df)), unit='D'),
    shipping_history_df['date'] + pd.to_timedelta(np.random.randint(1, 4, size=len(shipping_history_df)), unit='D'),
    shipping_history_df['estimated_delivery_date'] + pd.to_timedelta(np.random.randint(-2, 3, size=len(shipping_history_df)), unit='D'),
    shipping_history_df['estimated_delivery_date'] + pd.to_timedelta(np.random.randint(2, 5, size=len(shipping_history_df)), unit='D'),
    shipping_history_df['date'] + pd.to_timedelta(np.random.randint(2, 5, size=len(shipping_history_df)), unit='D'),
    shipping_history_df['estimated_delivery_date'] + pd.to_timedelta(np.random.randint(-3, 4, size=len(shipping_history_df)), unit='D')
]


shipping_history_df['change_date'] = np.select(conditions, choices, default = None)

shipping_history_df['change_date'] = pd.to_datetime(shipping_history_df["change_date"])

shipping_history_df.head()

Unnamed: 0,date,shipping_id,status,estimated_delivery_date,fulfilled_by,change_date
0,2019-01-01,10001,Shipped - Delivered to Buyer,2019-01-06,,2019-01-03
1,2019-01-01,10002,Shipped,2019-01-03,,2019-01-02
2,2019-01-01,10003,Shipped,2019-01-07,Quickshift Fulfillment,2019-01-04
3,2019-01-01,10004,Cancelled,2019-01-06,,2019-01-04
4,2019-01-01,10005,Shipped,2019-01-04,,2019-01-02


In [310]:
# Remove unnecessary columns with repeated information from other df

shipping_history_df.drop(columns = ['fulfilled_by',
                                    'estimated_delivery_date'], inplace = True)

shipping_history_df.head()

Unnamed: 0,date,shipping_id,status,change_date
0,2019-01-01,10001,Shipped - Delivered to Buyer,2019-01-03
1,2019-01-01,10002,Shipped,2019-01-02
2,2019-01-01,10003,Shipped,2019-01-04
3,2019-01-01,10004,Cancelled,2019-01-04
4,2019-01-01,10005,Shipped,2019-01-02


In [311]:
# Save the resulting df

shipping_history_df.to_csv('../shipping_history.csv', index = False)

## Addition of origin country for Products table

To analyse supplier performance, we create a new column in `products.csv`, origin_area.

We want to generate synthetic data for the two created columns. We will use another dataset containing information about the suppliers for products sold on Amazon to obtain the trend and constraints for origin_area, which will contain information on both the city and country of origin.

In [312]:
# Load datasets
products_df = pd.read_csv('../products.csv')
suppliers_df = pd.read_csv('../source/Amazon Supplier List.csv')

suppliers_df.head()

Unnamed: 0,SITE,ADDRESS,CITY,STATE/REGION,COUNTRY
0,3Q Vina,"8 An Duong Vuong Street, Ward 16, District 8",Hồ Chí Minh,Hồ Chí Minh City,Vietnam
1,A Mount Inc.,"No. 65, Dongshing Street, Shulin District",New Taipei City,Taipei,Taiwan
2,A. R. Industries,"Mauza Rampur Jattan, Dhakwala Moginand, Tehsil...",Kala Amb,Himachal Pradesh,India
3,AAC Technologies Holdings Inc.\n(Shenzhen),"No.1, Chengxin Road, Baolong Ind. Park, Longga...",Shenzhen,Guangdong,China
4,"Ability Opto-Electronics Technology Co., Ltd.","4F. No.31, Keya Rd., Daya Dist.",Taichung City,Taichung,Taiwan


We filter the sites to ensure we only consider cities and countries that export electronics through Amazon.

In [313]:
# Filter companies to show only tech sites

keywords = ["elec", "tech"]
pattern = '|'.join(keywords)

filtered_supplier_df = suppliers_df[suppliers_df['SITE'].str.contains(pattern, case = False, na = False)]

filtered_supplier_df.head()

Unnamed: 0,SITE,ADDRESS,CITY,STATE/REGION,COUNTRY
3,AAC Technologies Holdings Inc.\n(Shenzhen),"No.1, Chengxin Road, Baolong Ind. Park, Longga...",Shenzhen,Guangdong,China
4,"Ability Opto-Electronics Technology Co., Ltd.","4F. No.31, Keya Rd., Daya Dist.",Taichung City,Taichung,Taiwan
6,"AcBel Electronic (Dong Guan) Co., Ltd.","No.17-28, (Hong Yeh Rd.), Hong Yeh Industrial ...",Dongguan,Guangdong,China
10,"Acrox Technologies Co., Ltd","No. 2 Xinmin Road, Xinmin Village, Changan Town",Dongguan,Guangdong,China
30,Amperex Technology Limited,"1 West Industrial Road, North Zone of SongShan...",Dongguan,Guangdong,China


In [314]:
# Check the length of the filtered df

len(filtered_supplier_df)

188

In [315]:
# Join the city and country columns together to form area

filtered_supplier_df['AREA'] = filtered_supplier_df['CITY'].astype(str) + ', ' + filtered_supplier_df['COUNTRY'].astype(str)

filtered_supplier_df.head()

Unnamed: 0,SITE,ADDRESS,CITY,STATE/REGION,COUNTRY,AREA
3,AAC Technologies Holdings Inc.\n(Shenzhen),"No.1, Chengxin Road, Baolong Ind. Park, Longga...",Shenzhen,Guangdong,China,"Shenzhen, China"
4,"Ability Opto-Electronics Technology Co., Ltd.","4F. No.31, Keya Rd., Daya Dist.",Taichung City,Taichung,Taiwan,"Taichung City, Taiwan"
6,"AcBel Electronic (Dong Guan) Co., Ltd.","No.17-28, (Hong Yeh Rd.), Hong Yeh Industrial ...",Dongguan,Guangdong,China,"Dongguan, China"
10,"Acrox Technologies Co., Ltd","No. 2 Xinmin Road, Xinmin Village, Changan Town",Dongguan,Guangdong,China,"Dongguan, China"
30,Amperex Technology Limited,"1 West Industrial Road, North Zone of SongShan...",Dongguan,Guangdong,China,"Dongguan, China"


In [316]:
# Generate synthetic data in online_sales_df based on the counts and constraints of the suppliers df

area_values = filtered_supplier_df['AREA'].value_counts(normalize=True).index.tolist()
area_probs = filtered_supplier_df['AREA'].value_counts(normalize=True).values

products_df["origin_area"] = np.random.choice(area_values, size = len(products_df), p =area_probs)

products_df.head()

Unnamed: 0,product_id,product_name,about_product,category,actual_price,discounted_price,discount_percentage,origin_area
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,High Compatibility : Compatible With iPhone 12...,Computers&Accessories|Accessories&Peripherals|...,13.19,4.79,0.64,"Suzhou, China"
1,B098NS6PVG,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,"Compatible with all Type C enabled devices, be...",Computers&Accessories|Accessories&Peripherals|...,4.19,2.39,0.43,"Nomi-shi, Japan"
2,B096MSW6CT,Sounce Fast Phone Charging Cable & Data Sync U...,【 Fast Charger& Data Sync】-With built-in safet...,Computers&Accessories|Accessories&Peripherals|...,22.79,2.39,0.9,"Shenzhen, China"
3,B08HDJ86NZ,boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...,The boAt Deuce USB 300 2 in 1 cable is compati...,Computers&Accessories|Accessories&Peripherals|...,8.39,3.95,0.53,"Hangzhou, China"
4,B08CF3B7N1,Portronics Konnect L 1.2M Fast Charging 3A 8 P...,[CHARGE & SYNC FUNCTION]- This cable comes wit...,Computers&Accessories|Accessories&Peripherals|...,4.79,1.85,0.61,"Shenzhen, China"


In [317]:
# Save the resulting df

products_df.to_csv('../products.csv', index = False)