In [5]:
import pandas as pd
import os
import warnings

pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings("ignore", message="Workbook contains no default style, apply openpyxl's default")

shop = 'Nestle'

# Order data cannot be extracted from Lazada Seller Center in one go, so the export
# is split across multiple Excel files. All files from the same shop are stored in one folder.
folder = f'Lazada Seller Center/{shop}'

# os.listdir() returns every directory entry in arbitrary order. Keep only Excel
# workbooks (skipping "~$..." lock files and hidden files such as .DS_Store, which
# would make pd.read_excel fail) and sort them so the load order - and therefore
# which duplicate row is kept later - is the same on every run.
excel_files = sorted(
    name for name in os.listdir(folder)
    if name.lower().endswith(('.xlsx', '.xlsm', '.xls')) and not name.startswith(('~$', '.'))
)
if not excel_files:
    # Fail here with a clear message instead of later with an opaque concat error.
    raise FileNotFoundError(f"No Excel files found in '{folder}'")

# Load every workbook, keeping only the "orderNumber", "status" and "createTime" columns
dfs = {}
for file in excel_files:
    file_path = os.path.join(folder, file)
    df = pd.read_excel(file_path, usecols=["orderNumber", "status", "createTime"])
    dfs[file] = df

# Preview: one DataFrame per Excel file, keyed by file name
for file, df in dfs.items():
    print(file)
    print("Number of rows:", len(df))
    print(df.head(),'\n')

Lazada Nestle (April 5 - April 6).xlsx
Number of rows: 35452
          createTime      orderNumber     status
0  06 Apr 2024 23:59  815632621032626  delivered
1  06 Apr 2024 23:59  815632621032626  delivered
2  06 Apr 2024 23:59  815632621032626  delivered
3  06 Apr 2024 23:58  815615071012948  delivered
4  06 Apr 2024 23:58  821590341526793  confirmed 

Lazada Nestle (March 16- March 21).xlsx
Number of rows: 92556
          createTime      orderNumber    status
0  21 Mar 2024 23:59  813018543764322  canceled
1  21 Mar 2024 23:59  813018543764322  canceled
2  21 Mar 2024 23:59  813018543764322  canceled
3  21 Mar 2024 23:59  813018543764322  canceled
4  21 Mar 2024 23:59  813018543764322  canceled 

Lazada Nestle (March 22 - March 24).xlsx
Number of rows: 71663
          createTime      orderNumber            status
0  24 Mar 2024 23:59  814571149431252  Package Returned
1  24 Mar 2024 23:59  814571149431252  Package Returned
2  24 Mar 2024 23:59  814571149431252          canceled
3  2

In [6]:
# Merge all per-file DataFrames into one.
# ignore_index=True rebuilds a fresh 0..n-1 index, so a `keys=` argument would be
# discarded anyway; it is omitted to avoid implying that file provenance is kept.
sc_raw = pd.concat(list(dfs.values()), ignore_index=True)
count_sc_raw = len(sc_raw)

# Keep one row per order (the first occurrence in load order).
# NOTE(review): an order can span several rows with different statuses, so the
# status kept here is whichever row happens to come first - confirm this is intended.
sc_unique = sc_raw.drop_duplicates(subset=['orderNumber'])
count_sc_unique = len(sc_unique)

print(f"SC Lazada {shop}")
print("Number of rows:", count_sc_raw)
print("Number of unique rows:", count_sc_unique)

SC Lazada Nestle
Number of rows: 461449
Number of unique rows: 137834


In [7]:
# Import Snowflake data into a DataFrame
snowflake_raw = pd.read_csv(f'Snowflake/Snowflake Lazada {shop}.csv', usecols=['PLFM_ORD_ID','ORD_STAT_END', 'ORD_STAT_PLFM', 'ORDER_DATE'])
count_snowflake_raw = len(snowflake_raw)

# Determine date range of the dataset.
# Parse ORDER_DATE before taking min/max so the comparison is chronological rather
# than lexicographic on raw strings, and normalize to midnight so a time-of-day
# component cannot make the last generated day fall short of the true end date.
order_days = pd.to_datetime(snowflake_raw['ORDER_DATE']).dt.normalize()
date_range = pd.date_range(start=order_days.min(), end=order_days.max())
sf_start_date = date_range[0].date()
sf_end_date = date_range[-1].date()

# Remove duplicates: keep the first row for each platform order id
snowflake_unique = snowflake_raw.drop_duplicates(subset=['PLFM_ORD_ID'], ignore_index=True)
count_snowflake_unique = len(snowflake_unique)

# Get non-end status rows
end_status = ['delivered','cancelled','returned']

# The ~ negates the condition snowflake_unique['ORD_STAT_END'].isin(end_status),
# i.e. keep the rows whose ORD_STAT_END is NOT one of the end statuses.
# Rows with a missing ORD_STAT_END are also kept, because isin() is False for NaN.
snowflake_non_end_status = snowflake_unique[~snowflake_unique['ORD_STAT_END'].isin(end_status)]
count_snowflake_non_end_status = len(snowflake_non_end_status)

print(f"Snowflake Lazada {shop} ({sf_start_date} - {sf_end_date})")
print("Number of rows:", count_snowflake_raw)
print("Number of unique rows:", count_snowflake_unique)
print("Number of non-end status rows:", count_snowflake_non_end_status)

Snowflake Lazada Nestle (2024-03-16 - 2024-04-06)
Number of rows: 529104
Number of unique rows: 152828
Number of non-end status rows: 17303


In [8]:
# Join Seller Center orders to the Snowflake orders that are not in an end status.
# The inner join keeps only order ids present on both sides.
merged_non_end_status = sc_unique.merge(
    snowflake_non_end_status,
    how="inner",
    left_on="orderNumber",
    right_on="PLFM_ORD_ID",
)
count_merged_non_end_status = len(merged_non_end_status)

# Denominator for the percentage: every unique Snowflake order, not only the joined ones.
# NOTE(review): confirm this is the intended base rather than count_merged_non_end_status.
total = count_snowflake_unique

# Rows where the Seller Center status differs from the Snowflake platform status.
# The comparison is an exact string match (case- and spelling-sensitive).
status_differs = merged_non_end_status["status"].ne(merged_non_end_status["ORD_STAT_PLFM"])
mismatched_status = merged_non_end_status.loc[status_differs]
count_mismatched_status = len(mismatched_status)

# Share of mismatched orders, expressed as a percentage of `total`
percent_mismatched_status = (count_mismatched_status / total) * 100

print(f"Lazada {shop} SC vs Snowflake ({sf_start_date} - {sf_end_date})")
print(mismatched_status.head(10), '\n')
print(f"Number of rows: {total}")
print(f"Number of mismatched non-end status: {count_mismatched_status}")
print(f"Percent mismatched non-end status: {round(percent_mismatched_status, 2)} %")


Lazada Nestle SC vs Snowflake (2024-03-16 - 2024-04-06)
           createTime      orderNumber     status      PLFM_ORD_ID ORD_STAT_END ORD_STAT_PLFM  ORDER_DATE
59  05 Apr 2024 17:26  820935115096212  delivered  820935115096212      shipped       shipped  2024-04-05
61  05 Apr 2024 17:04  820933101652844  confirmed  820933101652844      shipped       shipped  2024-04-05
63  05 Apr 2024 16:31  820898547511615  confirmed  820898547511615      shipped       shipped  2024-04-05
64  05 Apr 2024 16:29  814927058216731  delivered  814927058216731      shipped       shipped  2024-04-05
81  05 Apr 2024 02:28  820597118848373  delivered  820597118848373      shipped       shipped  2024-04-05
82  05 Apr 2024 02:03  814613098890262  delivered  814613098890262      shipped       shipped  2024-04-05
83  05 Apr 2024 00:55  814632823164358  delivered  814632823164358      shipped       shipped  2024-04-05
84  05 Apr 2024 00:54  814621055412130  confirmed  814621055412130      shipped       shipped  2