## Import Packages

In [5]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
import numpy as np

## Load Data

In [6]:
# Name of the CSV file inside the Kaggle dataset
file_path = "amazon_sales_data 2025.csv"

# Kaggle dataset identifier; no version is pinned, so the latest one is loaded
dataset_handle = "zahidmughal2343/amazon-sales-2025"

# Read the file through the pandas adapter
df = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_handle, file_path)

## Data Preprocessing

In [7]:
# Preview the freshly loaded frame (the notebook truncates the display for long frames)
df

Unnamed: 0,Order ID,Date,Product,Category,Price,Quantity,Total Sales,Customer Name,Customer Location,Payment Method,Status
0,ORD0001,14-03-25,Running Shoes,Footwear,60,3,180,Emma Clark,New York,Debit Card,Cancelled
1,ORD0002,20-03-25,Headphones,Electronics,100,4,400,Emily Johnson,San Francisco,Debit Card,Pending
2,ORD0003,15-02-25,Running Shoes,Footwear,60,2,120,John Doe,Denver,Amazon Pay,Cancelled
3,ORD0004,19-02-25,Running Shoes,Footwear,60,3,180,Olivia Wilson,Dallas,Credit Card,Pending
4,ORD0005,10-03-25,Smartwatch,Electronics,150,3,450,Emma Clark,New York,Debit Card,Pending
...,...,...,...,...,...,...,...,...,...,...,...
245,ORD0246,17-03-25,T-Shirt,Clothing,20,2,40,Daniel Harris,Miami,Debit Card,Cancelled
246,ORD0247,30-03-25,Jeans,Clothing,40,1,40,Sophia Miller,Dallas,Debit Card,Cancelled
247,ORD0248,05-03-25,T-Shirt,Clothing,20,2,40,Chris White,Denver,Debit Card,Cancelled
248,ORD0249,08-03-25,Smartwatch,Electronics,150,3,450,Emily Johnson,New York,Debit Card,Cancelled


In [8]:
# "Total Sales" is not required: it can be recomputed from Price and Quantity.
# "Customer Name" and "Customer Location" are dropped to prevent confusion: without a
# customer ID we cannot tell whether the same name refers to the same person.
# errors="ignore" keeps this cell re-runnable: once the columns are gone, running it
# again is a no-op instead of raising KeyError.
df = df.drop(
    columns=["Total Sales", "Customer Name", "Customer Location"],
    errors="ignore",
)
df

Unnamed: 0,Order ID,Date,Product,Category,Price,Quantity,Payment Method,Status
0,ORD0001,14-03-25,Running Shoes,Footwear,60,3,Debit Card,Cancelled
1,ORD0002,20-03-25,Headphones,Electronics,100,4,Debit Card,Pending
2,ORD0003,15-02-25,Running Shoes,Footwear,60,2,Amazon Pay,Cancelled
3,ORD0004,19-02-25,Running Shoes,Footwear,60,3,Credit Card,Pending
4,ORD0005,10-03-25,Smartwatch,Electronics,150,3,Debit Card,Pending
...,...,...,...,...,...,...,...,...
245,ORD0246,17-03-25,T-Shirt,Clothing,20,2,Debit Card,Cancelled
246,ORD0247,30-03-25,Jeans,Clothing,40,1,Debit Card,Cancelled
247,ORD0248,05-03-25,T-Shirt,Clothing,20,2,Debit Card,Cancelled
248,ORD0249,08-03-25,Smartwatch,Electronics,150,3,Debit Card,Cancelled


In [9]:
# Inspect the column dtypes before any type conversion
print(df.dtypes)

Order ID          object
Date              object
Product           object
Category          object
Price              int64
Quantity           int64
Payment Method    object
Status            object
dtype: object


In [10]:
def print_unique_col_values(df):
    """Print the distinct values of every text column in ``df``.

    A column counts as text when its dtype is ``object`` or a pandas string dtype.
    Comparing the dtype to ``'object'`` alone would silently skip columns stored with
    the dedicated string dtype (which newer pandas versions may infer for text).

    Args:
        df: DataFrame to inspect. It is not modified.

    Returns:
        None. One ``<column>: <unique values>`` entry is printed per text column.
    """
    for column, values in df.items():
        is_text = (
            pd.api.types.is_object_dtype(values)
            or pd.api.types.is_string_dtype(values)
        )
        if is_text:
            print(f'{column}: {values.unique()}')

In [11]:
# List the distinct values of each text column of the frame
print_unique_col_values(df)

Order ID: ['ORD0001' 'ORD0002' 'ORD0003' 'ORD0004' 'ORD0005' 'ORD0006' 'ORD0007'
 'ORD0008' 'ORD0009' 'ORD0010' 'ORD0011' 'ORD0012' 'ORD0013' 'ORD0014'
 'ORD0015' 'ORD0016' 'ORD0017' 'ORD0018' 'ORD0019' 'ORD0020' 'ORD0021'
 'ORD0022' 'ORD0023' 'ORD0024' 'ORD0025' 'ORD0026' 'ORD0027' 'ORD0028'
 'ORD0029' 'ORD0030' 'ORD0031' 'ORD0032' 'ORD0033' 'ORD0034' 'ORD0035'
 'ORD0036' 'ORD0037' 'ORD0038' 'ORD0039' 'ORD0040' 'ORD0041' 'ORD0042'
 'ORD0043' 'ORD0044' 'ORD0045' 'ORD0046' 'ORD0047' 'ORD0048' 'ORD0049'
 'ORD0050' 'ORD0051' 'ORD0052' 'ORD0053' 'ORD0054' 'ORD0055' 'ORD0056'
 'ORD0057' 'ORD0058' 'ORD0059' 'ORD0060' 'ORD0061' 'ORD0062' 'ORD0063'
 'ORD0064' 'ORD0065' 'ORD0066' 'ORD0067' 'ORD0068' 'ORD0069' 'ORD0070'
 'ORD0071' 'ORD0072' 'ORD0073' 'ORD0074' 'ORD0075' 'ORD0076' 'ORD0077'
 'ORD0078' 'ORD0079' 'ORD0080' 'ORD0081' 'ORD0082' 'ORD0083' 'ORD0084'
 'ORD0085' 'ORD0086' 'ORD0087' 'ORD0088' 'ORD0089' 'ORD0090' 'ORD0091'
 'ORD0092' 'ORD0093' 'ORD0094' 'ORD0095' 'ORD0096' 'ORD0097' 'ORD00

In [12]:
# Dates arrive as day-month-year strings such as "14-03-25".
# %y matches a 2-digit year; %Y would expect a 4-digit year.
date_format = "%d-%m-%y"
df["Date"] = pd.to_datetime(df["Date"], format=date_format)

In [13]:
# Re-check the dtypes to confirm the Date column was converted
print(df.dtypes)

Order ID                  object
Date              datetime64[ns]
Product                   object
Category                  object
Price                      int64
Quantity                   int64
Payment Method            object
Status                    object
dtype: object


In [14]:
# Show the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,Order ID,Date,Product,Category,Price,Quantity,Payment Method,Status
0,ORD0001,2025-03-14,Running Shoes,Footwear,60,3,Debit Card,Cancelled
1,ORD0002,2025-03-20,Headphones,Electronics,100,4,Debit Card,Pending
2,ORD0003,2025-02-15,Running Shoes,Footwear,60,2,Amazon Pay,Cancelled
3,ORD0004,2025-02-19,Running Shoes,Footwear,60,3,Credit Card,Pending
4,ORD0005,2025-03-10,Smartwatch,Electronics,150,3,Debit Card,Pending


In [15]:
# Count the missing values in each column
df.isna().sum()

Order ID          0
Date              0
Product           0
Category          0
Price             0
Quantity          0
Payment Method    0
Status            0
dtype: int64

## Save File

In [16]:
# Write the cleaned frame to disk; index=False leaves the DataFrame index out of the file
output_path = "amazon_sales_data_cleaned.csv"
df.to_csv(output_path, index=False)