In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Load the raw transaction export, decoding it as ISO-8859-1, and preview the first rows.
raw_csv_path = "data/raw/data.csv"
df = pd.read_csv(raw_csv_path, encoding="ISO-8859-1")
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [14]:


# Report the time span covered by the data.
# The dates are parsed into a temporary Series instead of being written back to
# `df`, so this check leaves the frame's dtypes untouched and gives the same
# result no matter when (or how often) the cell is run. The lasting conversion
# of InvoiceDate is done in the data-cleaning step.
invoice_dates = pd.to_datetime(df['InvoiceDate'])

print("📅 Min date:", invoice_dates.min())
print("📅 Max date:", invoice_dates.max())

📅 Min date: 2010-12-01 08:26:00
📅 Max date: 2011-12-09 12:50:00


In [3]:
# Frame dimensions as a (rows, columns) tuple.
df.shape

(541909, 8)

In [4]:
# Per-column dtype and non-null count, plus the frame's approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


# Summary Statistics

In [5]:
# Summary statistics for every column: include='all' adds count/unique/top/freq
# for the non-numeric columns alongside the numeric mean/std/quantiles.
df.describe(include='all')

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
count,541909.0,541909,540455,541909.0,541909,541909.0,406829.0,541909
unique,25900.0,4070,4223,,23260,,,38
top,573585.0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,,10/31/2011 14:41,,,United Kingdom
freq,1114.0,2313,2369,,1114,,,495478
mean,,,,9.55225,,4.611114,15287.69057,
std,,,,218.081158,,96.759853,1713.600303,
min,,,,-80995.0,,-11062.06,12346.0,
25%,,,,1.0,,1.25,13953.0,
50%,,,,3.0,,2.08,15152.0,
75%,,,,10.0,,4.13,16791.0,


# Check Missing Values

In [6]:
# Count the missing entries in each column.
df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

# Check for Duplicates

In [7]:
# Number of rows that exactly repeat an earlier row (all columns equal).
df.duplicated().sum()

np.int64(5268)

In [8]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

# Check Unique Values (Quick Sanity Check)

In [9]:
# Number of distinct non-null values in each column.
df.nunique()

InvoiceNo      25900
StockCode       4070
Description     4223
Quantity         722
InvoiceDate    23260
UnitPrice       1630
CustomerID      4372
Country           38
dtype: int64

# Data Cleaning Steps

In [10]:
# Clean the transactions and persist the result: drop cancellations, rows
# without a customer and non-positive quantities/prices, fix dtypes, add the
# TotalPrice and InvoiceMonth columns, then write the frame to CSV.

# ---- Step 1: Remove canceled transactions ----
# Cancelled invoices are identified by an InvoiceNo that starts with 'C'.
df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]

# ---- Step 2: Drop missing Customer IDs ----
df = df.dropna(subset=['CustomerID'])

# ---- Step 3: Remove negative or zero quantities and prices ----
# .copy() makes the filtered result an independent frame, so the column
# assignments below never write into a slice of another frame (the pattern
# that can raise SettingWithCopyWarning when copy-on-write is not enabled).
df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)].copy()

# ---- Step 4: Convert data types ----
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
# Safe to cast to int only because rows with a missing CustomerID were dropped in Step 2.
df['CustomerID'] = df['CustomerID'].astype(int)

# ---- Step 5: Handle missing descriptions (optional) ----
df['Description'] = df['Description'].fillna('Unknown Product')

# ---- Step 6: Add TotalPrice column ----
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

# ---- Step 7: Create Month column (for time-based grouping) ----
df['InvoiceMonth'] = df['InvoiceDate'].dt.to_period('M')

# ---- Step 8: Remove duplicates ----
df = df.drop_duplicates()

# ---- Step 9: Reset index ----
df = df.reset_index(drop=True)

# ---- Step 10: Save cleaned data ----
# Create the output directory first: to_csv raises OSError if it does not exist.
cleaned_csv_path = Path("data/cleaned/ecommerce_cleaned.csv")
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(cleaned_csv_path, index=False)

In [11]:
# Sanity-check the cleaned frame. A notebook cell only renders its last bare
# expression, so the null-count table is passed to display() explicitly;
# as a bare middle expression its result would be computed and then discarded.
df.info()
display(df.isnull().sum())
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392692 entries, 0 to 392691
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   InvoiceNo     392692 non-null  object        
 1   StockCode     392692 non-null  object        
 2   Description   392692 non-null  object        
 3   Quantity      392692 non-null  int64         
 4   InvoiceDate   392692 non-null  datetime64[ns]
 5   UnitPrice     392692 non-null  float64       
 6   CustomerID    392692 non-null  int64         
 7   Country       392692 non-null  object        
 8   TotalPrice    392692 non-null  float64       
 9   InvoiceMonth  392692 non-null  period[M]     
dtypes: datetime64[ns](1), float64(2), int64(2), object(4), period[M](1)
memory usage: 30.0+ MB


Unnamed: 0,Quantity,InvoiceDate,UnitPrice,CustomerID,TotalPrice
count,392692.0,392692,392692.0,392692.0,392692.0
mean,13.119702,2011-07-10 19:13:07.771892480,3.125914,15287.843865,22.6315
min,1.0,2010-12-01 08:26:00,0.001,12346.0,0.001
25%,2.0,2011-04-07 11:12:00,1.25,13955.0,4.95
50%,6.0,2011-07-31 12:02:00,1.95,15150.0,12.45
75%,12.0,2011-10-20 12:53:00,3.75,16791.0,19.8
max,80995.0,2011-12-09 12:50:00,8142.75,18287.0,168469.6
std,180.492832,,22.241836,1713.539549,311.099224


## Save cleaned data to CSV

In [12]:
# Save cleaned data to CSV
cleaned_output_path = "data/cleaned/ecommerce_cleaned.csv"
df.to_csv(cleaned_output_path, index=False)

# The message is built from the same path variable so it cannot drift from the file actually written.
print(f"✅ Cleaned dataset saved successfully to {cleaned_output_path}")

✅ Cleaned dataset saved successfully to data/cleaned/ecommerce_cleaned.csv


In [13]:
# Read the saved file back from disk and preview its first rows to check the export.
pd.read_csv("data/cleaned/ecommerce_cleaned.csv").head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice,InvoiceMonth
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3,2010-12
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010-12
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0,2010-12
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010-12
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010-12
