In [4]:
# 01_data_cleaning.ipynb

# 📦 Import Libraries
import os

import numpy as np
import pandas as pd

# 📂 Load Dataset
# The default location is an absolute path on one specific machine. Set the
# RAW_DATA_PATH environment variable to load the CSV from somewhere else
# without editing the notebook; when it is unset the original path is used.
file_path = os.environ.get(
    "RAW_DATA_PATH",
    "C:/Users/Vaish/OneDrive/Desktop/internship/Ultimez_ConsumerInsightsDashboard/data/raw_data.csv",
)
# Decoded as ISO-8859-1 (Latin-1) — presumably because the file is not valid
# UTF-8; TODO confirm against the raw export.
data = pd.read_csv(file_path, encoding="ISO-8859-1")
print("✅ Raw data loaded successfully!")

# 👀 Preview Data
# Only the LAST expression of a cell is rendered automatically, so a bare
# `data.head()` here would be silently discarded. `display` is injected into
# the namespace by the IPython/Jupyter kernel and renders the rich table.
display(data.head())

# 🔎 Check Data Info
# Prints column names, non-null counts and dtypes directly to stdout.
data.info()

# 🧹 Step 1: Drop rows with missing CustomerID
data = data.dropna(subset=['CustomerID'])
print("✅ Dropped rows with missing CustomerID")

# 🧹 Step 2: Remove negative Quantity (Returns)
# The condition keeps strictly positive quantities, so zero-quantity rows are
# removed as well. The explicit .copy() makes the filtered frame independent
# of its parent, so the column assignment in Step 3 is unambiguous and does
# not raise pandas' SettingWithCopyWarning.
data = data.loc[data['Quantity'] > 0].copy()
print("✅ Removed negative quantities (returns)")

# ➕ Step 3: Add TotalPrice column
# Line total per row: quantity multiplied by unit price.
data['TotalPrice'] = data['Quantity'] * data['UnitPrice']
print("✅ Added TotalPrice column")

# 🗓️ Step 4: Convert InvoiceDate to datetime
# No explicit format is given, so pandas infers it from the strings.
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
print("✅ Converted InvoiceDate to datetime format")

# 🔁 Step 5: Drop duplicates
# Rows are compared across all columns (including the new TotalPrice); the
# first occurrence of each duplicate group is kept.
data = data.drop_duplicates()
print("✅ Dropped duplicate rows")

# 💾 Save Cleaned Data
# The cleaned frame is written to two locations, in this order:
#   1. ../data/cleaned_data.csv  (folder is created if it doesn't exist)
#   2. cleaned_data.csv in the current working directory
# NOTE(review): the working-directory copy looks like a leftover from an
# earlier version of this cell — confirm whether anything still reads it
# before removing it.
output_folder = "../data"
os.makedirs(output_folder, exist_ok=True)

# After the loop `cleaned_file_path` holds the last destination
# ("cleaned_data.csv"), matching the value it had before this refactor.
for cleaned_file_path in (
    os.path.join(output_folder, "cleaned_data.csv"),
    "cleaned_data.csv",
):
    # index=False: the row index is not written as an extra column.
    data.to_csv(cleaned_file_path, index=False)
    print(f"✅ Cleaned data saved to {cleaned_file_path}")

# 📊 Preview Cleaned Data
# Last expression of the cell, so the notebook renders it automatically.
data.head()


✅ Raw data loaded successfully!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB
✅ Dropped rows with missing CustomerID
✅ Removed negative quantities (returns)
✅ Added TotalPrice column
✅ Converted InvoiceDate to datetime format
✅ Dropped duplicate rows
✅ Cleaned data saved to ../data\cleaned_data.csv
✅ Cleaned data saved to cleaned_data.csv


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
