In [37]:
import pandas as pd

In [10]:
# Load the raw export into a DataFrame, decoding the file as Latin-1
# instead of relying on the default encoding.
df = pd.read_csv(
    filepath_or_buffer="raw_data.csv",
    encoding="latin1",
)

In [39]:
# Show the dimensions of the frame as loaded, followed by its first rows.
# `sep="\n"` puts the heading and the preview on separate lines, so the
# printed text is the same as two consecutive print calls.
print("Original Data Shape:", df.shape)
print("\nBefore Cleaning:", df.head(), sep="\n")

Original Data Shape: (365802, 8)

Before Cleaning:
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  
0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  
1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  
3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  


In [30]:
# Mark every row that exactly repeats an earlier row (the first occurrence
# is not marked), report how many there are, then keep only unmarked rows.
is_repeat = df.duplicated()
print("\nDuplicates:", is_repeat.sum())
df = df[~is_repeat]
print("After Removing Duplicates:", df.shape)


Duplicates: 5268
After Removing Duplicates: (536641, 8)


In [31]:
# Report the number of missing values per column before anything is changed.
print("\nMissing Values:")
print(df.isnull().sum())

# Fill missing unit prices with the column mean. The mean is computed on the
# frame as it stands here, i.e. before any rows are dropped below.
# `assign` returns a new frame instead of writing a column into the result of
# an earlier filtering step, which can trigger pandas' SettingWithCopyWarning.
df = df.assign(UnitPrice=df['UnitPrice'].fillna(df['UnitPrice'].mean()))

# Drop rows that lack a CustomerID or a Description. Description is listed
# explicitly so the zero-null report below does not depend on every row with
# a missing Description also happening to have a missing CustomerID.
df = df.dropna(subset=['CustomerID', 'Description'])

# Re-check: every column is expected to report zero missing values now.
print("\nAfter Handling Missing Values:")
print(df.isnull().sum())


Missing Values:
InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135037
Country             0
dtype: int64

After Handling Missing Values:
InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64


In [32]:
# Parse invoice timestamps. errors='coerce' converts values that cannot be
# parsed into NaT instead of raising, so count the values that were present
# before parsing but are null afterwards and report them; otherwise they
# would reach the saved file unnoticed, since this cell runs after the
# missing-value check.
invoice_dates = pd.to_datetime(df['InvoiceDate'], errors='coerce')
unparseable_dates = int((invoice_dates.isna() & df['InvoiceDate'].notna()).sum())
if unparseable_dates:
    print("Unparseable InvoiceDate values coerced to NaT:", unparseable_dates)

# Build a new frame with the parsed dates and upper-cased descriptions.
# `assign` avoids writing columns into the result of an earlier row filter,
# which can trigger pandas' SettingWithCopyWarning.
df = df.assign(
    InvoiceDate=invoice_dates,
    Description=df['Description'].str.upper(),
)

In [33]:
# Keep only rows whose UnitPrice lies within 1.5 * IQR of the quartiles.
unit_price = df['UnitPrice']
q1, q3 = unit_price.quantile(0.25), unit_price.quantile(0.75)
iqr = q3 - q1

# Lower and upper fences; both ends are inclusive in the filter below.
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
df = df[unit_price.between(lower_fence, upper_fence)]

In [34]:
# Show the first rows of the cleaned frame, then its final dimensions.
# `sep="\n"` keeps the heading and the preview on separate lines, matching
# the text produced by two consecutive print calls.
print("\nAfter Cleaning:", df.head(), sep="\n")
print("Final Data Shape:", df.shape)


After Cleaning:
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  
0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  
1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  
3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
Final Data Shape: (365802, 8)


In [35]:
# Write the cleaned frame to disk; index=False leaves the row index out of
# the file.
df.to_csv(
    path_or_buf='cleaned_data.csv',
    index=False,
)