In [37]:
import pandas as pd
import os

#-------------------------
# 1 Load raw dataset
#-------------------------
raw_csv_path = "data/train.csv"
df = pd.read_csv(raw_csv_path)

#-------------------------
# 2 Handle missing values
#-------------------------
# Discard every row whose postal code is missing
required_columns = ['Postal Code']
df = df.dropna(subset=required_columns)

#-------------------------
# 3 Clean column names
#-------------------------
# Trim surrounding whitespace and lower-case every header ...
trimmed_lower = df.columns.str.strip().str.lower()
# ... then turn each space or hyphen into an underscore (snake_case names)
df.columns = trimmed_lower.str.replace(r'[ -]', '_', regex=True)
#-------------------------
# 4 Convert date columns to datetime
#-------------------------
# Both columns are parsed with the same explicit day/month/year format
date_format = '%d/%m/%Y'
for date_column in ('order_date', 'ship_date'):
    df[date_column] = pd.to_datetime(df[date_column], format=date_format)

#-------------------------
# 5 Create KPI/ Feature columns
#-------------------------
# Calendar parts are taken from the order date; shipping_days is the whole
# number of days between the order date and the ship date.
df = df.assign(
    year=df['order_date'].dt.year,
    month=df['order_date'].dt.month,
    month_name=df['order_date'].dt.month_name(),
    shipping_days=(df['ship_date'] - df['order_date']).dt.days,
)

#-------------------------
# 6 Optional: remove duplicates
#-------------------------
# row_id looks like a per-row running number (1, 2, 3, ... in the printed
# head()), so including it in the comparison makes every row unique and
# drop_duplicates() would never remove anything. Compare on all the other
# columns instead and keep the first occurrence of each duplicate group.
# NOTE(review): confirm row_id carries no business meaning before relying on this.
dedupe_columns = [col for col in df.columns if col != 'row_id']
# Fall back to comparing every column if row_id is the only column present
df = df.drop_duplicates(subset=dedupe_columns or None)

#-------------------------
# 7 Save cleaned dataset
#-------------------------
# Make sure the 'data' folder exists. exist_ok=True makes this a no-op when
# the folder is already there and avoids the check-then-create race of
# testing os.path.exists() first.
os.makedirs("data", exist_ok=True)

# Save to CSV without writing the DataFrame index as an extra column
df.to_csv("data/clean_sales_data.csv", index=False)

#-------------------------
# 8 Check your cleaned dataset
#-------------------------
print("Cleaned dataset saved successfully!")
print(f"Shape: {df.shape}")
# Preview rows, per-column missing-value counts, then the final column names
for summary in (df.head(), df.isna().sum(), df.columns):
    print(summary)

Cleaned dataset saved successfully!
Shape: (9789, 22)
   row_id        order_id order_date  ship_date       ship_mode customer_id  \
0       1  CA-2017-152156 2017-11-08 2017-11-11    Second Class    CG-12520   
1       2  CA-2017-152156 2017-11-08 2017-11-11    Second Class    CG-12520   
2       3  CA-2017-138688 2017-06-12 2017-06-16    Second Class    DV-13045   
3       4  US-2016-108966 2016-10-11 2016-10-18  Standard Class    SO-20335   
4       5  US-2016-108966 2016-10-11 2016-10-18  Standard Class    SO-20335   

     customer_name    segment        country             city  ... region  \
0      Claire Gute   Consumer  United States        Henderson  ...  South   
1      Claire Gute   Consumer  United States        Henderson  ...  South   
2  Darrin Van Huff  Corporate  United States      Los Angeles  ...   West   
3   Sean O'Donnell   Consumer  United States  Fort Lauderdale  ...  South   
4   Sean O'Donnell   Consumer  United States  Fort Lauderdale  ...  South   

        