In [None]:
import pandas as pd
import numpy as np

# Load the raw order records from disk, then show the column labels.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.columns

Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State',
       'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category',
       'Product Name', 'Sales'],
      dtype='object')

In [None]:
# Preview the first five raw rows.
df.head(n=5)


Row ID	Order ID	Order Date	Ship Date	Ship Mode	Customer ID	Customer Name	Segment	Country	City	State	Postal Code	Region	Product ID	Category	Sub-Category	Product Name	Sales
0	1	CA-2017-152156	08/11/2017	11/11/2017	Second Class	CG-12520	Claire Gute	Consumer	United States	Henderson	Kentucky	42420.0	South	FUR-BO-10001798	Furniture	Bookcases	Bush Somerset Collection Bookcase	261.9600
1	2	CA-2017-152156	08/11/2017	11/11/2017	Second Class	CG-12520	Claire Gute	Consumer	United States	Henderson	Kentucky	42420.0	South	FUR-CH-10000454	Furniture	Chairs	Hon Deluxe Fabric Upholstered Stacking Chairs,...	731.9400
2	3	CA-2017-138688	12/06/2017	16/06/2017	Second Class	DV-13045	Darrin Van Huff	Corporate	United States	Los Angeles	California	90036.0	West	OFF-LA-10000240	Office Supplies	Labels	Self-Adhesive Address Labels for Typewriters b...	14.6200
3	4	US-2016-108966	11/10/2016	18/10/2016	Standard Class	SO-20335	Sean O'Donnell	Consumer	United States	Fort Lauderdale	Florida	33311.0	South	FUR-TA-10000577	Furniture	Tables	Bretford CR4500 Series Slim Rectangular Table	957.5775
4	5	US-2016-108966	11/10/2016	18/10/2016	Standard Class	SO-20335	Sean O'Donnell	Consumer	United States	Fort Lauderdale	Florida	33311.0	South	OFF-ST-10000760	Office Supplies	Storage	Eldon Fold 'N Roll Cart System	22.3680


In [None]:
# Show the dtype pandas inferred for every column before any cleaning is applied.
df.dtypes

Row ID             int64
Order ID          object
Order Date        object
Ship Date         object
Ship Mode         object
Customer ID       object
Customer Name     object
Segment           object
Country           object
City              object
State             object
Postal Code      float64
Region            object
Product ID        object
Category          object
Sub-Category      object
Product Name      object
Sales            float64
dtype: object

In [None]:
# Parse both date columns into datetime64 values; the raw text is day-first (DD/MM/YYYY).
for date_col in ('Order Date', 'Ship Date'):
    df[date_col] = pd.to_datetime(df[date_col], dayfirst=True)

In [None]:
# Fix Postal Code (float → zero-padded 5-character string)
# Postal codes are identifiers, not quantities. Reading them as numbers strips any
# leading zero, and casting a missing nullable integer to str yields the literal
# text "<NA>", so pad back to 5 characters and keep missing codes as real missing values.
# NOTE(review): 5-digit width assumes US ZIP codes (the previewed rows are all
# 'United States') — confirm before reusing on other countries.
postal_int = pd.to_numeric(df['Postal Code'], errors='coerce').astype("Int64")
df['Postal Code'] = (
    postal_int.astype(str)       # drops the ".0" float suffix
    .str.zfill(5)                # restore leading zeros lost by numeric parsing
    .mask(postal_int.isna())     # missing codes stay NaN instead of the text "<NA>"
)

In [None]:
# Strip leading/trailing whitespace from every object-dtype (text) column.
text_cols = df.select_dtypes(include='object').columns
for text_col in text_cols:
    df[text_col] = df[text_col].str.strip()

In [None]:
# Keep only the first occurrence of each fully identical row.
# NOTE(review): all columns are compared, including 'Row ID'; if 'Row ID' is unique
# per row this can never remove anything — confirm whether it should be excluded.
df = df[~df.duplicated(keep='first')]

In [None]:
# Example missing-value policy: a missing Sales amount is replaced with 0.
df['Sales'] = df['Sales'].where(df['Sales'].notna(), other=0)

Feature Engineering

In [None]:
# Shipping lead time: whole days elapsed between the order and the shipment.
order_dt = df['Order Date']
df['Shipping Days'] = df['Ship Date'].sub(order_dt).dt.days

# Calendar parts of the order date: year, month number, and month name.
df['Order Year'] = order_dt.dt.year
df['Order Month'] = order_dt.dt.month
df['Order Month Name'] = order_dt.dt.month_name()

In [None]:
# Sales Category
# Bucket each sale into Low (0-100], Medium (100-500], High (500-1000], Very High (>1000).
# The top bin is open-ended so sales above 5000 are labelled 'Very High' instead of NaN,
# and include_lowest=True puts a sale of exactly 0 (e.g. a filled missing value) in 'Low'.
df['Sales Category'] = pd.cut(
    df['Sales'],
    bins=[0, 100, 500, 1000, np.inf],
    labels=['Low', 'Medium', 'High', 'Very High'],
    include_lowest=True,
)

# Fast Shipping flag: "Yes" when the order shipped within 2 days, otherwise "No"
# (this is a two-day threshold, not same-day only). A missing 'Shipping Days'
# compares as False and is therefore labelled "No".
df['Fast Shipping'] = np.where(df['Shipping Days'] <= 2, "Yes", "No")

# Combined customer label in the form "Name(Segment)".
df['Customer Info'] = df['Customer Name'].str.cat(df['Segment'], sep='(') + ')'

In [None]:
# Preview the first five rows with the engineered columns appended.
df.head(n=5)


Row ID	Order ID	Order Date	Ship Date	Ship Mode	Customer ID	Customer Name	Segment	Country	City	...	Sub-Category	Product Name	Sales	Shipping Days	Order Year	Order Month	Order Month Name	Sales Category	Fast Shipping	Customer Info
0	1	CA-2017-152156	2017-11-08	2017-11-11	Second Class	CG-12520	Claire Gute	Consumer	United States	Henderson	...	Bookcases	Bush Somerset Collection Bookcase	261.9600	3	2017	11	November	Medium	No	Claire Gute(Consumer)
1	2	CA-2017-152156	2017-11-08	2017-11-11	Second Class	CG-12520	Claire Gute	Consumer	United States	Henderson	...	Chairs	Hon Deluxe Fabric Upholstered Stacking Chairs,...	731.9400	3	2017	11	November	High	No	Claire Gute(Consumer)
2	3	CA-2017-138688	2017-06-12	2017-06-16	Second Class	DV-13045	Darrin Van Huff	Corporate	United States	Los Angeles	...	Labels	Self-Adhesive Address Labels for Typewriters b...	14.6200	4	2017	6	June	Low	No	Darrin Van Huff(Corporate)
3	4	US-2016-108966	2016-10-11	2016-10-18	Standard Class	SO-20335	Sean O'Donnell	Consumer	United States	Fort Lauderdale	...	Tables	Bretford CR4500 Series Slim Rectangular Table	957.5775	7	2016	10	October	High	No	Sean O'Donnell(Consumer)
4	5	US-2016-108966	2016-10-11	2016-10-18	Standard Class	SO-20335	Sean O'Donnell	Consumer	United States	Fort Lauderdale	...	Storage	Eldon Fold 'N Roll Cart System	22.3680	7	2016	10	October	Low	No	Sean O'Donnell(Consumer)
5 rows × 25 columns