In [1]:
import pandas as pd
import numpy as np

# Clean the Auto Sales dataset: load the raw CSV, print a quick overview,
# normalise column dtypes, derive date and discount features, and write the
# cleaned frame back to disk.

# File locations are named once here so they can be changed in one place.
# NOTE: INPUT_CSV is an absolute, machine-specific path.
INPUT_CSV = "/home/tridip/my-jupyter-env/Auto Sales data.csv"
OUTPUT_CSV = "Auto_Sales_Cleaned.csv"

# Load the dataset
df = pd.read_csv(INPUT_CSV)

# Display basic info.
# DataFrame.info() writes its summary to stdout and returns None, so it must
# not be wrapped in print() (that would emit an extra "None" line).
print("\n Basic Info:")
df.info()

# Show sample data
print("\n Sample Rows:")
print(df.head())

# Check for missing values (per-column count of nulls)
print("\n Missing Values:")
print(df.isnull().sum())

# Convert ORDERDATE to datetime; the explicit day/month/year format makes
# parsing fail loudly on any value that does not match it.
df['ORDERDATE'] = pd.to_datetime(df['ORDERDATE'], format='%d/%m/%Y')

# Convert categorical columns to 'category' dtype
categorical_cols = ['STATUS', 'PRODUCTLINE', 'PRODUCTCODE', 'CUSTOMERNAME',
                    'COUNTRY', 'CITY', 'DEALSIZE', 'CONTACTLASTNAME', 'CONTACTFIRSTNAME']
for col in categorical_cols:
    df[col] = df[col].astype('category')

# Extract date features from the parsed order date
order_date = df['ORDERDATE'].dt
df['Year'] = order_date.year
df['Month'] = order_date.month
df['DayOfWeek'] = order_date.dayofweek  # Monday=0 ... Sunday=6
df['MonthName'] = order_date.month_name()

# Calculate discount and discount percentage.
# Discount is negative whenever PRICEEACH exceeds MSRP.
df['Discount'] = df['MSRP'] - df['PRICEEACH']
# Dividing by a zero MSRP would yield +/-inf and poison later aggregates;
# mask zeros to NaN so such rows are simply treated as missing.
msrp_nonzero = df['MSRP'].replace(0, np.nan)
df['DiscountPct'] = (df['Discount'] / msrp_nonzero).round(2)

print("\n Final Data Overview:")
overview_cols = ['ORDERNUMBER', 'PRODUCTLINE', 'SALES', 'PRICEEACH', 'MSRP',
                 'Discount', 'DiscountPct', 'Year', 'Month', 'DayOfWeek']
print(df[overview_cols].head())

# Optional: Save cleaned version (written relative to the working directory)
df.to_csv(OUTPUT_CSV, index=False)



 Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2747 entries, 0 to 2746
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ORDERNUMBER           2747 non-null   int64  
 1   QUANTITYORDERED       2747 non-null   int64  
 2   PRICEEACH             2747 non-null   float64
 3   ORDERLINENUMBER       2747 non-null   int64  
 4   SALES                 2747 non-null   float64
 5   ORDERDATE             2747 non-null   object 
 6   DAYS_SINCE_LASTORDER  2747 non-null   int64  
 7   STATUS                2747 non-null   object 
 8   PRODUCTLINE           2747 non-null   object 
 9   MSRP                  2747 non-null   int64  
 10  PRODUCTCODE           2747 non-null   object 
 11  CUSTOMERNAME          2747 non-null   object 
 12  PHONE                 2747 non-null   object 
 13  ADDRESSLINE1          2747 non-null   object 
 14  CITY                  2747 non-null   object 
 15  POSTALC