In [1]:
import pandas as pd
import numpy as np

# Read the merged dataset from disk
df = pd.read_csv("E:\\merged_dataset.csv")

# Parse the purchase timestamps; values that cannot be parsed become NaT
purchase_ts = pd.to_datetime(df['Purchase_Time'], errors='coerce')
df['Purchase_Time'] = purchase_ts

# Derive one calendar column per datetime component
# (for 'weekday', pandas uses Monday = 0 ... Sunday = 6)
calendar_parts = {
    'Year': 'year',
    'Month': 'month',
    'Day': 'day',
    'Hour': 'hour',
    'Weekday': 'weekday',
}
for column_name, dt_attribute in calendar_parts.items():
    df[column_name] = getattr(purchase_ts.dt, dt_attribute)

# Flag Saturday (5) and Sunday (6) purchases: 1 for weekend, 0 otherwise
weekend_days = [5, 6]
df['Is_Weekend'] = df['Weekday'].isin(weekend_days).astype(int)

# Function to check if purchase falls in peak season
def is_peak_season(purchase_date, start, end):
    """Return 1 if ``purchase_date`` falls inside the peak season, else 0.

    The season is described by two full month names (parsed with the
    ``%B`` format, e.g. ``"November"``). Both boundary months are
    inclusive. A season whose start month is later in the year than its
    end month (e.g. November -> February) is treated as wrapping around
    the year end.

    Args:
        purchase_date: Object exposing a ``month`` attribute (e.g. a
            ``pandas.Timestamp``). ``NaT`` or an object without a usable
            month yields 0.
        start: Month name at which the season begins, or a missing value.
        end: Month name at which the season ends, or a missing value.

    Returns:
        int: 1 when the purchase month lies within the season, 0 when it
        does not or when any input is missing or cannot be parsed.
    """
    if pd.isna(start) or pd.isna(end):
        return 0

    # NaT reports its month as NaN; other objects may have no month at all.
    purchase_month = getattr(purchase_date, 'month', None)
    if purchase_month is None or pd.isna(purchase_month):
        return 0

    try:
        start_month = pd.to_datetime(start, format='%B').month
        end_month = pd.to_datetime(end, format='%B').month
    except (ValueError, TypeError):
        # Month name could not be parsed: best effort, treat as off-season.
        return 0

    if start_month <= end_month:
        return int(start_month <= purchase_month <= end_month)

    # Season wraps the year end: in season from start_month through
    # December, and from January through end_month.
    return int(purchase_month >= start_month or purchase_month <= end_month)

# Apply peak season function row by row. row.get() falls back to NaN when a
# peak-season column is absent, which is_peak_season maps to 0.
df['Is_Peak_Season'] = df.apply(
    lambda row: is_peak_season(
        row['Purchase_Time'],
        row.get('Peak_Season_Start', np.nan),
        row.get('Peak_Season_End', np.nan),
    ),
    axis=1,
)

# Price-related features
df['Discount_Amount'] = df['Original_Price'] - df['Discounted_Price']
# A zero original price is mapped to NaN so the percentage is stored as
# missing instead of +/-inf.
df['Discount_Percentage'] = (
    df['Discount_Amount'] / df['Original_Price'].replace(0, np.nan)
) * 100

# Competitor price analysis: row-wise mean over the competitor columns,
# ignoring competitors with no recorded price.
competitor_cols = ['Amazon_Price', 'Flipkart_Price', 'Myntra_Price', 'Ajio_Price', 'Snapdeal_Price']
df['Avg_Competitor_Price'] = df[competitor_cols].mean(axis=1, skipna=True)
# Ratio of our discounted price to the competitor average; a zero average is
# treated as missing to avoid inf.
df['Price_Competitiveness'] = (
    df['Discounted_Price'] / df['Avg_Competitor_Price'].replace(0, np.nan)
)

# Customer behavior features
# NOTE(review): the 0.5 threshold presumes Cart_Abandonment_Rate is a
# fraction in [0, 1] rather than a percentage -- confirm against the data.
df['High_Cart_Abandonment'] = (df['Cart_Abandonment_Rate'] > 0.5).astype(int)

# Sales-related features; rows with zero quantity sold get NaN, not inf.
df['Revenue_per_Item'] = df['Revenue'] / df['Quantity_Sold'].replace(0, np.nan)

# Inflation Categorization
# include_lowest=True keeps a rate of exactly 0 in the 'Very Low' bucket
# (the default right-closed bins would turn it into NaN). Negative rates
# still fall outside every bin and become NaN.
df['Inflation_Category'] = pd.cut(
    df['Inflation_Rate'],
    bins=[0, 0.02, 0.05, 0.1, np.inf],
    labels=['Very Low', 'Low', 'Medium', 'High'],
    include_lowest=True,
)

# One-hot encoding categorical features; drop_first=True omits the first
# level of each column from the generated indicator columns.
categorical_cols = ['Category_x', 'Subcategory_x', 'Payment_Mode', 'Economic_Condition']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Drop unnecessary columns (only those actually present in the frame)
drop_cols = ['Peak_Season_Start', 'Peak_Season_End', 'Purchase_Time', 'Category_y', 'Subcategory_y']
df.drop(columns=[col for col in drop_cols if col in df.columns], inplace=True)

# Save the processed dataset
processed_file_path = "E:\\processed_dataset.csv"
df.to_csv(processed_file_path, index=False)

print(f"Processed dataset saved at: {processed_file_path}")


Processed dataset saved at: E:\processed_dataset.csv
