# Feature Engineering

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
pd.set_option('future.no_silent_downcasting', True)

In [2]:
dataset = pd.read_csv('customer_purchase_data.csv')

In [3]:
dataset['Gender'] = dataset['Gender'].fillna('Male')
dataset['Purchase Date'] = pd.to_datetime(dataset['Purchase Date'])

In [4]:
# Add Recency feature: Days since last purchase
current_date = pd.to_datetime('today')
dataset['Recency'] = (current_date - dataset.groupby('Customer ID')['Purchase Date'].transform('max')).dt.days

In [5]:
# Add Frequency feature: Number of purchases per customer
dataset['Frequency'] = dataset.groupby('Customer ID')['Customer ID'].transform('count')

In [6]:
# Add Monetary (CLV) feature: Total spend per customer
dataset['Monetary'] = dataset.groupby('Customer ID')['Total Price'].transform('sum')

In [7]:
# Add Churn Indicator: If recency is greater than a threshold (e.g., 180 days, representing 6 months)
dataset['Churn'] = np.where(dataset['Recency'] > 180, 1, 0)

In [8]:
# Add Favorite Product Type: Most frequent product type purchased by each customer
dataset['Favorite Product Type'] = dataset.groupby('Customer ID')['Product Type'].transform(lambda x: x.mode()[0])

In [9]:
# Add Product Diversity: Number of unique products purchased by each customer
dataset['Product Diversity'] = dataset.groupby('Customer ID')['Product Type'].transform('nunique')

In [10]:
# Add Cancellation Rate: Number of time order cancelled by each customer
dataset['Is_Cancelled'] = dataset['Order Status'].apply(lambda x: 1 if x == 'Cancelled' else 0)
dataset['Total Orders'] = dataset.groupby('Customer ID')['Order Status'].transform('count')
dataset['Cancelled Orders'] = dataset.groupby('Customer ID')['Is_Cancelled'].transform('sum')
dataset['Cancellation Rate'] = (dataset['Cancelled Orders'] / dataset['Total Orders']) * 100
dataset.drop(columns=['Is_Cancelled', 'Cancelled Orders'], inplace=True)

In [11]:
# Add Rating Trends: Average rating per customer
dataset['Average Rating'] = dataset.groupby('Customer ID')['Rating'].transform('mean')

In [12]:
# Add Add-on Frequency: Percentage of transactions with add-ons
dataset['Add-on Frequency'] = (dataset['Add-on Total'] > 0).astype(int)
dataset['Add-on Frequency'] = dataset.groupby('Customer ID')['Add-on Frequency'].transform('mean')

In [13]:
# Add Preferred Payment Method: Most frequent payment method per customer
dataset['Preferred Payment Method'] = dataset.groupby('Customer ID')['Payment Method'].transform(lambda x: x.mode()[0])

In [14]:
# Add Preferred Shipping Type: Most frequent shipping type per customer
dataset['Preferred Shipping Type'] = dataset.groupby('Customer ID')['Shipping Type'].transform(lambda x: x.mode()[0])

In [15]:
customer_data = dataset.drop_duplicates(subset='Customer ID', keep='last')

In [16]:
dataset = customer_data.drop(['Product Type', 'SKU', 'Rating', 'Order Status', 'Payment Method', 
                    'Total Price', 'Unit Price', 'Quantity', 'Purchase Date', 'Shipping Type', 
                    'Add-ons Purchased', 'Add-on Total'], axis=1)

In [17]:
dataset.head()

Unnamed: 0,Customer ID,Age,Gender,Loyalty Member,Recency,Frequency,Monetary,Churn,Favorite Product Type,Product Diversity,Total Orders,Cancellation Rate,Average Rating,Add-on Frequency,Preferred Payment Method,Preferred Shipping Type
1,1000,53,Male,No,229,2,6279.42,1,Smartphone,2,2,50.0,2.5,1.0,Credit Card,Overnight
3,1002,41,Male,Yes,118,2,5020.6,0,Laptop,2,2,0.0,2.5,0.5,Cash,Express
4,1003,75,Male,Yes,198,1,41.5,1,Smartphone,1,1,0.0,5.0,1.0,Cash,Express
5,1004,41,Female,No,193,1,83.0,1,Smartphone,1,1,0.0,5.0,1.0,Credit Card,Standard
7,1005,25,Female,No,164,2,11779.11,0,Laptop,2,2,0.0,3.0,0.5,Debit Card,Overnight


In [18]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12136 entries, 1 to 19999
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Customer ID               12136 non-null  int64  
 1   Age                       12136 non-null  int64  
 2   Gender                    12136 non-null  object 
 3   Loyalty Member            12136 non-null  object 
 4   Recency                   12136 non-null  int64  
 5   Frequency                 12136 non-null  int64  
 6   Monetary                  12136 non-null  float64
 7   Churn                     12136 non-null  int64  
 8   Favorite Product Type     12136 non-null  object 
 9   Product Diversity         12136 non-null  int64  
 10  Total Orders              12136 non-null  int64  
 11  Cancellation Rate         12136 non-null  float64
 12  Average Rating            12136 non-null  float64
 13  Add-on Frequency          12136 non-null  float64
 14  Preferred P

In [19]:
dataset.to_csv('new_features_data.csv', index=False)