#TASK 1

Load the dataset

In [177]:
import pandas as pd
df=pd.read_csv('/content/ecommerce_customers_large.csv')
df

Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,SpendingScore,BrowserType,PurchaseFrequency,LastPurchaseDaysAgo,DeviceType
0,1001,23,Male,118266,98,Edge,4,31,Tablet
1,1002,57,Female,71930,39,Edge,12,10,Mobile
2,1003,50,Female,113230,43,Opera,16,9,Mobile
3,1004,50,Female,69083,15,Chrome,1,50,Mobile
4,1005,50,Female,109557,52,Firefox,13,4,Desktop
...,...,...,...,...,...,...,...,...,...
115,1116,43,Male,63875,40,Chrome,6,2,Desktop
116,1117,58,Female,48760,43,Edge,6,6,Tablet
117,1118,59,Female,76213,28,Opera,18,38,Desktop
118,1119,48,Male,61526,15,Safari,14,5,Desktop


**Initial Exploration**

In [178]:
df.head()

Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,SpendingScore,BrowserType,PurchaseFrequency,LastPurchaseDaysAgo,DeviceType
0,1001,23,Male,118266,98,Edge,4,31,Tablet
1,1002,57,Female,71930,39,Edge,12,10,Mobile
2,1003,50,Female,113230,43,Opera,16,9,Mobile
3,1004,50,Female,69083,15,Chrome,1,50,Mobile
4,1005,50,Female,109557,52,Firefox,13,4,Desktop


In [179]:
df.columns.tolist()

['CustomerID',
 'Age',
 'Gender',
 'AnnualIncome',
 'SpendingScore',
 'BrowserType',
 'PurchaseFrequency',
 'LastPurchaseDaysAgo',
 'DeviceType']

**Dataset Shape & Memory Usage**

In [180]:
# Dataset Shape
print(f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns.")

# Memory usage
print("\nMemory Usage:")
print(df.info(memory_usage='deep'))

Dataset contains 120 rows and 9 columns.

Memory Usage:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   CustomerID           120 non-null    int64 
 1   Age                  120 non-null    int64 
 2   Gender               120 non-null    object
 3   AnnualIncome         120 non-null    int64 
 4   SpendingScore        120 non-null    int64 
 5   BrowserType          120 non-null    object
 6   PurchaseFrequency    120 non-null    int64 
 7   LastPurchaseDaysAgo  120 non-null    int64 
 8   DeviceType           120 non-null    object
dtypes: int64(6), object(3)
memory usage: 27.8 KB
None


**Missing Values Analysis**

In [181]:
missing = df.isnull().sum()
missing = missing[missing > 0]
print(missing.to_frame('Missing Values').assign(Percentage=lambda x: (x['Missing Values'] / len(df) * 100).round(2)))

Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []


**Data Types Assessment**

In [182]:
print("\nData Types:")
print(df.dtypes)


Data Types:
CustomerID              int64
Age                     int64
Gender                 object
AnnualIncome            int64
SpendingScore           int64
BrowserType            object
PurchaseFrequency       int64
LastPurchaseDaysAgo     int64
DeviceType             object
dtype: object


**Duplicate Records Count**

In [183]:
duplicates = df.duplicated().sum()
print(f"\nTotal duplicate records: {duplicates}")


Total duplicate records: 0


**Basic Statistical Summary**

In [184]:
print("\nBasic Statistical Summary (Numerical Columns):")
display(df.describe())


Basic Statistical Summary (Numerical Columns):


Unnamed: 0,CustomerID,Age,AnnualIncome,SpendingScore,PurchaseFrequency,LastPurchaseDaysAgo
count,120.0,120.0,120.0,120.0,120.0,120.0
mean,1060.5,42.675,68623.966667,48.266667,10.466667,26.858333
std,34.785054,13.20629,29992.299637,27.21336,5.489384,18.204393
min,1001.0,19.0,20178.0,1.0,1.0,0.0
25%,1030.75,31.75,43615.5,28.0,6.0,10.0
50%,1060.5,43.0,67643.5,48.0,11.0,26.0
75%,1090.25,52.25,92256.25,67.0,16.0,43.0
max,1120.0,65.0,119184.0,100.0,20.0,60.0


**Document Data Quality Issues Check**

In [185]:
issues = []

if df.isnull().sum().sum(): issues.append("Missing values found.")
if df.duplicated().sum(): issues.append("Duplicate rows found.")

for col in df.select_dtypes('object'):
    if (df[col].dropna().str.strip() == '').any():
        issues.append(f"Blank values in '{col}'.")

print("\nIssues:" if issues else "\nNo data issues.")
for i, msg in enumerate(issues, 1):
    print(f"{i}. {msg}")



No data issues.


#TASK 2#

Handle Missing Values

In [186]:
# Drop columns with >50% missing
df.dropna(thresh=0.5*len(df), axis=1, inplace=True)

# Fill numeric with median, categorical with mode
df.fillna({**df.select_dtypes('number').median(),
           **df.select_dtypes('object').mode().iloc[0]}, inplace=True)

**Remove Duplicates**

In [187]:
df = df.drop_duplicates()

**Fix Data Types**

In [188]:
# Date conversion
df['order_date'] = pd.to_datetime(df.get('order_date'), errors='coerce')

# Convert object columns to numeric where possible
df = df.apply(lambda col: pd.to_numeric(col, errors='ignore') if col.dtype == 'object' else col)

# Convert remaining object columns to category
df[df.select_dtypes('object').columns] = df.select_dtypes('object').astype('category')
df

  df = df.apply(lambda col: pd.to_numeric(col, errors='ignore') if col.dtype == 'object' else col)


Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,SpendingScore,BrowserType,PurchaseFrequency,LastPurchaseDaysAgo,DeviceType,order_date
0,1001,23,Male,118266,98,Edge,4,31,Tablet,
1,1002,57,Female,71930,39,Edge,12,10,Mobile,
2,1003,50,Female,113230,43,Opera,16,9,Mobile,
3,1004,50,Female,69083,15,Chrome,1,50,Mobile,
4,1005,50,Female,109557,52,Firefox,13,4,Desktop,
...,...,...,...,...,...,...,...,...,...,...
115,1116,43,Male,63875,40,Chrome,6,2,Desktop,
116,1117,58,Female,48760,43,Edge,6,6,Tablet,
117,1118,59,Female,76213,28,Opera,18,38,Desktop,
118,1119,48,Male,61526,15,Safari,14,5,Desktop,


In [189]:
# Fill NaT with a default date (example: Jan 1, 2020)
df['order_date'] = df['order_date'].fillna(pd.Timestamp("2020-01-01"))
df

  df['order_date'] = df['order_date'].fillna(pd.Timestamp("2020-01-01"))


Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,SpendingScore,BrowserType,PurchaseFrequency,LastPurchaseDaysAgo,DeviceType,order_date
0,1001,23,Male,118266,98,Edge,4,31,Tablet,2020-01-01
1,1002,57,Female,71930,39,Edge,12,10,Mobile,2020-01-01
2,1003,50,Female,113230,43,Opera,16,9,Mobile,2020-01-01
3,1004,50,Female,69083,15,Chrome,1,50,Mobile,2020-01-01
4,1005,50,Female,109557,52,Firefox,13,4,Desktop,2020-01-01
...,...,...,...,...,...,...,...,...,...,...
115,1116,43,Male,63875,40,Chrome,6,2,Desktop,2020-01-01
116,1117,58,Female,48760,43,Edge,6,6,Tablet,2020-01-01
117,1118,59,Female,76213,28,Opera,18,38,Desktop,2020-01-01
118,1119,48,Male,61526,15,Safari,14,5,Desktop,2020-01-01


**Clean Text Data**

In [190]:
# Function to clean text
def clean_text(s):
    if isinstance(s, str):
        return s.strip().title()
    return s

# Apply to all object columns
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].apply(clean_text)

# Clean phone numbers & emails (example placeholders)
# df['phone'] = df['phone'].str.replace(r'\D', '', regex=True)
# df['email'] = df['email'].str.lower().str.strip()


In [191]:
df

Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,SpendingScore,BrowserType,PurchaseFrequency,LastPurchaseDaysAgo,DeviceType,order_date
0,1001,23,Male,118266,98,Edge,4,31,Tablet,2020-01-01
1,1002,57,Female,71930,39,Edge,12,10,Mobile,2020-01-01
2,1003,50,Female,113230,43,Opera,16,9,Mobile,2020-01-01
3,1004,50,Female,69083,15,Chrome,1,50,Mobile,2020-01-01
4,1005,50,Female,109557,52,Firefox,13,4,Desktop,2020-01-01
...,...,...,...,...,...,...,...,...,...,...
115,1116,43,Male,63875,40,Chrome,6,2,Desktop,2020-01-01
116,1117,58,Female,48760,43,Edge,6,6,Tablet,2020-01-01
117,1118,59,Female,76213,28,Opera,18,38,Desktop,2020-01-01
118,1119,48,Male,61526,15,Safari,14,5,Desktop,2020-01-01


#TASK 3

Outlier Detection and Treatment

In [192]:
import pandas as pd
import numpy as np
from scipy import stats

# Select numerical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
print("Numerical columns:", num_cols.tolist())

Numerical columns: ['CustomerID', 'Age', 'AnnualIncome', 'SpendingScore', 'PurchaseFrequency', 'LastPurchaseDaysAgo']


**Identify Outliers using IQR Method**

In [193]:
# IQR method
def detect_outliers_iqr(data, col):
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[col] < lower_bound) | (data[col] > upper_bound)]

# Example: Detect outliers in 'AnnualIncome'
outliers_income = detect_outliers_iqr(df, 'AnnualIncome')
print(f"Outliers in 'AnnualIncome': {len(outliers_income)} rows")

Outliers in 'AnnualIncome': 0 rows


**Identify Outliers using Z-Score Method**

In [194]:
# Z-score method
def detect_outliers_zscore(data, col, threshold=3):
    z_scores = stats.zscore(data[col])
    return data[(np.abs(z_scores) > threshold)]

# Example: Z-score outliers for 'SpendingScore'
outliers_spending = detect_outliers_zscore(df, 'SpendingScore')
print(f"Outliers in 'SpendingScore': {len(outliers_spending)} rows")

Outliers in 'SpendingScore': 0 rows


**Decide: Remove, Cap, or Keep Outliers**

In [195]:
# Cap function using IQR bounds
def cap_outliers_iqr(data, col):
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    data[col] = np.where(data[col] < lower, lower, data[col])
    data[col] = np.where(data[col] > upper, upper, data[col])
    return data

# Apply capping to selected columns
for col in ['AnnualIncome', 'SpendingScore']:
    df = cap_outliers_iqr(df, col)
df

Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,SpendingScore,BrowserType,PurchaseFrequency,LastPurchaseDaysAgo,DeviceType,order_date
0,1001,23,Male,118266.0,98.0,Edge,4,31,Tablet,2020-01-01
1,1002,57,Female,71930.0,39.0,Edge,12,10,Mobile,2020-01-01
2,1003,50,Female,113230.0,43.0,Opera,16,9,Mobile,2020-01-01
3,1004,50,Female,69083.0,15.0,Chrome,1,50,Mobile,2020-01-01
4,1005,50,Female,109557.0,52.0,Firefox,13,4,Desktop,2020-01-01
...,...,...,...,...,...,...,...,...,...,...
115,1116,43,Male,63875.0,40.0,Chrome,6,2,Desktop,2020-01-01
116,1117,58,Female,48760.0,43.0,Edge,6,6,Tablet,2020-01-01
117,1118,59,Female,76213.0,28.0,Opera,18,38,Desktop,2020-01-01
118,1119,48,Male,61526.0,15.0,Safari,14,5,Desktop,2020-01-01


In [196]:
# Remove rows with extreme income outliers
df = df[~df.index.isin(outliers_income.index)]

**Document Decisions (Business Justification - example)**

In [197]:
print("\n📌 Business Justification:")
print("""
1. We capped extreme values in 'AnnualIncome' and 'SpendingScore' to reduce the influence of outliers on analysis.
2. Capping is preferred over removal to retain customer data and avoid information loss.
3. Z-score helped double-check extreme cases; however, we used IQR for final treatment due to its robustness.
""")



📌 Business Justification:

1. We capped extreme values in 'AnnualIncome' and 'SpendingScore' to reduce the influence of outliers on analysis.
2. Capping is preferred over removal to retain customer data and avoid information loss.
3. Z-score helped double-check extreme cases; however, we used IQR for final treatment due to its robustness.



#TASK 4

Feature Engineering Code

In [198]:
import pandas as pd
from datetime import datetime

**Customer Tenure (days since registration)**

In [199]:
# Create fake registration_date for demo (if not available)
import numpy as np
df['registration_date'] = pd.to_datetime(np.random.choice(pd.date_range(start='2018-01-01', end='2022-01-01'), size=len(df)))

# Calculate customer tenure
today = pd.to_datetime("2025-01-01")  # or use datetime.today()
df['CustomerTenure'] = (today - df['registration_date']).dt.days
df

Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,SpendingScore,BrowserType,PurchaseFrequency,LastPurchaseDaysAgo,DeviceType,order_date,registration_date,CustomerTenure
0,1001,23,Male,118266.0,98.0,Edge,4,31,Tablet,2020-01-01,2020-06-04,1672
1,1002,57,Female,71930.0,39.0,Edge,12,10,Mobile,2020-01-01,2020-11-06,1517
2,1003,50,Female,113230.0,43.0,Opera,16,9,Mobile,2020-01-01,2021-12-18,1110
3,1004,50,Female,69083.0,15.0,Chrome,1,50,Mobile,2020-01-01,2021-03-22,1381
4,1005,50,Female,109557.0,52.0,Firefox,13,4,Desktop,2020-01-01,2020-08-07,1608
...,...,...,...,...,...,...,...,...,...,...,...,...
115,1116,43,Male,63875.0,40.0,Chrome,6,2,Desktop,2020-01-01,2021-06-25,1286
116,1117,58,Female,48760.0,43.0,Edge,6,6,Tablet,2020-01-01,2020-01-17,1811
117,1118,59,Female,76213.0,28.0,Opera,18,38,Desktop,2020-01-01,2020-02-21,1776
118,1119,48,Male,61526.0,15.0,Safari,14,5,Desktop,2020-01-01,2018-04-10,2458


**Average Order Value**

In [200]:
# Create dummy total purchase amount if not available
df['TotalPurchaseAmount'] = df['SpendingScore'] * df['PurchaseFrequency'] * 10

# Calculate average order value
df['AvgOrderValue'] = df['TotalPurchaseAmount'] / df['PurchaseFrequency']
df

Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,SpendingScore,BrowserType,PurchaseFrequency,LastPurchaseDaysAgo,DeviceType,order_date,registration_date,CustomerTenure,TotalPurchaseAmount,AvgOrderValue
0,1001,23,Male,118266.0,98.0,Edge,4,31,Tablet,2020-01-01,2020-06-04,1672,3920.0,980.0
1,1002,57,Female,71930.0,39.0,Edge,12,10,Mobile,2020-01-01,2020-11-06,1517,4680.0,390.0
2,1003,50,Female,113230.0,43.0,Opera,16,9,Mobile,2020-01-01,2021-12-18,1110,6880.0,430.0
3,1004,50,Female,69083.0,15.0,Chrome,1,50,Mobile,2020-01-01,2021-03-22,1381,150.0,150.0
4,1005,50,Female,109557.0,52.0,Firefox,13,4,Desktop,2020-01-01,2020-08-07,1608,6760.0,520.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,1116,43,Male,63875.0,40.0,Chrome,6,2,Desktop,2020-01-01,2021-06-25,1286,2400.0,400.0
116,1117,58,Female,48760.0,43.0,Edge,6,6,Tablet,2020-01-01,2020-01-17,1811,2580.0,430.0
117,1118,59,Female,76213.0,28.0,Opera,18,38,Desktop,2020-01-01,2020-02-21,1776,5040.0,280.0
118,1119,48,Male,61526.0,15.0,Safari,14,5,Desktop,2020-01-01,2018-04-10,2458,2100.0,150.0


**Days Since Last Order**

In [201]:
df['DaysSinceLastOrder'] = df['LastPurchaseDaysAgo']  # Already exists
df

Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,SpendingScore,BrowserType,PurchaseFrequency,LastPurchaseDaysAgo,DeviceType,order_date,registration_date,CustomerTenure,TotalPurchaseAmount,AvgOrderValue,DaysSinceLastOrder
0,1001,23,Male,118266.0,98.0,Edge,4,31,Tablet,2020-01-01,2020-06-04,1672,3920.0,980.0,31
1,1002,57,Female,71930.0,39.0,Edge,12,10,Mobile,2020-01-01,2020-11-06,1517,4680.0,390.0,10
2,1003,50,Female,113230.0,43.0,Opera,16,9,Mobile,2020-01-01,2021-12-18,1110,6880.0,430.0,9
3,1004,50,Female,69083.0,15.0,Chrome,1,50,Mobile,2020-01-01,2021-03-22,1381,150.0,150.0,50
4,1005,50,Female,109557.0,52.0,Firefox,13,4,Desktop,2020-01-01,2020-08-07,1608,6760.0,520.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,1116,43,Male,63875.0,40.0,Chrome,6,2,Desktop,2020-01-01,2021-06-25,1286,2400.0,400.0,2
116,1117,58,Female,48760.0,43.0,Edge,6,6,Tablet,2020-01-01,2020-01-17,1811,2580.0,430.0,6
117,1118,59,Female,76213.0,28.0,Opera,18,38,Desktop,2020-01-01,2020-02-21,1776,5040.0,280.0,38
118,1119,48,Male,61526.0,15.0,Safari,14,5,Desktop,2020-01-01,2018-04-10,2458,2100.0,150.0,5


**Age Groups**

In [202]:
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 25, 40, 60, 100],
                        labels=['Youth', 'Young Adult', 'Adult', 'Senior'])
df

Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,SpendingScore,BrowserType,PurchaseFrequency,LastPurchaseDaysAgo,DeviceType,order_date,registration_date,CustomerTenure,TotalPurchaseAmount,AvgOrderValue,DaysSinceLastOrder,AgeGroup
0,1001,23,Male,118266.0,98.0,Edge,4,31,Tablet,2020-01-01,2020-06-04,1672,3920.0,980.0,31,Youth
1,1002,57,Female,71930.0,39.0,Edge,12,10,Mobile,2020-01-01,2020-11-06,1517,4680.0,390.0,10,Adult
2,1003,50,Female,113230.0,43.0,Opera,16,9,Mobile,2020-01-01,2021-12-18,1110,6880.0,430.0,9,Adult
3,1004,50,Female,69083.0,15.0,Chrome,1,50,Mobile,2020-01-01,2021-03-22,1381,150.0,150.0,50,Adult
4,1005,50,Female,109557.0,52.0,Firefox,13,4,Desktop,2020-01-01,2020-08-07,1608,6760.0,520.0,4,Adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,1116,43,Male,63875.0,40.0,Chrome,6,2,Desktop,2020-01-01,2021-06-25,1286,2400.0,400.0,2,Adult
116,1117,58,Female,48760.0,43.0,Edge,6,6,Tablet,2020-01-01,2020-01-17,1811,2580.0,430.0,6,Adult
117,1118,59,Female,76213.0,28.0,Opera,18,38,Desktop,2020-01-01,2020-02-21,1776,5040.0,280.0,38,Adult
118,1119,48,Male,61526.0,15.0,Safari,14,5,Desktop,2020-01-01,2018-04-10,2458,2100.0,150.0,5,Adult


**Income Brackets**

In [203]:
df['IncomeBracket'] = pd.cut(df['AnnualIncome'],
                             bins=[0, 40000, 80000, 120000, np.inf],
                             labels=['Low', 'Mid', 'High', 'Very High'])
df

Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,SpendingScore,BrowserType,PurchaseFrequency,LastPurchaseDaysAgo,DeviceType,order_date,registration_date,CustomerTenure,TotalPurchaseAmount,AvgOrderValue,DaysSinceLastOrder,AgeGroup,IncomeBracket
0,1001,23,Male,118266.0,98.0,Edge,4,31,Tablet,2020-01-01,2020-06-04,1672,3920.0,980.0,31,Youth,High
1,1002,57,Female,71930.0,39.0,Edge,12,10,Mobile,2020-01-01,2020-11-06,1517,4680.0,390.0,10,Adult,Mid
2,1003,50,Female,113230.0,43.0,Opera,16,9,Mobile,2020-01-01,2021-12-18,1110,6880.0,430.0,9,Adult,High
3,1004,50,Female,69083.0,15.0,Chrome,1,50,Mobile,2020-01-01,2021-03-22,1381,150.0,150.0,50,Adult,Mid
4,1005,50,Female,109557.0,52.0,Firefox,13,4,Desktop,2020-01-01,2020-08-07,1608,6760.0,520.0,4,Adult,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,1116,43,Male,63875.0,40.0,Chrome,6,2,Desktop,2020-01-01,2021-06-25,1286,2400.0,400.0,2,Adult,Mid
116,1117,58,Female,48760.0,43.0,Edge,6,6,Tablet,2020-01-01,2020-01-17,1811,2580.0,430.0,6,Adult,Mid
117,1118,59,Female,76213.0,28.0,Opera,18,38,Desktop,2020-01-01,2020-02-21,1776,5040.0,280.0,38,Adult,Mid
118,1119,48,Male,61526.0,15.0,Safari,14,5,Desktop,2020-01-01,2018-04-10,2458,2100.0,150.0,5,Adult,Mid


**Spending Categories**

In [204]:
df['SpendingCategory'] = pd.cut(df['SpendingScore'],
                                bins=[0, 30, 70, 100],
                                labels=['Low Spender', 'Medium Spender', 'High Spender'])
df

Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,SpendingScore,BrowserType,PurchaseFrequency,LastPurchaseDaysAgo,DeviceType,order_date,registration_date,CustomerTenure,TotalPurchaseAmount,AvgOrderValue,DaysSinceLastOrder,AgeGroup,IncomeBracket,SpendingCategory
0,1001,23,Male,118266.0,98.0,Edge,4,31,Tablet,2020-01-01,2020-06-04,1672,3920.0,980.0,31,Youth,High,High Spender
1,1002,57,Female,71930.0,39.0,Edge,12,10,Mobile,2020-01-01,2020-11-06,1517,4680.0,390.0,10,Adult,Mid,Medium Spender
2,1003,50,Female,113230.0,43.0,Opera,16,9,Mobile,2020-01-01,2021-12-18,1110,6880.0,430.0,9,Adult,High,Medium Spender
3,1004,50,Female,69083.0,15.0,Chrome,1,50,Mobile,2020-01-01,2021-03-22,1381,150.0,150.0,50,Adult,Mid,Low Spender
4,1005,50,Female,109557.0,52.0,Firefox,13,4,Desktop,2020-01-01,2020-08-07,1608,6760.0,520.0,4,Adult,High,Medium Spender
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,1116,43,Male,63875.0,40.0,Chrome,6,2,Desktop,2020-01-01,2021-06-25,1286,2400.0,400.0,2,Adult,Mid,Medium Spender
116,1117,58,Female,48760.0,43.0,Edge,6,6,Tablet,2020-01-01,2020-01-17,1811,2580.0,430.0,6,Adult,Mid,Medium Spender
117,1118,59,Female,76213.0,28.0,Opera,18,38,Desktop,2020-01-01,2020-02-21,1776,5040.0,280.0,38,Adult,Mid,Low Spender
118,1119,48,Male,61526.0,15.0,Safari,14,5,Desktop,2020-01-01,2018-04-10,2458,2100.0,150.0,5,Adult,Mid,Low Spender


**Optional: Interaction Feature (Example)**

In [205]:
# Income x SpendingScore (just an example interaction)
df['Income_Score'] = df['AnnualIncome'] * df['SpendingScore']
df

Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,SpendingScore,BrowserType,PurchaseFrequency,LastPurchaseDaysAgo,DeviceType,order_date,registration_date,CustomerTenure,TotalPurchaseAmount,AvgOrderValue,DaysSinceLastOrder,AgeGroup,IncomeBracket,SpendingCategory,Income_Score
0,1001,23,Male,118266.0,98.0,Edge,4,31,Tablet,2020-01-01,2020-06-04,1672,3920.0,980.0,31,Youth,High,High Spender,11590068.0
1,1002,57,Female,71930.0,39.0,Edge,12,10,Mobile,2020-01-01,2020-11-06,1517,4680.0,390.0,10,Adult,Mid,Medium Spender,2805270.0
2,1003,50,Female,113230.0,43.0,Opera,16,9,Mobile,2020-01-01,2021-12-18,1110,6880.0,430.0,9,Adult,High,Medium Spender,4868890.0
3,1004,50,Female,69083.0,15.0,Chrome,1,50,Mobile,2020-01-01,2021-03-22,1381,150.0,150.0,50,Adult,Mid,Low Spender,1036245.0
4,1005,50,Female,109557.0,52.0,Firefox,13,4,Desktop,2020-01-01,2020-08-07,1608,6760.0,520.0,4,Adult,High,Medium Spender,5696964.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,1116,43,Male,63875.0,40.0,Chrome,6,2,Desktop,2020-01-01,2021-06-25,1286,2400.0,400.0,2,Adult,Mid,Medium Spender,2555000.0
116,1117,58,Female,48760.0,43.0,Edge,6,6,Tablet,2020-01-01,2020-01-17,1811,2580.0,430.0,6,Adult,Mid,Medium Spender,2096680.0
117,1118,59,Female,76213.0,28.0,Opera,18,38,Desktop,2020-01-01,2020-02-21,1776,5040.0,280.0,38,Adult,Mid,Low Spender,2133964.0
118,1119,48,Male,61526.0,15.0,Safari,14,5,Desktop,2020-01-01,2018-04-10,2458,2100.0,150.0,5,Adult,Mid,Low Spender,922890.0


**Data Check**

In [206]:
df[['CustomerTenure', 'AvgOrderValue', 'DaysSinceLastOrder', 'AgeGroup',
    'IncomeBracket', 'SpendingCategory', 'Income_Score']].head()


Unnamed: 0,CustomerTenure,AvgOrderValue,DaysSinceLastOrder,AgeGroup,IncomeBracket,SpendingCategory,Income_Score
0,1672,980.0,31,Youth,High,High Spender,11590068.0
1,1517,390.0,10,Adult,Mid,Medium Spender,2805270.0
2,1110,430.0,9,Adult,High,Medium Spender,4868890.0
3,1381,150.0,50,Adult,Mid,Low Spender,1036245.0
4,1608,520.0,4,Adult,High,Medium Spender,5696964.0
