### Feature Extraction

In [2]:
# Import libraries
import pandas as pd
import numpy as np


In [4]:
# Load the retail customer dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer='retail_customer.csv')

# Preview the first five rows
df.head()

Unnamed: 0,customer_id,name,age,gender,signup_date,last_purchase_date,total_purchases,total_spent,country,email,device_type,is_subscribed,feedback_score
0,5c7825f3-13bb-4843-9cf1-5b7a944def13,Andrew Miller,56,Male,2025-01-01,2024-06-12,18,6143.01,Bangladesh,qwilson@example.org,Mobile,1,2.7
1,ad1cf2e3-be49-4031-9770-c308b2d1292c,Kevin Ramos,69,Other,2025-05-16,2024-10-03,86,9498.14,Eritrea,shall@example.net,Mobile,1,1.6
2,33194def-ebb0-4d75-8942-9c83d7d579f2,John Smith,46,Other,2025-04-15,2025-04-27,34,4026.83,Russian Federation,leebrian@example.org,Mobile,0,3.7
3,6a1ce340-ec27-482e-a339-e89e2db11a01,Dustin Nolan,32,Male,2023-10-10,2024-10-20,8,9545.46,Serbia,sguzman@example.org,Mobile,1,1.9
4,3e529d86-76ed-4c85-83cf-c5ab446cb3c0,Amy Johnson,60,Female,2024-02-02,2025-03-03,40,1436.51,Mauritius,griffithsarah@example.org,Mobile,1,1.4


In [5]:
# Show the frame's dimensions as a (rows, columns) tuple
df.shape

(500, 13)

In [6]:
# Inspect the dtype pandas inferred for each column right after loading
df.dtypes

customer_id            object
name                   object
age                     int64
gender                 object
signup_date            object
last_purchase_date     object
total_purchases         int64
total_spent           float64
country                object
email                  object
device_type            object
is_subscribed           int64
feedback_score        float64
dtype: object

In [8]:
# Bin age into labelled groups using explicit bin edges

bins = [10, 30, 50, 70]  # bin edges (years); consecutive edges delimit one interval
labels = ["Young", "Mid", "Seniors"]  # one label per interval, in edge order

# pd.cut uses the edges exactly as given (it only derives equal-width bins
# itself when `bins` is an integer). Resulting intervals:
#   [10, 30] -> Young, (30, 50] -> Mid, (50, 70] -> Seniors
# include_lowest=True closes the first interval on the left so that age == 10
# lands in "Young" instead of becoming NaN. Ages outside 10-70 still map to NaN.
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, include_lowest=True)

# Display the first few rows to sanity-check the mapping
df[['age', 'age_group']].head(10)

Unnamed: 0,age,age_group
0,56,Seniors
1,69,Seniors
2,46,Mid
3,32,Mid
4,60,Seniors
5,25,Young
6,38,Mid
7,56,Seniors
8,36,Mid
9,40,Mid


In [9]:
# Parse the date columns (loaded as strings) into datetime values

for date_col in ('signup_date', 'last_purchase_date'):
    df[date_col] = pd.to_datetime(df[date_col])

In [10]:
# Re-check dtypes to confirm the date columns were converted and age_group was added
df.dtypes

customer_id                   object
name                          object
age                            int64
gender                        object
signup_date           datetime64[ns]
last_purchase_date    datetime64[ns]
total_purchases                int64
total_spent                  float64
country                       object
email                         object
device_type                   object
is_subscribed                  int64
feedback_score               float64
age_group                   category
dtype: object

In [None]:
# Extract time-based features

# Read the clock once so both "days since" columns are measured against the
# same reference instant (two separate calls could straddle midnight).
# These values depend on the day the cell is run.
reference_time = pd.Timestamp.today()

# The .dt accessor exposes datetime components (year, month, ...) per row
df['signup_year'] = df['signup_date'].dt.year
df['signup_month'] = df['signup_date'].dt.month

# Whole days elapsed between each date and the reference instant
df['days_since_signup'] = (reference_time - df['signup_date']).dt.days
df['days_since_last_purchase'] = (reference_time - df['last_purchase_date']).dt.days

In [12]:
# Name features
# The vectorized .str accessor propagates missing or blank names as NaN,
# where a Python-level lambda would raise on them.
name_parts = df['name'].str.split()         # split each name on whitespace
df['first_name'] = name_parts.str[0]        # first token
df['last_name'] = name_parts.str[-1]        # last token (same as first for one-word names)
df['name_length'] = df['name'].str.len()    # character count, spaces included

In [13]:
# Flag customers whose total spend is strictly above 5000 (1 = high spender, 0 = not)

df['is_high_spender'] = df['total_spent'].gt(5000).astype(int)

In [16]:
# Display the frame, including the feature columns added so far
df

Unnamed: 0,customer_id,name,age,gender,signup_date,last_purchase_date,total_purchases,total_spent,country,email,...,feedback_score,age_group,signup_year,signup_month,days_since_signup,days_since_last_purchase,first_name,last_name,name_length,is_high_spender
0,5c7825f3-13bb-4843-9cf1-5b7a944def13,Andrew Miller,56,Male,2025-01-01,2024-06-12,18,6143.01,Bangladesh,qwilson@example.org,...,2.7,Seniors,2025,1,365,568,Andrew,Miller,13,1
1,ad1cf2e3-be49-4031-9770-c308b2d1292c,Kevin Ramos,69,Other,2025-05-16,2024-10-03,86,9498.14,Eritrea,shall@example.net,...,1.6,Seniors,2025,5,230,455,Kevin,Ramos,11,1
2,33194def-ebb0-4d75-8942-9c83d7d579f2,John Smith,46,Other,2025-04-15,2025-04-27,34,4026.83,Russian Federation,leebrian@example.org,...,3.7,Mid,2025,4,261,249,John,Smith,10,0
3,6a1ce340-ec27-482e-a339-e89e2db11a01,Dustin Nolan,32,Male,2023-10-10,2024-10-20,8,9545.46,Serbia,sguzman@example.org,...,1.9,Mid,2023,10,814,438,Dustin,Nolan,12,1
4,3e529d86-76ed-4c85-83cf-c5ab446cb3c0,Amy Johnson,60,Female,2024-02-02,2025-03-03,40,1436.51,Mauritius,griffithsarah@example.org,...,1.4,Seniors,2024,2,699,304,Amy,Johnson,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,cae07587-71ec-4ad9-95d3-1733b04039f6,Natalie Clark,65,Male,2024-04-12,2023-09-30,40,3550.88,Christmas Island,coxelizabeth@example.com,...,2.1,Seniors,2024,4,629,824,Natalie,Clark,13,0
496,91c6672b-6ace-4367-9e31-8bd1d8fea975,Jennifer Jones,42,Male,2024-07-18,2023-10-23,11,1676.95,Vanuatu,curtisjesse@example.org,...,3.7,Mid,2024,7,532,801,Jennifer,Jones,14,0
497,9521ecfa-af06-4865-9a4b-e70827ba2485,Jeremy Luna,57,Male,2025-04-20,2023-10-27,36,6089.04,Nauru,uwalker@example.org,...,1.1,Seniors,2025,4,256,797,Jeremy,Luna,11,1
498,28da13d6-c253-4279-b3aa-e41599598e9f,Leah Williams,62,Female,2022-06-20,2024-08-20,59,3249.30,British Virgin Islands,perryjacob@example.com,...,2.2,Seniors,2022,6,1291,499,Leah,Williams,13,0


In [23]:
# Flag customers whose total spend is strictly below 1000 (1 = low spender, 0 = not)

df['is_low_spender'] = df['total_spent'].lt(1000).astype(int)

In [25]:
# Display the frame again after adding the low-spender flag.
# NOTE(review): the saved output also lists a `low_spender` column that no
# visible cell creates; presumably left over from a removed cell. Confirm by
# restarting the kernel and re-running all cells.
df

Unnamed: 0,customer_id,name,age,gender,signup_date,last_purchase_date,total_purchases,total_spent,country,email,...,signup_year,signup_month,days_since_signup,days_since_last_purchase,first_name,last_name,name_length,is_high_spender,low_spender,is_low_spender
0,5c7825f3-13bb-4843-9cf1-5b7a944def13,Andrew Miller,56,Male,2025-01-01,2024-06-12,18,6143.01,Bangladesh,qwilson@example.org,...,2025,1,365,568,Andrew,Miller,13,1,0,0
1,ad1cf2e3-be49-4031-9770-c308b2d1292c,Kevin Ramos,69,Other,2025-05-16,2024-10-03,86,9498.14,Eritrea,shall@example.net,...,2025,5,230,455,Kevin,Ramos,11,1,0,0
2,33194def-ebb0-4d75-8942-9c83d7d579f2,John Smith,46,Other,2025-04-15,2025-04-27,34,4026.83,Russian Federation,leebrian@example.org,...,2025,4,261,249,John,Smith,10,0,0,0
3,6a1ce340-ec27-482e-a339-e89e2db11a01,Dustin Nolan,32,Male,2023-10-10,2024-10-20,8,9545.46,Serbia,sguzman@example.org,...,2023,10,814,438,Dustin,Nolan,12,1,0,0
4,3e529d86-76ed-4c85-83cf-c5ab446cb3c0,Amy Johnson,60,Female,2024-02-02,2025-03-03,40,1436.51,Mauritius,griffithsarah@example.org,...,2024,2,699,304,Amy,Johnson,11,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,cae07587-71ec-4ad9-95d3-1733b04039f6,Natalie Clark,65,Male,2024-04-12,2023-09-30,40,3550.88,Christmas Island,coxelizabeth@example.com,...,2024,4,629,824,Natalie,Clark,13,0,0,0
496,91c6672b-6ace-4367-9e31-8bd1d8fea975,Jennifer Jones,42,Male,2024-07-18,2023-10-23,11,1676.95,Vanuatu,curtisjesse@example.org,...,2024,7,532,801,Jennifer,Jones,14,0,0,0
497,9521ecfa-af06-4865-9a4b-e70827ba2485,Jeremy Luna,57,Male,2025-04-20,2023-10-27,36,6089.04,Nauru,uwalker@example.org,...,2025,4,256,797,Jeremy,Luna,11,1,0,0
498,28da13d6-c253-4279-b3aa-e41599598e9f,Leah Williams,62,Female,2022-06-20,2024-08-20,59,3249.30,British Virgin Islands,perryjacob@example.com,...,2022,6,1291,499,Leah,Williams,13,0,0,0


In [24]:
# Compare the two spender flags side by side for the first ten customers
spender_flag_columns = ['is_high_spender', 'is_low_spender']
df[spender_flag_columns].head(10)

Unnamed: 0,is_high_spender,is_low_spender
0,1,0
1,1,0
2,0,0
3,1,0
4,0,0
5,0,0
6,0,0
7,1,0
8,0,0
9,1,0
