In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans

In [6]:
# Read the semicolon-delimited source file into the working DataFrame.
# NOTE(review): this is an absolute, machine-specific path, while the save
# cell writes to the relative '../data/' directory -- confirm whether both
# should share one configurable data directory.
raw_csv_path = r"E:\test1\data\bank-additional.csv"
df = pd.read_csv(raw_csv_path, sep=';')

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no


In [9]:
# Column dtypes and non-null counts.
df.info()

# (rows, columns) of the loaded frame.
frame_shape = df.shape
print(frame_shape)

# Per-column count of missing (NaN) values, shown as the cell output.
# Only real NaNs are counted here; placeholder strings such as 'unknown'
# in the categorical columns are not.
null_counts = df.isna().sum()
null_counts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             4119 non-null   int64  
 1   job             4119 non-null   object 
 2   marital         4119 non-null   object 
 3   education       4119 non-null   object 
 4   default         4119 non-null   object 
 5   housing         4119 non-null   object 
 6   loan            4119 non-null   object 
 7   contact         4119 non-null   object 
 8   month           4119 non-null   object 
 9   day_of_week     4119 non-null   object 
 10  duration        4119 non-null   int64  
 11  campaign        4119 non-null   int64  
 12  pdays           4119 non-null   int64  
 13  previous        4119 non-null   int64  
 14  poutcome        4119 non-null   object 
 15  emp.var.rate    4119 non-null   float64
 16  cons.price.idx  4119 non-null   float64
 17  cons.conf.idx   4119 non-null   f

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [12]:
# List the distinct values of each categorical column of interest so that
# placeholder levels (e.g. 'unknown') are visible before encoding.
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan']
for col in categorical_columns:
    distinct_values = df[col].unique()
    print(f"{col}: {distinct_values}")

job: ['blue-collar' 'services' 'admin.' 'entrepreneur' 'self-employed'
 'technician' 'management' 'student' 'retired' 'housemaid' 'unemployed'
 'unknown']
marital: ['married' 'single' 'divorced' 'unknown']
education: ['basic.9y' 'high.school' 'university.degree' 'professional.course'
 'basic.6y' 'basic.4y' 'unknown' 'illiterate']
default: ['no' 'unknown' 'yes']
housing: ['yes' 'no' 'unknown']
loan: ['no' 'unknown' 'yes']


In [13]:
# 3.1 Create Age Groups
# Left-closed bins (right=False): [0, 25) young, [25, 45) middle_aged,
# [45, 65) senior, [65, inf) elderly.
# The top edge is open-ended on purpose: with a finite edge of 100 and
# right=False, any age >= 100 would fall outside every bin and silently
# become NaN in 'age_group'.
bins = [0, 25, 45, 65, np.inf]
labels = ['young', 'middle_aged', 'senior', 'elderly']
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

# Show how many customers land in each group.
df['age_group'].value_counts()


age_group
middle_aged    2708
senior         1255
young            98
elderly          58
Name: count, dtype: int64

In [None]:
# 4.1 Encode Categorical Variables Step by Step
# One dedicated LabelEncoder per column, so each fitted encoder remains
# available afterwards for mapping the integer codes back to the labels.
# NOTE(review): label encoding yields integer codes with no meaningful
# ordering for nominal columns such as 'job' -- confirm this is acceptable
# for the distance-based clustering that consumes these columns.
encoders = {'job': LabelEncoder(), 'education': LabelEncoder()}
for source_column, encoder in encoders.items():
    df[f'{source_column}_encoded'] = encoder.fit_transform(df[source_column])

# Keep the individually named encoders in the notebook namespace.
le_job = encoders['job']
le_education = encoders['education']

df.head()


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,age_group,job_encoded,education_encoded
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no,middle_aged,1,2
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no,middle_aged,7,3
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no,middle_aged,7,3
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,nonexistent,1.4,94.465,-41.8,4.959,5228.1,no,middle_aged,7,2
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no,senior,0,6


In [16]:
# 5.1 Prepare Features
# Clustering inputs: raw age plus the integer codes written by the
# label-encoding cell. fillna(0) replaces any missing value with 0.
# NOTE(review): the features are not standardised and sit on different
# scales (age in years vs. small integer codes), so distances are probably
# dominated by age -- confirm whether scaling is wanted before clustering.
features = ['age', 'job_encoded', 'education_encoded']
X = df[features].fillna(0)

# 5.2 K-means Clustering
# Four clusters; the fixed random_state seeds the centroid initialisation.
kmeans = KMeans(n_clusters=4, random_state=42)
df['customer_segment'] = kmeans.fit_predict(X)

# 5.3 Analyze Customer Segments
print("Customer Segment Analysis:")
print("=" * 40)


def _most_common(values):
    """Return the first mode of `values`, or 'unknown' if there is no mode."""
    modes = values.mode()  # evaluated once per group instead of twice
    return modes.iloc[0] if not modes.empty else 'unknown'


# Group once and reuse the grouping for every statistic, instead of
# re-filtering the whole frame twice for each segment inside the loop.
by_segment = df.groupby('customer_segment')

# Numeric analysis: rounded mean age and row count per segment
# (kept under its original name for inspection).
numeric_analysis = by_segment.agg({
    'age': ['mean', 'count']
}).round(1)

# Most common job per segment
job_analysis = by_segment['job'].agg(_most_common)

# Unrounded per-segment statistics used for the printed report.
segment_sizes = by_segment.size()
segment_avg_age = by_segment['age'].mean()

print("Segment sizes and average age:")
# The groupby index is sorted by segment label, so segments print in
# ascending order.
for segment in segment_sizes.index:
    size = segment_sizes[segment]
    avg_age = segment_avg_age[segment]
    common_job = job_analysis[segment]
    print(f"Segment {segment}: {size} customers, avg age {avg_age:.1f}, common job: {common_job}")



Customer Segment Analysis:
Segment sizes and average age:
Segment 0: 793 customers, avg age 56.2, common job: retired
Segment 1: 832 customers, avg age 31.4, common job: technician
Segment 2: 1239 customers, avg age 32.1, common job: admin.
Segment 3: 1255 customers, avg age 43.6, common job: blue-collar


In [17]:
# 6.1 Save to CSV for later phases
# Write the enriched frame (including the derived columns added above)
# without the positional index, then confirm on stdout.
processed_csv_path = '../data/processed_data.csv'
df.to_csv(processed_csv_path, index=False)
print('Processed data saved.')


Processed data saved.


In [19]:
# Summarize key stats by segment (CORRECTED)
def get_mode(x):
    """Return the most frequent value of `x`.

    When several values tie, the first entry of ``x.mode()`` is returned.
    If ``x.mode()`` is empty, the string 'unknown' is returned instead.
    """
    modes = x.mode()
    if modes.empty:
        return 'unknown'
    return modes.iloc[0]

# Per-segment profile: mean age plus the most frequent job and education.
# round(1) trims the numeric column(s) to one decimal for display.
summary_spec = {
    'age': 'mean',
    'job': get_mode,
    'education': get_mode,
}
segment_summary = df.groupby('customer_segment').agg(summary_spec).round(1)

display(segment_summary)

# Additional segment insights: number of customers per segment, ordered by
# segment label.
print("\nSegment Size Distribution:")
segment_counts = df['customer_segment'].value_counts().sort_index()
print(segment_counts)



Unnamed: 0_level_0,age,job,education
customer_segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,56.2,retired,university.degree
1,31.4,technician,high.school
2,32.1,admin.,university.degree
3,43.6,blue-collar,university.degree



Segment Size Distribution:
customer_segment
0     793
1     832
2    1239
3    1255
Name: count, dtype: int64
