In [1]:
#Könyvtár behívása
import pandas as pd

In [5]:
# Read the dataset from a semicolon-delimited CSV file into a DataFrame.
ah = pd.read_csv("bank-full.csv", delimiter=";")

In [None]:
# Report the number of rows (first entry of the DataFrame's shape).
print(f"Sorok: {ah.shape[0]}")

Sorok: 45211


In [9]:
# Number of columns (second entry of the DataFrame's shape).
print(f"Változók (oszlopok száma): {ah.shape[1]}")

# Quick overview, printed one block per line group:
#   first rows, numeric summary, frequency of each 'job' value,
#   and summary statistics of the 'balance' column.
print(
    ah.head(),
    ah.describe(),
    ah['job'].value_counts(),
    ah['balance'].describe(),
    sep="\n",
)

Változók (oszlopok száma): 18
   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  \
0  unknown    5   may       261         1     -1         0  unknown  no   
1  unknown    5   may       151         1     -1         0  unknown  no   
2  unknown    5   may        76         1     -1         0  unknown  no   
3  unknown    5   may        92         1     -1         0  unknown  no   
4  unknown    5   may       198         1     -1         0  unknown  no   

   cluster  
0        1  
1        3  
2        0  
3        3  
4        

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Cluster the rows on two numeric columns: 'age' and 'balance'.
# Work on a copy so the feature matrix is decoupled from later changes to `ah`.
X = ah[['age', 'balance']].copy()

# Standardize both features (zero mean, unit variance) so that the column with
# the larger numeric range does not dominate the Euclidean distances K-Means uses.
# The fitted scaler is kept in a variable so scaled results (e.g. cluster centres)
# can be mapped back to original units with `scaler.inverse_transform`.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# `n_init` is set explicitly: its default changed across scikit-learn releases
# (10 -> 'auto'), so leaving it implicit makes the clustering version-dependent
# and triggers a FutureWarning on some versions. `random_state` fixes the seed.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# NOTE: this adds a 'cluster' column to `ah` in place. Any earlier cell that is
# re-run after this one (e.g. `ah.head()`) will show the extra column.
ah['cluster'] = kmeans.fit_predict(X_scaled)

# Per-cluster mean of the two clustering features, in original units.
print(ah.groupby('cluster')[['age', 'balance']].mean())

               age       balance
cluster                         
0        31.178757    859.105614
1        56.220818   1325.483730
2        43.057663  14019.775417
3        42.357469    905.164491


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Scatter plot of age vs. balance, one colour per K-Means cluster, with the
# cluster centroids overlaid in original (unscaled) units.
fig, ax = plt.subplots(figsize=(12, 8))

# Take the number of clusters from the fitted model instead of repeating a
# literal, so this cell keeps working if the model is refitted with another k.
n_clusters = kmeans.n_clusters
base_colors = ['red', 'blue', 'green', 'orange']
if n_clusters <= len(base_colors):
    colors = base_colors[:n_clusters]
else:
    # More clusters than hand-picked colours: fall back to a qualitative colormap.
    cmap = plt.get_cmap('tab20')
    colors = [cmap(k % cmap.N) for k in range(n_clusters)]
cluster_labels = [f'Cluster {k}' for k in range(n_clusters)]

# Plot each cluster as its own scatter series so it gets a legend entry.
for k in range(n_clusters):
    cluster_data = ah[ah['cluster'] == k]
    ax.scatter(cluster_data['age'], cluster_data['balance'],
               color=colors[k], label=cluster_labels[k], alpha=0.6, s=30)

# The model's centres are expressed in standardized units. Fit a scaler on the
# same two columns to recover their mean/scale and map the centres back.
# NOTE: this is only correct while these columns are unchanged since the model
# was fitted; otherwise the recovered mean/scale will not match.
centroids_scaled = kmeans.cluster_centers_
scaler = StandardScaler()
scaler.fit(ah[['age', 'balance']])
centroids = scaler.inverse_transform(centroids_scaled)

# Plot centroids on top of the points (high zorder).
ax.scatter(centroids[:, 0], centroids[:, 1],
           c='black', marker='X', s=300, edgecolors='white', linewidths=2,
           label='Centroids', zorder=5)

# Label each centroid marker with its cluster index.
for k, (x, y) in enumerate(centroids):
    ax.annotate(f'C{k}', (x, y), fontsize=12, fontweight='bold',
                ha='center', va='center', color='white')

ax.set_xlabel('Age', fontsize=12)
ax.set_ylabel('Balance', fontsize=12)
ax.set_title('Customer Segmentation: Age vs Balance (K-Means Clustering)',
             fontsize=14, fontweight='bold')
ax.legend(loc='upper right', fontsize=10)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Print centroid information as a small table. Header and rows share the same
# column widths and alignment so the numbers line up under their headings.
print("\nCluster Centroids (Original Scale):")
header = f"{'Cluster':<10} {'Age':>10} {'Balance':>12}"
print(header)
print("-" * len(header))
for k, (age, balance) in enumerate(centroids):
    row_label = f"Cluster {k}"
    print(f"{row_label:<10} {age:>10.2f} {balance:>12.2f}")