In [None]:
# Import required libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
import numpy as np

# Step 1: Load the dataset
# NOTE: despite its name, `url` holds a filesystem path (mounted Drive), not a web URL.
url = "/content/drive/MyDrive/TY_sem5/bml_pract/adult.data"

# Define column names as per UCI documentation
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'income'
]

# The file has no header row, so the column names are supplied explicitly.
# The missing-value marker in this file shows up as ' ?' (a blank after the
# comma). skipinitialspace=True removes that blank from every field, so the
# marker is matched as a plain '?' and category strings load without a
# leading space (e.g. 'Private' instead of ' Private').
df = pd.read_csv(
    url,
    header=None,
    names=columns,
    skipinitialspace=True,
    na_values='?',
)

# Step 2: Data cleaning
df.dropna(inplace=True)  # Remove rows with missing values

# Step 3: Encode categorical variables
# One encoder instance is refitted for each text column and the strings are
# replaced in place by integer codes. After the loop `le` holds the fit of the
# last column processed only.
# NOTE(review): the integer codes are arbitrary category ids, yet the scaling
# and distance-based clustering that follow treat them as ordered numbers.
le = LabelEncoder()
categorical_cols = df.select_dtypes('object').columns
for col in categorical_cols:
    encoded = le.fit_transform(df[col])
    df[col] = encoded

# Step 4: Separate features and target
y = df['income']
X = df.drop(columns='income')

# Step 5: Standardize the feature values
# Fit the scaler on the feature table, then apply it to that same table.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Step 6: Apply K-Means clustering (2 clusters because income has 2 classes)
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, so leaving it out makes the result depend on the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Step 7: Adjust labels (KMeans cluster IDs don't correspond directly to income labels)
# Give every cluster the income class that is most frequent among its members.
# Indexing a plain NumPy array keeps the boolean mask purely positional, and
# iterating over the ids actually present avoids an argmax over an empty cluster.
y_true = np.asarray(y)
labels = np.zeros_like(clusters)
for i in np.unique(clusters):
    mask = (clusters == i)
    labels[mask] = np.bincount(y_true[mask]).argmax()

# Step 8: Compute accuracy
accuracy = accuracy_score(y, labels)
print(f"K-Means Clustering Accuracy: {accuracy * 100:.2f}%")

# Majority voting is not a one-to-one mapping: several clusters may receive the
# same class. When that happens every sample is predicted as that class and the
# figure above is just the share of the most frequent class, not a measure of
# how well the clusters separate the income groups.
if np.unique(labels).size < np.unique(y_true).size:
    print(
        "Warning: every cluster was mapped to the same income class; "
        "the accuracy above equals the majority-class baseline."
    )


K-Means Clustering Accuracy: 75.11%
