In [8]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report


In [9]:
# Load the raw file and turn it into a numeric table.
# The file is read with pandas' default delimiter; the code below then treats
# the first column as text whose fields are joined by ';'.
df = pd.read_csv("/content/beauty.csv")

# Split that column on ';', label the ten resulting fields, and convert every
# field from text to a numeric dtype (a non-numeric value raises here).
df_split = (
    df.iloc[:, 0]
    .str.split(';', expand=True)
    .set_axis(
        ['wage', 'exper', 'union', 'goodhlth', 'black',
         'female', 'married', 'service', 'educ', 'looks'],
        axis=1,
    )
    .apply(pd.to_numeric)
)

In [10]:
# Step 2: Clustering
# Standardize the variables and assign every row to one of three K-Means
# clusters (seeded, 10 initialisations).
#
# A later cell stores the labels in df_split['cluster']. That column is
# excluded here (if it already exists) so that re-running this cell does not
# feed the previous labels back into the scaler and K-Means as a feature.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_split.drop(columns='cluster', errors='ignore'))
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)

In [11]:
# Store each row's K-Means label on the frame next to the original variables.
df_split['cluster'] = clusters

# Target is the cluster label; predictors are every remaining column.
y = df_split.loc[:, 'cluster']
X = df_split.drop(columns=['cluster'])

In [12]:
# Display the frame, now including the 'cluster' column, for a visual check.
df_split

Unnamed: 0,wage,exper,union,goodhlth,black,female,married,service,educ,looks,cluster
0,5.73,30,0,1,0,1,1,1,14,4,0
1,4.28,28,0,1,0,1,1,0,12,3,1
2,7.96,35,0,1,0,1,0,0,10,4,1
3,11.57,38,0,1,0,0,1,1,16,3,0
4,11.42,27,0,1,0,0,1,0,16,3,1
...,...,...,...,...,...,...,...,...,...,...,...
1255,1.61,25,0,1,1,1,0,1,12,3,2
1256,1.68,4,0,1,0,1,1,1,12,2,0
1257,3.29,35,0,1,1,1,0,1,12,3,2
1258,2.31,15,0,1,1,1,1,1,10,3,2


In [13]:
# Step 3: Classification
# Hold out 20% of the rows for the final evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate classifiers, each paired with the hyper-parameter grid to search.
# Grid keys carry the 'classifier__' prefix because every model is later
# wrapped in a Pipeline whose final step is named 'classifier'.
# The tree-based models are seeded so that the selected parameters and the
# reported scores are the same on every run, like the seeded split above.
models = {
    "LogisticRegression": (LogisticRegression(max_iter=1000), {
        'classifier__C': [0.1, 1, 10]
    }),
    "DecisionTree": (DecisionTreeClassifier(random_state=42), {
        'classifier__max_depth': [3, 5, 10]
    }),
    "RandomForest": (RandomForestClassifier(random_state=42), {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [5, 10]
    }),
    "SVM": (SVC(), {
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf']
    })
}

In [14]:
# For every candidate: wrap it in a scale-then-classify pipeline, run a 5-fold
# accuracy-scored grid search on the training split, and evaluate the tuned
# search object on the held-out test split.
results = {}
for name in models:
    model, params = models[name]
    pipe = Pipeline(steps=[('scaler', StandardScaler()), ('classifier', model)])
    grid = GridSearchCV(
        estimator=pipe,
        param_grid=params,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,  # use all available cores
    )
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    results[name] = dict(
        best_params=grid.best_params_,
        classification_report=classification_report(y_test, y_pred),
    )

# Print results
for model_name in results:
    result = results[model_name]
    print(f"\nModel: {model_name}")
    print(f"Best Params: {result['best_params']}")
    print(f"Classification Report:\n {result['classification_report']}")


Model: LogisticRegression
Best Params: {'classifier__C': 0.1}
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99        68
           1       1.00      0.99      1.00       167
           2       1.00      1.00      1.00        17

    accuracy                           1.00       252
   macro avg       1.00      1.00      1.00       252
weighted avg       1.00      1.00      1.00       252


Model: DecisionTree
Best Params: {'classifier__max_depth': 3}
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99        68
           1       1.00      0.99      1.00       167
           2       1.00      1.00      1.00        17

    accuracy                           1.00       252
   macro avg       1.00      1.00      1.00       252
weighted avg       1.00      1.00      1.00       252


Model: RandomForest
Best Params: {'classifier__max_depth': 5, '