In [5]:
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [65]:
# Load seaborn's example "diamonds" dataset.
diamonds = sns.load_dataset("diamonds")

# The "cut" column is the classification target; all remaining columns are features.
y = diamonds["cut"]
x = diamonds.drop(columns=["cut"])


In [13]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Inspect the dtype of every feature column in the training split.
# A bare `x_train.head()` used to precede this line, but a notebook cell only
# displays its final expression, so that preview was computed and discarded.
x_train.dtypes

carat       float64
color      category
clarity    category
depth       float64
table       float64
price         int64
x           float64
y           float64
z           float64
dtype: object

In [31]:
# Names of the feature columns stored with pandas' `category` dtype.
# `.columns` is an Index; `list(...)` turns it into a plain Python list.
# NOTE: the identifier's spelling is kept as-is because later cells refer to it.
catrgorical_features = list(x.select_dtypes(include="category").columns)

In [33]:
# Display the list of categorical feature names as the cell output.
catrgorical_features

['color', 'clarity']

In [35]:
# Names of the numeric feature columns (float64 / int64 dtypes).
# Reads from the lowercase `x` feature frame; the previous `X` is never
# assigned in this notebook and raises NameError on a fresh kernel.
numerical_features = x.select_dtypes(
    include=["float64", "int64"]
).columns.tolist()

In [37]:
# Display the list of numeric feature names as the cell output.
numerical_features

['carat', 'depth', 'table', 'price', 'x', 'y', 'z']

In [47]:
# Per-column preprocessing: one-hot encode the categorical features and
# standardize the numeric ones.
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category that was absent when the encoder
        # was fitted (e.g. missing from one cross-validation training fold) is
        # encoded as all zeros at transform time instead of raising an error.
        ("cat", OneHotEncoder(handle_unknown="ignore"), catrgorical_features),
        ("num", StandardScaler(), numerical_features),
    ]
)

In [49]:
# Bundle the preprocessing step and the gradient-boosting classifier into a
# single estimator; the classifier is seeded for repeatable results.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", GradientBoostingClassifier(random_state=42)),
    ]
)

In [53]:
# 5-fold cross-validation of the whole pipeline, using the training split only.
cv_scores = cross_val_score(
    estimator=pipeline,
    X=x_train,
    y=y_train,
    cv=5,
)

In [54]:
# Fit the pipeline on the full training split.
pipeline.fit(X=x_train, y=y_train)


In [55]:
# Predict the cut for every row of the held-out test split.
y_pred = pipeline.predict(X=x_test)

In [56]:
# Text summary of per-class precision / recall / F1 on the test split.
report = classification_report(y_true=y_test, y_pred=y_pred)

In [57]:
# Report the mean cross-validation score (4 decimal places), then the
# test-set classification report under its own heading.
print(f"Mean Cross-Validation Accuracy: {cv_scores.mean():.4f}")
print("\nClassification Report:", report, sep="\n")

Mean Cross-Validation Accuracy: 0.7597

Classification Report:
              precision    recall  f1-score   support

        Fair       0.89      0.91      0.90       335
        Good       0.81      0.64      0.71      1004
       Ideal       0.82      0.91      0.86      4292
     Premium       0.69      0.86      0.77      2775
   Very Good       0.66      0.40      0.50      2382

    accuracy                           0.76     10788
   macro avg       0.78      0.74      0.75     10788
weighted avg       0.75      0.76      0.74     10788

