In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import roc_auc_score

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from catboost import CatBoostClassifier
from sklearn.multioutput import MultiOutputClassifier

In [3]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [4]:
# Split the data into features and target
target_features = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

In [5]:
# Split the data into features and target
X_train = train.drop(['id'] + target_features, axis=1)
X_test = test.drop(['id'], axis=1)

y_train = train[target_features].values

In [6]:
# Define features
features = X_train.columns

# Create a column transformer for one-hot encoding and standard scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), features)
    ]
)

# Create a pipeline with the preprocessor and the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', MultiOutputClassifier(CatBoostClassifier(random_seed=42, verbose=0)))
])


In [7]:
# Fit the model
pipeline.fit(X_train, y_train)

# make probability predictions
preds = pipeline.predict_proba(X_test)

In [24]:
preds[0][1]

array([0.87555497, 0.12444503])

In [25]:
# If `preds` is a list of arrays (for models like RandomForest), we need to extract the positive class probabilities (class=1).
# If your model returns a single array, this step can be skipped.
if isinstance(preds, list):
    preds = np.column_stack([p[:, 1] for p in preds])

In [28]:
# Calculate the RMSLE
train_preds = pipeline.predict(X_train)

In [29]:
# Calculate the AUC score for each of the 7 classes
auc_scores = []
for i, col in enumerate(target_features):
    # For Random Forest or any tree-based model, `predict_proba` returns a list of arrays
    # Extract the probabilities of the positive class (index=1) for each class
    if isinstance(preds, list):
        auc = roc_auc_score(y_train[:, i], train_preds[i][:, 1])
    else:
        auc = roc_auc_score(y_train[:, i], train_preds[:, i])
    auc_scores.append(auc)
    print(f"{col} - AUC={auc:.4f}")

# Final average AUC score
final_score = np.mean(auc_scores)
print(f"Final Average AUC={final_score:.4f}")

Pastry - AUC=0.7173
Z_Scratch - AUC=0.9104
K_Scatch - AUC=0.9693
Stains - AUC=0.9974
Dirtiness - AUC=0.7763
Bumps - AUC=0.7616
Other_Faults - AUC=0.7234
Final Average AUC=0.8365


In [26]:
# Prepare the submission dataframe
submission = pd.DataFrame({
    'id': test['id']
})

submission[target_features] = preds

In [27]:
# Save the submission file
submission.to_csv('../output/submission.csv', index=False)
print("Submission is successfully saved!")

Submission is successfully saved!
