In [55]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Lazily enumerate every file below the Kaggle input directory and print
# its full path, one per line, in os.walk traversal order.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

/kaggle/input/playground-series-s4e3/sample_submission.csv
/kaggle/input/playground-series-s4e3/train.csv
/kaggle/input/playground-series-s4e3/test.csv


In [56]:
# Load the competition train and test CSV files, then preview the first
# rows of the training frame.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e3/train.csv',
        '/kaggle/input/playground-series-s4e3/test.csv',
    )
)

train_data.head()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0,584,590,909972,909977,16,8,5,2274,113,...,-0.5,-0.0104,0.1417,0,0,0,1,0,0,0
1,1,808,816,728350,728372,433,20,54,44478,70,...,0.7419,-0.2997,0.9491,0,0,0,0,0,0,1
2,2,39,192,2212076,2212144,11388,705,420,1311391,29,...,-0.0105,-0.0944,1.0,0,0,1,0,0,0,0
3,3,781,789,3353146,3353173,210,16,29,3202,114,...,0.6667,-0.0402,0.4025,0,0,1,0,0,0,0
4,4,1540,1560,618457,618502,521,72,67,48231,82,...,0.9158,-0.2455,0.9998,0,0,0,0,0,0,1


In [68]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score
import xgboost as xgb

In [72]:
class config:
    """Notebook-wide settings: data location, seed, fold count and target names."""

    # Directory that holds the competition CSV files.
    root = '/kaggle/input/playground-series-s4e3'
    # Random seed handed to the XGBoost models.
    seed = 42
    # Intended number of cross-validation folds.
    num_fold = 5
    # Target column names, spelled exactly as in the dataset header
    # (including 'K_Scatch').
    targets = [
        'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
        'Dirtiness', 'Bumps', 'Other_Faults',
    ]
    # Number of target columns (7).
    num_class = len(targets)

In [58]:
# Null-value check (kept for reference) -------------------------------------------
# train_data.isnull().sum()
# test_data.isnull().sum()
# Author's finding: the dataset has no null values, so no imputation is needed.

# Duplicate-row check (kept for reference) ----------------------------------------
# sum(train_data.drop(columns='id').duplicated())
# sum(test_data.drop(columns='id').duplicated())
# Author's finding: there are no duplicated rows, so nothing has to be removed.


# Report, for every training column, how many distinct values it holds and
# the smallest and largest value it takes.
for column, series in train_data.items():
    unique_values = len(series.unique())
    min_val, max_val = series.min(), series.max()

    print(f"Unique values in column '{column}': {unique_values}. range({min_val} - {max_val})")

Unique values in column 'id': 19219. range(0 - 19218)
Unique values in column 'X_Minimum': 1191. range(0 - 1705)
Unique values in column 'X_Maximum': 1259. range(4 - 1713)
Unique values in column 'Y_Minimum': 3345. range(6712 - 12987661)
Unique values in column 'Y_Maximum': 3341. range(6724 - 12987692)
Unique values in column 'Pixels_Areas': 1154. range(6 - 152655)
Unique values in column 'X_Perimeter': 460. range(2 - 7553)
Unique values in column 'Y_Perimeter': 331. range(1 - 903)
Unique values in column 'Sum_of_Luminosity': 2595. range(250 - 11591414)
Unique values in column 'Minimum_of_Luminosity': 162. range(0 - 196)
Unique values in column 'Maximum_of_Luminosity': 98. range(39 - 253)
Unique values in column 'Length_of_Conveyer': 99. range(1227 - 1794)
Unique values in column 'TypeOfSteel_A300': 2. range(0 - 1)
Unique values in column 'TypeOfSteel_A400': 2. range(0 - 1)
Unique values in column 'Steel_Plate_Thickness': 27. range(40 - 300)
Unique values in column 'Edges_Index': 1849.

In [59]:
# Start of data pre-processing (the data is inspected in the cells above).

# Label columns; once `id` is removed, every other column is a feature.
target_columns = ['Pastry', 'Z_Scratch', 'K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']

# `id` is removed from both frames; the labels are copied out of the
# training frame before they are dropped from it.
test_data = test_data.drop(columns='id')
train_target = train_data.loc[:, target_columns].copy()
train_data = train_data.drop(columns=['id', *target_columns])

# Hold out 20% of the training rows for validation.
x_train, x_test, y_train, y_test = train_test_split(
    train_data,
    train_target,
    test_size=0.2,
    random_state=42,
)

# Show the shape of each split.
for split_name, split_frame in (
    ('x_train', x_train),
    ('x_test', x_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(f'{split_name}: ', split_frame.shape)

x_train:  (15375, 27)
x_test:  (3844, 27)
y_train:  (15375, 7)
y_test:  (3844, 7)


In [69]:
# Stratified K-fold cross-validation (keeps the class proportions of the
# stratification labels similar across folds). The fold count and seed are
# read from `config` instead of being repeated as literals, so this cell
# cannot drift from the tuning and final-model cells that also use `config`.
cv = StratifiedKFold(n_splits=config.num_fold, shuffle=True, random_state=config.seed)

# Baseline XGBoost classifier: no hyper-parameters are set beyond the seed,
# objective and evaluation metric.
model = xgb.XGBClassifier(seed=config.seed,
                          objective='binary:logistic',
                          eval_metric='logloss')

# Placeholder preprocessor: no transformer is registered, so with
# remainder='passthrough' every column is forwarded unchanged.
# NOTE(review): the commented-out entries reference `df_numeric` and
# `encode_col`, which are not defined in this notebook.
preprocessor = ColumnTransformer(
    transformers=[

        #('num', StandardScaler(), list(df_numeric.columns)),
        #('nom', OneHotEncoder(), encode_col),
    ],
    remainder='passthrough'  # Preserve other columns not specified
)

# Pipeline that currently wraps only the classifier; the preprocessing step
# is disabled.
pipeline = Pipeline([
    #('preprocessor', preprocessor),
    ('classifier', model )
])

In [73]:
# Cross-validated ROC AUC of the baseline pipeline.
# Folds are stratified on the first target column only.
auc_scores = []
for train_idx, val_idx in cv.split(x_train, y_train.iloc[:, 0]):
    X_train_fold, X_val_fold = x_train.iloc[train_idx], x_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

    pipeline.fit(X_train_fold, y_train_fold)

    # Predict through the pipeline rather than the bare model, so that a
    # preprocessing step added to the pipeline later is applied to the
    # validation fold as well.
    y_pred = pipeline.predict_proba(X_val_fold)

    # One AUC per target column; column i of the prediction matrix is scored
    # against column i of the labels.
    auc_scores.extend(
        roc_auc_score(y_val_fold.iloc[:, i], y_pred[:, i])
        for i in range(len(config.targets))
    )

# Aggregate once, after all folds have been scored.
avg_auc = np.mean(auc_scores)
std_auc = np.std(auc_scores)
print(avg_auc)


0.871407162465879


In [75]:
import optuna

# Define the objective function to optimize
def objective(trial, X, y, cv):
    """Return the mean cross-validated ROC AUC for one hyper-parameter trial.

    Parameters
    ----------
    trial :
        Object providing ``suggest_int``, ``suggest_float`` and
        ``suggest_categorical`` (e.g. an Optuna trial); it supplies the
        XGBoost hyper-parameters to evaluate.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.DataFrame
        Target matrix with one column per label. The first column is the
        one used to stratify the folds.
    cv :
        Splitter exposing ``split(X, labels)`` that yields positional
        ``(train_idx, val_idx)`` pairs.

    Returns
    -------
    float
        Mean of the per-target, per-fold ROC AUC scores.
    """
    # `xgb.XGBClassifier` is used because only `import xgboost as xgb` is in
    # scope; the bare name `XGBClassifier` is not defined in this notebook.
    model = xgb.XGBClassifier(
        seed=config.seed,
        objective='binary:logistic',
        eval_metric='logloss',

        max_depth=trial.suggest_int('max_depth', 3, 15),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        n_estimators=trial.suggest_int('n_estimators', 30, 1500),
        subsample=trial.suggest_float('subsample', 0.05, 1.0),
        gamma=trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.05, 1.0),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-4, 100, log=True),
        reg_alpha=trial.suggest_float('reg_alpha', 1e-4, 100, log=True),
        learning_rate=trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
        grow_policy=trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
    )

    pipeline = Pipeline([
        #('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # The number of targets is taken from `y` itself rather than a literal 7.
    num_targets = y.shape[1]

    # Evaluate the model using cross-validation
    auc_scores = []
    for train_idx, val_idx in cv.split(X, y.iloc[:, 0]):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        pipeline.fit(X_train_fold, y_train_fold)

        # Predict through the pipeline so that any preprocessing step added
        # to it is also applied to the validation fold.
        y_pred = pipeline.predict_proba(X_val_fold)

        # One AUC per target column for this fold.
        for i in range(num_targets):
            auc_scores.append(roc_auc_score(y_val_fold.iloc[:, i], y_pred[:, i]))

    avg_auc = np.mean(auc_scores)
    std_auc = np.std(auc_scores)

    print(f'Average valid cv AUC: {avg_auc}+-{std_auc}')

    return avg_auc

In [77]:
# Hyper-parameters for the final model, hard-coded here.
# NOTE(review): presumably the best values found by a tuning run — confirm.
best_params = dict(
    max_depth=8,
    min_child_weight=10,
    n_estimators=922,
    subsample=0.9469112904217117,
    gamma=1.3630585059097634e-06,
    colsample_bytree=0.3116083899986368,
    reg_lambda=0.018064066341669686,
    reg_alpha=0.5989399171181504,
    learning_rate=0.0058073068193923005,
    grow_policy='lossguide',
)

# Placeholder preprocessor with no registered transformers; it is built but
# not included in the pipeline below.
preprocessor = ColumnTransformer(
    transformers=[
        #('num', StandardScaler(), list(df_numeric.columns)),
        #('nom', OneHotEncoder(), encode_col),
    ],
    remainder='passthrough',  # Preserve other columns not specified
)

# Final classifier: same seed, objective and metric as the baseline, plus
# the hyper-parameters above.
best_model = xgb.XGBClassifier(
    seed=config.seed,
    objective='binary:logistic',
    eval_metric='logloss',
    **best_params,
)

# Pipeline containing only the classifier step.
opt_pipeline = Pipeline(steps=[
    #('preprocessor', preprocessor),
    ('classifier', best_model),
])

# Fit on the training portion of the hold-out split.
opt_pipeline.fit(x_train, y_train)

In [78]:
# Score the competition test set and fill in the submission template.
# test.csv is read with its first column (`id`) as the index, so only
# feature columns are passed to the model.
df_test = pd.read_csv(os.path.join(config.root,'test.csv'),index_col=0)
X_test = df_test.copy()

# The predictions get their own name so that the hold-out labels stored in
# `y_test` by the train/validation split are not overwritten.
test_proba = opt_pipeline.predict_proba(X_test)

result = pd.read_csv(os.path.join(config.root,'sample_submission.csv'))

# Probabilities are assigned positionally, so the submission ids must be in
# the same order as the test rows; fail loudly if they are not.
assert np.array_equal(result['id'].to_numpy(), df_test.index.to_numpy()), \
    'sample_submission.csv ids do not match the row order of test.csv'

result[config.targets] = test_proba
result.head()

Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.401414,0.004762,0.008311,0.001141,0.024224,0.142084,0.478875
1,19220,0.298041,0.033706,0.006911,0.001157,0.12803,0.187572,0.315969
2,19221,0.003398,0.039123,0.04636,0.001602,0.007329,0.283642,0.45037
3,19222,0.109244,0.011799,0.002031,0.003109,0.013618,0.393518,0.447191
4,19223,0.003994,0.01319,0.002861,0.002766,0.010694,0.598851,0.294611


In [80]:
# Write the prediction table to submission.csv without the DataFrame index.
result.to_csv('submission.csv',index=False)