In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier

In [2]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
target = ["Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults"]
y = train_data[target]
train_data = train_data.drop(target, axis=1)
train_data.head()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
0,0,584,590,909972,909977,16,8,5,2274,113,...,0.0059,1.0,1.0,0.0,1.2041,0.9031,0.699,-0.5,-0.0104,0.1417
1,1,808,816,728350,728372,433,20,54,44478,70,...,0.0044,0.25,1.0,1.0,2.6365,0.7782,1.7324,0.7419,-0.2997,0.9491
2,2,39,192,2212076,2212144,11388,705,420,1311391,29,...,0.1077,0.2363,0.3857,0.0,4.0564,2.179,2.2095,-0.0105,-0.0944,1.0
3,3,781,789,3353146,3353173,210,16,29,3202,114,...,0.0044,0.375,0.931,1.0,2.3222,0.7782,1.4314,0.6667,-0.0402,0.4025
4,4,1540,1560,618457,618502,521,72,67,48231,82,...,0.0192,0.2105,0.9861,1.0,2.7694,1.415,1.8808,0.9158,-0.2455,0.9998


In [3]:
train_data.isnull().sum()

id                       0
X_Minimum                0
X_Maximum                0
Y_Minimum                0
Y_Maximum                0
Pixels_Areas             0
X_Perimeter              0
Y_Perimeter              0
Sum_of_Luminosity        0
Minimum_of_Luminosity    0
Maximum_of_Luminosity    0
Length_of_Conveyer       0
TypeOfSteel_A300         0
TypeOfSteel_A400         0
Steel_Plate_Thickness    0
Edges_Index              0
Empty_Index              0
Square_Index             0
Outside_X_Index          0
Edges_X_Index            0
Edges_Y_Index            0
Outside_Global_Index     0
LogOfAreas               0
Log_X_Index              0
Log_Y_Index              0
Orientation_Index        0
Luminosity_Index         0
SigmoidOfAreas           0
dtype: int64

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19219 entries, 0 to 19218
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     19219 non-null  int64  
 1   X_Minimum              19219 non-null  int64  
 2   X_Maximum              19219 non-null  int64  
 3   Y_Minimum              19219 non-null  int64  
 4   Y_Maximum              19219 non-null  int64  
 5   Pixels_Areas           19219 non-null  int64  
 6   X_Perimeter            19219 non-null  int64  
 7   Y_Perimeter            19219 non-null  int64  
 8   Sum_of_Luminosity      19219 non-null  int64  
 9   Minimum_of_Luminosity  19219 non-null  int64  
 10  Maximum_of_Luminosity  19219 non-null  int64  
 11  Length_of_Conveyer     19219 non-null  int64  
 12  TypeOfSteel_A300       19219 non-null  int64  
 13  TypeOfSteel_A400       19219 non-null  int64  
 14  Steel_Plate_Thickness  19219 non-null  int64  
 15  Ed

In [5]:
train_data["X_range"] = train_data["X_Maximum"] - train_data["X_Minimum"]
test_data["X_range"] = test_data["X_Maximum"] - test_data["X_Minimum"]
train_data["Y_range"] = train_data["Y_Maximum"] - train_data["Y_Minimum"]
test_data["Y_range"] = test_data["Y_Maximum"] - test_data["Y_Minimum"]

train_data['Area_Perimeter_Ratio'] = train_data["Pixels_Areas"] / (
        train_data["X_Perimeter"] + train_data["Y_Perimeter"])
test_data['Area_Perimeter_Ratio'] = test_data["Pixels_Areas"] / (test_data["X_Perimeter"] + test_data["Y_Perimeter"])

train_data['Luminosity_Range'] = (train_data['Maximum_of_Luminosity'] - train_data['Minimum_of_Luminosity'])
test_data['Luminosity_Range'] = (test_data['Maximum_of_Luminosity'] - test_data['Minimum_of_Luminosity'])



In [6]:
train_data.head()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,X_range,Y_range,Area_Perimeter_Ratio,Luminosity_Range
0,0,584,590,909972,909977,16,8,5,2274,113,...,1.2041,0.9031,0.699,-0.5,-0.0104,0.1417,6,5,1.230769,27
1,1,808,816,728350,728372,433,20,54,44478,70,...,2.6365,0.7782,1.7324,0.7419,-0.2997,0.9491,8,22,5.851351,41
2,2,39,192,2212076,2212144,11388,705,420,1311391,29,...,4.0564,2.179,2.2095,-0.0105,-0.0944,1.0,153,68,10.122667,112
3,3,781,789,3353146,3353173,210,16,29,3202,114,...,2.3222,0.7782,1.4314,0.6667,-0.0402,0.4025,8,27,4.666667,20
4,4,1540,1560,618457,618502,521,72,67,48231,82,...,2.7694,1.415,1.8808,0.9158,-0.2455,0.9998,20,45,3.748201,29


In [7]:
X = train_data.copy()
scaler = StandardScaler()
X = scaler.fit_transform(X)  
xgb = XGBClassifier(n_estimators=300, max_depth=6, verbosity=0)
clf = OneVsRestClassifier(xgb)
score = cross_val_score(clf, X, y, cv=5, scoring="roc_auc_ovr")
print(score)


[0.86485162 0.87031039 0.86711192 0.8685383  0.862307  ]


In [8]:
'''param_grid = {
    'estimator__n_estimators': [100, 200, 300],
    'estimator__max_depth': [3, 6, 9],
    'estimator__learning_rate': [0.01, 0.1, 0.2],
    'estimator__subsample': [0.8, 1.0],
    'estimator__colsample_bytree': [0.8, 1.0],
    'estimator__gamma': [0, 0.1, 0.2],
    'estimator__reg_alpha': [0, 0.1, 0.2],
    'estimator__reg_lambda': [0, 0.1, 0.2],
}

# Create an XGBClassifier
xgb = XGBClassifier(verbosity=0)

# Create a OneVsRestClassifier
clf = OneVsRestClassifier(xgb)

# Create a GridSearchCV
grid = GridSearchCV(estimator=clf, param_grid=param_grid, scoring="roc_auc", cv=3, verbose=2, n_jobs=10)

# Fit the GridSearchCV
grid.fit(X, y)

# Print the best parameters
print(grid.best_params_)'''

'param_grid = {\n    \'estimator__n_estimators\': [100, 200, 300],\n    \'estimator__max_depth\': [3, 6, 9],\n    \'estimator__learning_rate\': [0.01, 0.1, 0.2],\n    \'estimator__subsample\': [0.8, 1.0],\n    \'estimator__colsample_bytree\': [0.8, 1.0],\n    \'estimator__gamma\': [0, 0.1, 0.2],\n    \'estimator__reg_alpha\': [0, 0.1, 0.2],\n    \'estimator__reg_lambda\': [0, 0.1, 0.2],\n}\n\n# Create an XGBClassifier\nxgb = XGBClassifier(verbosity=0)\n\n# Create a OneVsRestClassifier\nclf = OneVsRestClassifier(xgb)\n\n# Create a GridSearchCV\ngrid = GridSearchCV(estimator=clf, param_grid=param_grid, scoring="roc_auc", cv=3, verbose=2, n_jobs=10)\n\n# Fit the GridSearchCV\ngrid.fit(X, y)\n\n# Print the best parameters\nprint(grid.best_params_)'

In [9]:
 best_cls_params = {'grow_policy': 'depthwise',
                    'n_estimators': 785,
                    'learning_rate': 0.020726270353596147,
                    'gamma': 0.13894214878891328,
                    'subsample': 0.6931112900708679,
                    'colsample_bytree': 0.5210477365658035,
                    'max_depth': 3,
                    'min_child_weight': 2,
                    'reg_lambda': 5.369876648869387e-06,
                    'reg_alpha': 8.428323563897863e-09}
xgb = XGBClassifier(**best_cls_params)
clf = OneVsRestClassifier(xgb)
score = cross_val_score(clf, X, y, cv=5, scoring="roc_auc_ovr")
print(score)

[0.8869331  0.89126686 0.88473382 0.89040977 0.88574903]


In [10]:
np.mean(score)

0.8878185174472977

In [11]:
xgb = XGBClassifier(**best_cls_params)
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)
xgb.fit(train_data,y)
predictions = xgb.predict_proba(test_data)
print(predictions)
submission = pd.read_csv('data/sample_submission.csv')
submission_df = pd.DataFrame(columns=['id', "Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults"])
submission_df['id'] = submission["id"]
submission_df[["Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults"]] = predictions
submission_df.to_csv('submissions.csv', header=True, index=False)
submission_df.head(20)


[[5.2001011e-01 5.8001874e-04 2.7838119e-03 ... 2.5342410e-02
  1.4834325e-01 5.1841563e-01]
 [2.2203869e-01 1.2121021e-02 8.8221552e-03 ... 2.2649707e-01
  1.2806138e-01 3.7941805e-01]
 [2.1594616e-03 3.4647740e-02 4.3529429e-02 ... 1.1907827e-02
  2.6744342e-01 5.0725114e-01]
 ...
 [3.4989812e-04 5.9815746e-04 9.1534364e-01 ... 1.9070684e-03
  6.7149312e-04 8.8298582e-02]
 [3.8732669e-01 8.2297130e-03 1.6319524e-02 ... 7.2407223e-02
  1.4085139e-01 4.3508524e-01]
 [1.3691619e-03 4.7467584e-03 8.6935389e-01 ... 9.7407954e-04
  8.6544064e-04 9.4340563e-02]]


Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.52001,0.00058,0.002784,2.3e-05,0.025342,0.148343,0.518416
1,19220,0.222039,0.012121,0.008822,0.000152,0.226497,0.128061,0.379418
2,19221,0.002159,0.034648,0.043529,0.000145,0.011908,0.267443,0.507251
3,19222,0.219907,0.000724,0.000464,0.00241,0.029838,0.293024,0.460437
4,19223,0.001741,0.000985,0.000859,0.006861,0.01006,0.588758,0.418237
5,19224,0.102078,0.287856,0.003961,2.9e-05,0.020337,0.11073,0.320274
6,19225,0.392781,0.000577,0.002281,3.5e-05,0.027709,0.062204,0.669437
7,19226,0.012998,0.203038,0.005004,0.00675,0.05724,0.176735,0.481746
8,19227,0.000553,0.195148,0.195336,0.001164,0.005901,0.129631,0.356586
9,19228,0.320727,0.024617,0.000777,2.7e-05,0.00297,0.195897,0.512951
