In [1]:
import pandas as pd
import os
import joblib

In [2]:
# Locations of the training and test CSV files, relative to the working directory.
DATASET_DIR = "dataset"
PATH_TRAIN = os.path.join(DATASET_DIR, "train.csv")
PATH_TEST = os.path.join(DATASET_DIR, "test.csv")

In [3]:
# Load the training table; the `id` column becomes the row index.
df = pd.read_csv(PATH_TRAIN, index_col="id")


In [4]:
# Column dtypes and non-null counts of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26570 entries, 0 to 26569
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   product_code    26570 non-null  object 
 1   loading         26320 non-null  float64
 2   attribute_0     26570 non-null  object 
 3   attribute_1     26570 non-null  object 
 4   attribute_2     26570 non-null  int64  
 5   attribute_3     26570 non-null  int64  
 6   measurement_0   26570 non-null  int64  
 7   measurement_1   26570 non-null  int64  
 8   measurement_2   26570 non-null  int64  
 9   measurement_3   26189 non-null  float64
 10  measurement_4   26032 non-null  float64
 11  measurement_5   25894 non-null  float64
 12  measurement_6   25774 non-null  float64
 13  measurement_7   25633 non-null  float64
 14  measurement_8   25522 non-null  float64
 15  measurement_9   25343 non-null  float64
 16  measurement_10  25270 non-null  float64
 17  measurement_11  25102 non-null 

In [5]:
# Separate the feature columns from the `failure` label.
df_X = df.drop(columns=['failure'])
df_y = df['failure']  # target

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
random_state = 42
test_size = 0.2

x_train, x_val, y_train, y_val = train_test_split(
    df_X,
    df_y,
    test_size=test_size,
    random_state=random_state,
)

In [8]:
# Shapes of the training and validation partitions (features first, then labels).
for split_part in (x_train, y_train, x_val, y_val):
    print(split_part.shape)

(21256, 24)
(21256,)
(5314, 24)
(5314,)


# Data Preprocessing

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [10]:
# Per-column count of missing entries in the training partition.
x_train.isna().sum()

product_code         0
loading            207
attribute_0          0
attribute_1          0
attribute_2          0
attribute_3          0
measurement_0        0
measurement_1        0
measurement_2        0
measurement_3      293
measurement_4      439
measurement_5      539
measurement_6      627
measurement_7      738
measurement_8      852
measurement_9      972
measurement_10    1042
measurement_11    1153
measurement_12    1315
measurement_13    1401
measurement_14    1496
measurement_15    1623
measurement_16    1699
measurement_17    1837
dtype: int64

In [11]:
# Print the row-wise missing-value mask of every feature column that has gaps.
# The mask is computed once per column and always taken from df_X, the same
# frame the condition inspects.
for col in df_X.columns:
    null_mask = df_X[col].isnull()
    if null_mask.any():
        print(null_mask)

id
0        False
1        False
2        False
3        False
4        False
         ...  
26565    False
26566    False
26567    False
26568    False
26569    False
Name: loading, Length: 26570, dtype: bool
id
0        False
1        False
2        False
3        False
4        False
         ...  
26565    False
26566    False
26567    False
26568    False
26569    False
Name: measurement_3, Length: 26570, dtype: bool
id
0        False
1        False
2        False
3        False
4        False
         ...  
26565    False
26566     True
26567    False
26568     True
26569    False
Name: measurement_4, Length: 26570, dtype: bool
id
0        False
1        False
2        False
3        False
4        False
         ...  
26565    False
26566    False
26567    False
26568    False
26569    False
Name: measurement_5, Length: 26570, dtype: bool
id
0        False
1        False
2        False
3        False
4        False
         ...  
26565    False
26566    False
26567    False
2656

In [12]:
# Feature columns with at least one missing value (all of them are numeric).
missing_cols = df_X.columns[df_X.isnull().any()].tolist()
missing_cols

['loading',
 'measurement_3',
 'measurement_4',
 'measurement_5',
 'measurement_6',
 'measurement_7',
 'measurement_8',
 'measurement_9',
 'measurement_10',
 'measurement_11',
 'measurement_12',
 'measurement_13',
 'measurement_14',
 'measurement_15',
 'measurement_16',
 'measurement_17']

In [13]:
# Select columns by dtype family rather than comparing each dtype against the
# strings 'int' / 'float': those aliases resolve to the platform's default
# integer/float width, so int64 columns can be missed where the default
# integer is 32-bit.
numerical_cols = df_X.select_dtypes(include='number').columns.tolist()
# The missing-value indicator columns added by preprocess() are routed through
# the numeric branch as well.
numerical_cols += [f'miss_{col}' for col in missing_cols]
print("Numerical Columns\n", numerical_cols)

# String-valued (object dtype) columns are treated as categorical.
categorical_cols = df_X.select_dtypes(include='object').columns.tolist()
print("\nCategorical Columns\n", categorical_cols)

Numerical Columns
 ['loading', 'attribute_2', 'attribute_3', 'measurement_0', 'measurement_1', 'measurement_2', 'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16', 'measurement_17', 'miss_loading', 'miss_measurement_3', 'miss_measurement_4', 'miss_measurement_5', 'miss_measurement_6', 'miss_measurement_7', 'miss_measurement_8', 'miss_measurement_9', 'miss_measurement_10', 'miss_measurement_11', 'miss_measurement_12', 'miss_measurement_13', 'miss_measurement_14', 'miss_measurement_15', 'miss_measurement_16', 'miss_measurement_17']

Categorical Columns
 ['product_code', 'attribute_0', 'attribute_1']


In [14]:
# Numeric branch: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('std_scaler', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with a constant placeholder, then one-hot
# encode; categories unseen during fitting are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group through its own branch and concatenate the results.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

In [15]:
def preprocess(df, preprocessor, missing_cols, is_train):
    """Add missing-value indicator columns and run the preprocessor.

    For every column in ``missing_cols`` a boolean ``miss_<col>`` column is
    added that is True where the original value is null. The indicators are
    attached to a new frame, so the DataFrame passed in by the caller is left
    unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature table; must contain every column in ``missing_cols``.
    preprocessor : object
        Transformer exposing ``fit_transform`` and ``transform``.
    missing_cols : list of str
        Columns for which an indicator column is created.
    is_train : bool
        If True the preprocessor is fitted on the data before transforming;
        otherwise the already fitted preprocessor is only applied.

    Returns
    -------
    The preprocessor's output for the augmented frame. Its shape is printed
    as a side effect.
    """
    # Build the indicators first, then attach them with assign(), which
    # returns a new frame instead of writing into the caller's object.
    indicators = {f'miss_{col}': df[col].isnull() for col in missing_cols}
    features = df.assign(**indicators)

    if is_train:
        transformed = preprocessor.fit_transform(features)
    else:
        transformed = preprocessor.transform(features)

    print(transformed.shape)

    return transformed

In [16]:
# Disabled: preprocessing of the train/validation split. With these lines
# commented out, x_val / y_val are not used by the later cells, and the model
# is fitted on the whole of df_X instead.
# x_train = preprocess(x_train, preprocessor,missing_cols, is_train=True)
# x_val = preprocess(x_val, preprocessor, missing_cols, is_train=False)

# Model

In [17]:
from catboost import CatBoostClassifier

In [18]:
# Fit the preprocessor on the complete training file and transform it.
# Note: this rebinds x_train, replacing the 80% partition from train_test_split.
x_train = preprocess(
    df=df_X,
    preprocessor=preprocessor,
    missing_cols=missing_cols,
    is_train=True,
)

(26570, 47)


In [19]:
# Train a CatBoost classifier on the preprocessed full training set.
# NOTE(review): the saved log under this cell lists rounds 0-9, so it was
# produced with a different `iterations` value; re-run the cell to refresh it.
model = CatBoostClassifier(
    iterations=5,
    verbose=5,
)
model.fit(x_train, df_y)

Learning rate set to 0.5
0:	learn: 0.5516983	total: 74.4ms	remaining: 669ms
5:	learn: 0.5038932	total: 106ms	remaining: 70.5ms
9:	learn: 0.4996733	total: 131ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7ff1a034f430>

In [20]:
# Persist the fitted model to disk with joblib.
PATH_MODEL = 'CatBoost'
joblib.dump(value=model, filename=PATH_MODEL)

['CatBoost']

# Predict

In [21]:
# Load the test table; the `id` column becomes the row index.
df_test = pd.read_csv(PATH_TEST, index_col="id")

In [22]:
# Apply the already fitted preprocessor to the test data (no refitting).
x_test = preprocess(
    df=df_test,
    preprocessor=preprocessor,
    missing_cols=missing_cols,
    is_train=False,
)

(20775, 47)


In [23]:
# Class-probability estimates for every test row; column 1 is what the
# submission cell writes out as the `failure` score.
pred = model.predict_proba(x_test)

In [24]:
# Preview the predicted probabilities (two columns per row).
pred

array([[0.7965815 , 0.2034185 ],
       [0.83729651, 0.16270349],
       [0.82757961, 0.17242039],
       ...,
       [0.83178766, 0.16821234],
       [0.79876977, 0.20123023],
       [0.89125051, 0.10874949]])

In [25]:
# Build the submission table: one row per test id with the probability taken
# from column 1 of the prediction array, then write it without the row index.
failure_proba = pred[:, 1]
submission = pd.DataFrame({'id': df_test.index, 'failure': failure_proba})
submission.to_csv('submission.csv', index=False)