In [697]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import cv2
import random
import pickle


## Data preprocessing

In [698]:
# Read the train and test CSV files into DataFrames.
TRAIN_PATH = './kaggle/input/tabular-playground-series-aug-2022/train.csv'
TEST_PATH = './kaggle/input/tabular-playground-series-aug-2022/test.csv'

df_train, df_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Last expression of the cell: preview the first rows of the training frame.
df_train.head()


Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,A,80.1,material_7,material_8,9,5,7,8,4,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,1,A,84.89,material_7,material_8,9,5,14,3,3,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,2,A,82.43,material_7,material_8,9,5,12,1,5,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,3,A,101.07,material_7,material_8,9,5,13,2,6,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,4,A,188.06,material_7,material_8,9,5,9,2,8,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


In [699]:
# Map the 'material_N' labels of attribute_0 / attribute_1 to the integer N.
# Series.map turns any label that is absent from the dictionary into NaN.
# NOTE(review): the test dictionary for attribute_1 lists material_7 where the
# train dictionary lists material_8 - presumably the two files contain
# different materials; confirm against the data.
attr0_codes = {'material_5': 5, 'material_7': 7}
attr1_codes_train = {'material_5': 5, 'material_6': 6, 'material_8': 8}
attr1_codes_test = {'material_5': 5, 'material_6': 6, 'material_7': 7}

df_train['attribute_0'] = df_train['attribute_0'].map(attr0_codes)
df_train['attribute_1'] = df_train['attribute_1'].map(attr1_codes_train)
df_test['attribute_0'] = df_test['attribute_0'].map(attr0_codes)
df_test['attribute_1'] = df_test['attribute_1'].map(attr1_codes_test)


In [700]:
# Remove the 'id' column from both frames.
df_train = df_train.drop(columns=['id'])
df_test = df_test.drop(columns=['id'])

# Separate the 'failure' target from the training features.
y_train = df_train['failure']
x_train = df_train.drop(columns=['failure'])

# x_test is the same object as df_test (no copy is made here).
x_test = df_test
print(x_train.shape)


(26570, 24)


In [701]:
# modify_features lists the columns that later steps transform (area
# normalisation, scaling): 'loading' plus every 'measurement*' column.
measurement_cols = [name for name in x_train.columns if name.startswith('measurement')]
modify_features = ['loading'] + measurement_cols

print("modify_features: ", modify_features)


modify_features:  ['loading', 'measurement_0', 'measurement_1', 'measurement_2', 'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16', 'measurement_17']


In [702]:
# Columns of the training features that contain at least one missing value.
missing_cols = x_train.columns[x_train.isnull().any()].tolist()
print("missing_cols: ", missing_cols)

# Add missingness indicators to both frames: a flag for measurement_3, a flag
# for measurement_5, and the per-row count of missing values over missing_cols.
for frame in (x_train, x_test):
    frame['missing_3'] = frame['measurement_3'].isnull()
    frame['missing_5'] = frame['measurement_5'].isnull()
    frame['missing_num'] = frame[missing_cols].isnull().sum(axis=1)


missing_cols:  ['loading', 'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16', 'measurement_17']


In [703]:
# Impute missing values
from sklearn.impute import SimpleImputer

def impute(train, test, features):
    """Fill missing values in ``features`` using a median SimpleImputer.

    The imputer is fitted on ``train[features]`` only and then applied to
    both frames. Both frames are modified in place and also returned.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that contain every column named in ``features``.
    features : list of str
        Columns to impute.

    Returns
    -------
    tuple
        ``(train, test)`` - the same objects that were passed in.
    """
    median_filler = SimpleImputer(strategy='median')
    filled_train = median_filler.fit_transform(train[features])
    filled_test = median_filler.transform(test[features])
    train[features] = filled_train
    test[features] = filled_test
    return train, test


In [704]:
# Area issue

def area(x, features=None):
    """Divide the measurement columns by ``attribute_2 * attribute_3``.

    The input frame is left untouched: the work is done on a copy, so the
    caller's frame neither gains a temporary 'area' column nor has its
    measurement columns overwritten, and calling the function again on the
    same input gives the same result.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing 'attribute_2', 'attribute_3' and every column named
        in ``features`` (other than 'loading', which is never read here).
    features : list of str, optional
        Candidate columns to normalise; 'loading' is always skipped.
        Defaults to the notebook-level ``modify_features`` list.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``x`` in which each selected
        column has been divided row-wise by ``attribute_2 * attribute_3``.
    """
    if features is None:
        features = modify_features

    result = x.copy()
    # Computed once from the untouched attribute columns.
    area_values = result['attribute_2'] * result['attribute_3']
    for col in features:
        if col == 'loading':
            continue
        result[col] = result[col] / area_values
    return result


In [705]:
# Feature scaling
from sklearn.preprocessing import PowerTransformer

def scale(train, test):
    """Apply a PowerTransformer (default settings) to the ``modify_features`` columns.

    The transformer is fitted on the training columns only and then applied
    to both frames. Both frames are modified in place and also returned.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing every column in the notebook-level
        ``modify_features`` list.

    Returns
    -------
    tuple
        ``(train, test)`` - the same objects that were passed in.
    """
    transformer = PowerTransformer()
    scaled_train = transformer.fit_transform(train[modify_features])
    scaled_test = transformer.transform(test[modify_features])

    train[modify_features] = scaled_train
    test[modify_features] = scaled_test
    return train, test


In [706]:
# One-hot encoding
def ohe(train, val):
    """One-hot encode the categorical columns of two frames onto a shared column set.

    Each frame is encoded separately with integer (0/1) dummies and then
    reindexed to the union of both column sets, so a category that appears in
    only one frame becomes an all-zero column in the other. Aligning with
    ``reindex(fill_value=0)`` keeps numeric dtypes; the previous
    concat-then-``fillna`` alignment could leave object-dtype columns mixing
    booleans and zeros.

    Parameters
    ----------
    train, val : pandas.DataFrame
        Frames containing 'product_code' and 'attribute_0' .. 'attribute_3'.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(new_train, new_val)`` with identical columns in identical order:
        the encoded train columns first, followed by the columns that exist
        only in ``val``. Row indices are unchanged.
    """
    ohe_cols = ['product_code', 'attribute_0', 'attribute_1', 'attribute_2', 'attribute_3']

    train_encoded = pd.get_dummies(train, columns=ohe_cols, dtype=int)
    val_encoded = pd.get_dummies(val, columns=ohe_cols, dtype=int)

    # Union of columns, preserving order of first appearance (train, then val).
    train_col_set = set(train_encoded.columns)
    all_cols = list(train_encoded.columns) + [
        col for col in val_encoded.columns if col not in train_col_set
    ]

    # fillna(0) is kept so that NaNs already present in the inputs are still
    # replaced by 0, as they were before.
    new_train = train_encoded.reindex(columns=all_cols, fill_value=0).fillna(0)
    new_val = val_encoded.reindex(columns=all_cols, fill_value=0).fillna(0)
    return new_train, new_val


In [707]:
# Model: LogisticRegression
from sklearn.linear_model import LogisticRegression

# Fixed seed so that refitting the model is reproducible across kernel
# restarts (a seed drawn with np.random.randint changed on every run).
MODEL_SEED = 42

# Elastic-net logistic regression fitted with the 'saga' solver and balanced
# class weights.
model_lg = LogisticRegression(
    max_iter=5000,
    C=1e-3,
    solver='saga',
    penalty='elasticnet',
    l1_ratio=1e-3,
    tol=1e-2,
    class_weight='balanced',
    n_jobs=-1,
    random_state=MODEL_SEED,
)


## Training

In [708]:
# Preprocess the features, fit the model and persist the artefacts.
# Each step replaces x_train / x_test, so this cell is meant to be run once
# per kernel session.

x_train, x_test = impute(x_train, x_test, missing_cols)
x_train, x_test = area(x_train), area(x_test)
x_train, x_test = scale(x_train, x_test)
x_train, x_test = ohe(x_train, x_test)

# If one-hot encoding is skipped, drop the 'product_code' column from both
# x_train and x_test before fitting.

model_lg.fit(x_train, y_train)

# Persist the fitted model and the processed test features; the context
# managers make sure both files are flushed and closed.
with open('model_lg.pkl', 'wb') as model_file:
    pickle.dump(model_lg, model_file)
with open('test_x.pkl', 'wb') as features_file:
    pickle.dump(x_test, features_file)
