In [2]:
# Imports
import os
import numpy as np
import pandas as pd

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# Scoring and Cross-Validation
from sklearn.metrics import roc_auc_score, accuracy_score

# Imputation and Preprocessing
from sklearn.preprocessing import RobustScaler
from category_encoders import WOEEncoder
from sklearn.impute import KNNImputer

# Pipeline Constructors
from sklearn.pipeline import make_pipeline

# Linear Models
from sklearn.linear_model import LogisticRegression, HuberRegressor

# save and load the model
from joblib import dump, load

In [4]:
def prepreprocessing(df_train, df_test, target=None):
    """Encode, transform and impute the train/test frames.

    Steps, in order:
      1. Concatenate both frames and split them again at the original train
         length, so the two frames end up with the same set of columns
         (a column present in only one frame is NaN-filled in the other).
      2. Weight-of-evidence encode ``attribute_0``; the encoder is fitted on
         the train frame only and then applied to the test frame.
      3. Replace ``loading`` with ``log1p(loading)``.
      4. Drop ``attribute_1``.
      5. Fill remaining missing values column by column with that column's
         median, computed separately for each frame.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows.
    df_test : pandas.DataFrame
        Test rows.
    target : array-like, optional
        Target used to fit the WOE encoder, aligned with ``df_train`` rows.
        When omitted, ``df_train['failure']`` is used. Earlier revisions read
        a notebook-level ``target`` variable instead, which made the function
        fail with ``NameError`` unless a later cell had already run.

    Returns
    -------
    tuple
        ``(df_train, df_test, feature_names)`` where ``feature_names`` is the
        array of column names of the processed train frame.

    Raises
    ------
    ValueError
        If ``target`` is omitted and ``df_train`` has no ``'failure'`` column.
    """
    if target is None:
        if "failure" not in df_train.columns:
            raise ValueError(
                "prepreprocessing needs a target: pass `target` or include a "
                "'failure' column in df_train"
            )
        # Taken from the caller's frame *before* the concat below so the
        # labels keep their original dtype and index.
        target = df_train["failure"]

    n_train = df_train.shape[0]
    data = pd.concat([df_train, df_test])

    df_train = data.iloc[:n_train, :]
    df_test = data.iloc[n_train:, :]

    # WOE Encoding (fit on train only, then applied to test)
    encoder = WOEEncoder(cols=['attribute_0'])
    df_train = encoder.fit_transform(df_train, target)
    df_test = encoder.transform(df_test)

    # Log transform
    df_train['loading'] = np.log1p(df_train['loading'])
    df_test['loading'] = np.log1p(df_test['loading'])

    df_train = df_train.drop(labels=["attribute_1"], axis="columns")
    df_test = df_test.drop(labels=["attribute_1"], axis="columns")

    # Median imputation. Each frame is filled with its *own* median; a column
    # that is entirely NaN has a NaN median and is therefore left unchanged.
    feature_names = df_train.columns.values
    for feature in feature_names:
        if df_train[feature].isnull().values.any():
            df_train[feature] = df_train[feature].fillna(df_train[feature].median())
        if df_test[feature].isnull().values.any():
            df_test[feature] = df_test[feature].fillna(df_test[feature].median())

    return df_train, df_test, feature_names

In [9]:
# Load data and model
# Single place to repoint the notebook if the dataset is mounted elsewhere.
DATA_DIR = '/kaggle/input/final-project-dataset'

submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
train = pd.read_csv(f'{DATA_DIR}/train.csv', index_col='id')
test = pd.read_csv(f'{DATA_DIR}/test.csv', index_col='id')

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model artefacts that come from a trusted source.
model = load(f'{DATA_DIR}/model.joblib')

# Save target column (copied before `train` is rebound by the preprocessing below)
target = train['failure'].copy()

# Rebinds `train` and `test` to their processed versions; `columns` holds the
# processed train column names.
train, test, columns = prepreprocessing(train, test)

In [10]:
# One array of row positions per product code, as reported by the groupby.
indices = list(train.groupby("product_code").indices.values())

# Every unordered pair of product codes is held out once. Each entry is
# [positions of all remaining codes, positions of the two held-out codes].
SPLITS = [
    [
        np.concatenate([rows for pos, rows in enumerate(indices) if pos not in (first, second)]),
        np.concatenate([rows for pos, rows in enumerate(indices) if pos in (first, second)]),
    ]
    for first in range(len(indices))
    for second in range(first + 1, len(indices))
]

In [11]:
# Predict failure probabilities for the test set and write the submission.

# The model is not fed the product code; the guard keeps the cell re-runnable.
if "product_code" in test.columns:
    test = test.drop(labels=["product_code"], axis="columns")
columns = test.columns.values

# `test` carries a 'failure' column that is included in `columns` and passed to
# the model; any missing values in it are replaced with 0.
# NOTE(review): presumably the model was fitted with 'failure' among its
# inputs - confirm against the training notebook.
if test["failure"].isnull().values.any():
    test["failure"] = test["failure"].fillna(0)

# A single pre-trained model is used, so one prediction pass is enough. The
# previous version looped over SPLITS and averaged len(SPLITS) identical
# predictions (the split indices were never used), which multiplied the
# inference cost and silently produced all-zero predictions when SPLITS was
# empty.
test_preds = model.predict_proba(test[columns])[:, 1]

preds = test_preds
submission['failure'] = preds
submission.to_csv('submission.csv', index=False)
print('Sucessfully Saved')

Sucessfully Saved
