In [2]:
# Colab-only setup: mount Google Drive at /content/drive so that the
# /content/drive/MyDrive/... paths used by the later cells can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
import numpy as np
import pandas as pd
import torch
import math
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from xgboost import XGBClassifier, XGBRegressor, XGBRFClassifier
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

In [10]:
# Load the raw test set; the `id` column becomes the DataFrame index.
TEST_CSV_PATH = "/content/drive/MyDrive/final project/test.csv/test.csv"
test_df = pd.read_csv(TEST_CSV_PATH, index_col="id")

In [11]:
def data_process(data):
    """Build the model's input features from a raw competition frame.

    Steps, in order:

    1. Add missing-value flags for ``measurement_3`` and ``measurement_5``
       and their conjunction.
    2. One-hot encode ``attribute_0`` and ``attribute_1`` and drop the
       ``attribute_1_material_7`` indicator.
    3. Replace ``attribute_2`` and ``attribute_3`` with their product
       (column ``'2*3'``).
    4. Impute ``loading`` and every ``measurement*`` column with an
       ``IterativeImputer`` fitted separately for each ``product_code``.
    5. Replace ``measurement_3`` .. ``measurement_16`` with their row mean
       (column ``'measurement_avg'``).
    6. Standard-scale every column except ``product_code``, ``isTrain``,
       ``withM3`` and ``withM5``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing at least ``product_code``, ``attribute_0`` ..
        ``attribute_3`` and ``measurement_3`` .. ``measurement_16``. The
        index is expected to be unique, because the one-hot columns are
        joined back on it. The caller's frame is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the engineered features plus ``product_code``,
        with rows in the same order as the input.

    Raises
    ------
    KeyError
        If a required column is missing, or if one-hot encoding of
        ``attribute_1`` does not produce ``attribute_1_material_7``.
    """
    # Work on a copy: the flag columns below would otherwise be written
    # straight into the caller's DataFrame.
    data = data.copy()

    # referred to this discussion
    # https://www.kaggle.com/competitions/tabular-playground-series-aug-2022/discussion/343368
    # Despite the "with" prefix, each flag is True where the measurement is
    # MISSING. The column names are kept unchanged so the feature set stays
    # the same as before.
    data['withM3'] = data['measurement_3'].isna()
    data['withM5'] = data['measurement_5'].isna()
    # Logical AND of the two flags (same result as multiplying the booleans).
    data['withM3&M5'] = data['withM3'] & data['withM5']

    # referred to
    # https://www.kaggle.com/code/samuelcortinhas/tps-aug-22-failure-prediction
    # do one-hot encoding
    materials = ['attribute_0', 'attribute_1']
    for col in materials:
        dummies = pd.get_dummies(data[col], prefix=col)
        data = data.merge(dummies, left_index=True, right_index=True)
    data = data.drop(materials, axis=1)
    # Drop one of the binary one-hot columns - cf 'dummy variable trap'
    # NOTE(review): the one-hot columns depend on which materials occur in
    # `data`; confirm they match the columns the model was trained on.
    data = data.drop('attribute_1_material_7', axis=1)

    # product's width and height
    data['2*3'] = data['attribute_2'] * data['attribute_3']
    data = data.drop(['attribute_2', 'attribute_3'], axis=1)

    # Fill NaNs in loading and the measurements with IterativeImputer,
    # fitted separately for each product_code.
    features = [
        f for f in data.columns if f.startswith('measurement') or f == 'loading']
    # The imputer returns floats, so make the target columns float up front;
    # integer measurement columns then accept the imputed block cleanly.
    data[features] = data[features].astype(float)
    for code in data['product_code'].unique():
        # Positional boolean mask; writing the imputed values back through it
        # keeps every row where it was instead of regrouping rows by product.
        in_product = (data['product_code'] == code).to_numpy()
        imputer = IterativeImputer(
            max_iter=50, random_state=0, skip_complete=True, n_nearest_features=12)
        imputer.fit(data.loc[in_product, features])
        data.loc[in_product, features] = imputer.transform(
            data.loc[in_product, features])

    # code from
    # https://www.kaggle.com/code/desalegngeb/tps08-logisticregression-and-some-fe
    averaged = [f'measurement_{i}' for i in range(3, 17)]
    data['measurement_avg'] = data[averaged].mean(axis=1)
    data = data.drop(averaged, axis=1)

    # scale data to use logistic regression
    # NOTE(review): the scaler is fitted on whatever frame is passed in, and
    # 'withM3&M5' is not in the exclusion list, so it is scaled too. Both are
    # kept as they were; confirm this matches the training pipeline.
    scaler = StandardScaler()
    columns = [a for a in data.columns
               if a not in ['product_code', 'isTrain', 'withM3', 'withM5']]
    data[columns] = scaler.fit_transform(data[columns])
    return data


# Replace the raw test frame with its engineered-feature version.
# NOTE: this rebinds `test_df`, so the cell is not idempotent -- data_process
# drops attribute_0..attribute_3, and running it again on the processed frame
# raises KeyError. Reload the CSV before re-running.
test_df = data_process(test_df)



In [12]:
# The saved object is a whole pickled estimator (it is used through
# `predict_proba`), not a tensor state dict. PyTorch >= 2.6 defaults
# `torch.load` to `weights_only=True`, which refuses to unpickle such
# objects, so opt out explicitly.
# SECURITY: weights_only=False runs arbitrary pickle code -- only load model
# files that you created yourself or otherwise trust.
MODEL_PATH = "/content/drive/MyDrive/final project/model"
model = torch.load(MODEL_PATH, weights_only=False)

In [13]:
# Score every test row. `product_code` is removed before prediction, and
# column 1 of the `predict_proba` output is kept
# (presumably the probability of the "failure" class -- confirm against
# model.classes_).
model_inputs = test_df.drop(columns='product_code')
class_probabilities = model.predict_proba(model_inputs)
predictions = class_probabilities[:, 1]

In [14]:
# Start from the sample submission so the output keeps its ids and layout.
submission = pd.read_csv(
    '/content/drive/MyDrive/final project/sample_submission.csv', index_col='id')
# Attach predictions by `id`, not by row position: `predictions` follows the
# row order of `test_df`, which is only guaranteed to match the sample
# submission if preprocessing never reorders rows.
failure_by_id = pd.Series(predictions, index=test_df.index)
submission['failure'] = failure_by_id.reindex(submission.index)
# Fail loudly rather than write a submission with missing predictions.
if submission['failure'].isna().any():
    raise ValueError(
        "some ids in sample_submission.csv have no matching prediction")
submission

Unnamed: 0_level_0,failure
id,Unnamed: 1_level_1
26570,0.452214
26571,0.414097
26572,0.431080
26573,0.435558
26574,0.640442
...,...
47340,0.553326
47341,0.388574
47342,0.387117
47343,0.501844


In [16]:
# Write the submission to `submission.csv` in the current working directory.
# The `id` index is written as the first column (pandas' default index=True).
submission.to_csv("submission.csv")