In [6]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression

In [74]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [75]:
display(train.isnull().sum())
print('\r\n', '='*30, '\r\n')
display(test.isnull().sum())

id                   0
product_code         0
loading            250
attribute_0          0
attribute_1          0
attribute_2          0
attribute_3          0
measurement_0        0
measurement_1        0
measurement_2        0
measurement_3      381
measurement_4      538
measurement_5      676
measurement_6      796
measurement_7      937
measurement_8     1048
measurement_9     1227
measurement_10    1300
measurement_11    1468
measurement_12    1601
measurement_13    1774
measurement_14    1874
measurement_15    2009
measurement_16    2110
measurement_17    2284
failure              0
dtype: int64





id                   0
product_code         0
loading            223
attribute_0          0
attribute_1          0
attribute_2          0
attribute_3          0
measurement_0        0
measurement_1        0
measurement_2        0
measurement_3      329
measurement_4      409
measurement_5      508
measurement_6      624
measurement_7      720
measurement_8      846
measurement_9      904
measurement_10    1067
measurement_11    1136
measurement_12    1240
measurement_13    1303
measurement_14    1440
measurement_15    1542
measurement_16    1678
measurement_17    1740
dtype: int64

In [76]:
def fill_missing(df):
    numerics = ['int16', 'int32', 'int64','float16','float32', 'float64']
    for col in df.select_dtypes(include=numerics):
        df[col] = df[col].fillna(value = df[col].mean())
    
    for col in df.select_dtypes(exclude=numerics):
        df[col] = df[col].fillna(value = df[col].mode())
    return df

train = fill_missing(train)
test = fill_missing(test)

In [77]:
train = train.drop(['product_code', 'attribute_0', 'attribute_1', 'attribute_2', 'attribute_3'], axis=1)
test = test.drop(['product_code', 'attribute_0', 'attribute_1', 'attribute_2', 'attribute_3'], axis=1)

In [78]:
weight = 1. / (train.failure.value_counts().values / train.shape[0])

In [79]:
weight

array([1.27001577, 4.70348734])

In [80]:
x_data = train.drop(['id', 'failure'], axis=1)
y_data = train.failure

x_test = test.drop('id', axis=1)

In [81]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_data = scaler.fit_transform(x_data)
x_test = scaler.transform(x_test)

In [82]:
from sklearn.decomposition import PCA

pca = PCA(n_components=9)
x_data = pca.fit_transform(x_data)
x_test = pca.transform(x_test)

In [83]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.2)

In [84]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.nn import BCELoss

import torch.nn.functional as F

In [85]:
x_train = torch.tensor(x_train, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32)

x_val = torch.tensor(x_val, dtype=torch.float32)
y_val = torch.tensor(y_val.values, dtype=torch.float32)

In [86]:
from torch.utils.data import WeightedRandomSampler

sample_weight = weight[y_train.detach().numpy().astype('int')]
sampler = WeightedRandomSampler(weights=sample_weight, num_samples=len(sample_weight), replacement=True)

In [87]:
train_datasets = TensorDataset(x_train, y_train)
train_dataloader = DataLoader(train_datasets, batch_size=100, sampler=sampler)

In [105]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.fc1 = nn.Linear(9, 100)
        self.fc2 = nn.Linear(100, 50)
        self.fc3 = nn.Linear(50, 1)
        
        self.bn1 = nn.BatchNorm1d(100)
        self.bn2 = nn.BatchNorm1d(50)
        
    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        x = F.sigmoid(self.fc3(x))
        
        return x

In [106]:
model = Net()

In [107]:
optimizer = Adam(params=model.parameters(), lr=0.01, weight_decay=0.0001)
loss_func = BCELoss()

In [108]:
def weight_init(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)

In [109]:
model.apply(weight_init)

Net(
  (fc1): Linear(in_features=9, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=1, bias=True)
  (bn1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [110]:
def train(model, train_data, epochs):
    for i in range(epochs):
        loss_sum = 0
        for x, label in train_dataloader:
            loss = loss_func(model(x), label.view(-1,1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()

        if i % 10 == 0:
            print('loss :', loss_sum / len(train_dataloader))

In [111]:
train(model, train_dataloader, 1000)



loss : 0.696009315235514
loss : 0.6826769040783806
loss : 0.6824953516884029
loss : 0.6795942346814653
loss : 0.6772116736067293
loss : 0.6739032749838673
loss : 0.677945936509701
loss : 0.6743779168442381
loss : 0.6733243336699938
loss : 0.6753055405728694
loss : 0.672560647619722
loss : 0.6733029342033494
loss : 0.6734298400475945
loss : 0.6732962111911863
loss : 0.6710185064396388
loss : 0.6704079805405487
loss : 0.6720888256467005
loss : 0.6745074282229786
loss : 0.6751342541734937
loss : 0.6673564715004863
loss : 0.6699487323492346
loss : 0.6696269422629629
loss : 0.671467533973461
loss : 0.6692697417008485
loss : 0.6717795547745038
loss : 0.6688743759768669
loss : 0.6665067644746091
loss : 0.6687556460429805
loss : 0.6712715863621851
loss : 0.6702216466267904
loss : 0.6695726592216134
loss : 0.6700936569854128
loss : 0.6675869453121239
loss : 0.66949158095418
loss : 0.6702520640243387
loss : 0.6702058180956774
loss : 0.6694429718272786
loss : 0.6680102530219745
loss : 0.670595014

In [112]:
y_train_pred = model(x_train).detach().numpy()
y_train = y_train.detach().numpy()

y_val_pred = model(x_val).detach().numpy()
y_val = y_val.detach().numpy()

In [113]:
from sklearn.metrics import roc_auc_score

train_score = roc_auc_score(y_train, y_train_pred)
val_score = roc_auc_score(y_val, y_val_pred)

print('train_score:', train_score)
print('val_score', val_score)

train_score: 0.6364732904639538
val_score 0.5381308615214504


In [114]:
!pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-3.1.1-py3-none-any.whl (39 kB)
Collecting tqdm<5.0,>=4.36
  Using cached tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
Installing collected packages: tqdm, pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1 tqdm-4.64.0
