In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.autograd import Variable
import csv

In [2]:
class mydataset(Dataset):
    """Tabular dataset built from ``train.csv`` / ``test.csv`` in the working directory.

    Features are every column except ``id``, ``product_code`` and (for the
    training file) ``failure``.  The two columns holding ``"material_<n>"``
    strings are reduced to their numeric suffix, and missing values are
    replaced by per-column medians.  The medians are always learned from the
    training file, so both splits are imputed with the same statistics.

    Attributes:
        datas: imputed feature array returned by ``SimpleImputer.transform``
            for the selected split.
        train_y: ``failure`` column of ``train.csv`` as a float Series (set
            for both splits, as callers may read it).
    """

    # Positions (once id/product_code/failure are dropped) of the columns whose
    # values look like "material_<n>".
    _MATERIAL_COLUMN_POSITIONS = (1, 2)

    # Columns that are explicitly cast to float before imputation.
    _FLOAT_COLUMNS = (
        'attribute_0', 'attribute_1', 'attribute_2', 'attribute_3',
        'measurement_0', 'measurement_1', 'measurement_2',
    )

    def __init__(self, train_or_test):
        """Load and preprocess one split.

        Args:
            train_or_test: ``"train"`` or ``"test"``.

        Raises:
            ValueError: if ``train_or_test`` is neither ``"train"`` nor ``"test"``.
        """
        # Fail fast: previously an unknown value left ``self.datas`` unset and
        # only surfaced later as an AttributeError.
        if train_or_test not in ("train", "test"):
            raise ValueError(
                "train_or_test must be 'train' or 'test', got %r" % (train_or_test,)
            )

        # Read the training file; id, product_code and failure are not used as
        # training features.
        train = pd.read_csv("train.csv")
        train_x = self._prepare_features(
            train.drop(columns=["id", "product_code", "failure"])
        )

        # The data has gaps (the original author notes roughly 10%); fill them
        # with the column median.  The imputer is fitted on the training
        # features only so the test split is not imputed with its own medians.
        imp = SimpleImputer(missing_values=np.nan, strategy='median')
        imp.fit(train_x)

        if train_or_test == "train":
            self.datas = imp.transform(train_x)
        else:
            test = pd.read_csv("test.csv")
            test_x = self._prepare_features(
                test.drop(columns=["id", "product_code"])
            )
            self.datas = imp.transform(test_x)

        # Only the training split has labels that line up with ``datas``.
        self._has_labels = train_or_test == "train"
        self.train_y = train["failure"].astype(float)

    @classmethod
    def _prepare_features(cls, frame):
        """Return a copy of ``frame`` with material columns numeric and key columns as float.

        "material_<n>" strings are replaced by the text after the last
        underscore; missing entries stay missing so the imputer can fill them.
        """
        frame = frame.copy()
        for position in cls._MATERIAL_COLUMN_POSITIONS:
            column = frame.columns[position]
            frame[column] = frame[column].str.split('_').str[-1]
        return frame.astype({name: 'float' for name in cls._FLOAT_COLUMNS})

    def __len__(self):
        """Number of rows in the selected split."""
        return len(self.datas)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx``.

        ``label`` is a length-1 float array.  For the training split it holds
        the row's ``failure`` value; for the test split, where no labels are
        read, it is a placeholder ``0.0``.
        """
        data = self.datas[idx]
        label = np.zeros(1, dtype=float)
        if self._has_labels:
            label[0] = self.train_y.iloc[idx]
        return data, label

def train_data_loader():
    """Return a shuffling DataLoader over the "train" split, 150 rows per batch."""
    return DataLoader(mydataset("train"), shuffle=True, batch_size=150)

def test_data_loader():
    """Return a DataLoader over the "test" split, 128 rows per batch, in file order."""
    return DataLoader(mydataset("test"), batch_size=128)

In [3]:
class NeuralNet(nn.Module):
    """Fully connected network: 23 -> 32 -> 64 -> 1 with LeakyReLU(0.1) and a sigmoid output."""

    def __init__(self):
        super().__init__()
        # Layer attribute names and their order are kept as-is.
        self.fc1 = nn.Linear(23, 32)
        self.fc2 = nn.Linear(32, 64)
        self.fc3 = nn.Linear(64, 1)
        self.leaky_relu = nn.LeakyReLU(0.1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of 23-feature rows to one sigmoid-activated value per row."""
        hidden = self.leaky_relu(self.fc1(x))
        hidden = self.leaky_relu(self.fc2(hidden))
        return self.sigmoid(self.fc3(hidden))

In [6]:

device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): this unpickles a whole object from disk (presumably a NeuralNet
# saved with torch.save(model)); unpickling can execute arbitrary code, so only
# load files you trust.  Newer PyTorch releases may also require
# weights_only=False for full-object checkpoints -- confirm against the
# installed version.
model = torch.load('nn_model.pt', map_location=torch.device('cpu'))
model = model.to(device)
model.eval()

test_loader = test_data_loader()

# Gather per-batch predictions and join them once at the end instead of
# re-copying the whole result with np.append on every batch.
batch_outputs = []
with torch.no_grad():  # inference only: skip building the autograd graph
    for data, _ in test_loader:
        data = data.float().to(device)
        output = model(data)
        batch_outputs.append(output.cpu().numpy().ravel())

# The leading empty float64 array keeps the result float64 (as np.append did)
# and makes the concatenation valid even when the loader yields no batches.
answer = np.concatenate([np.array([]), *batch_outputs])

submission = pd.read_csv("sample_submission.csv")
submission["failure"] = answer
submission.reset_index(drop=True).to_csv("submission.csv", index=False)
