## Import

In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and, when available, CUDA), and exports ``PYTHONHASHSEED``.
    PyTorch has to be seeded as well because ``random_split``, weight
    initialisation and ``DataLoader`` shuffling all draw from torch's RNG.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Imported locally: the notebook only imports torch in a later cell.
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Documented as a silent no-op when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
seed_everything(37)  # fix the seed

## Data Load

In [55]:
# Read the training and test tables from the CSV files next to the notebook
# (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

## Data Pre-processing

In [56]:
# Replace every missing value with 0 in both tables.
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

# Categorical columns that are one-hot encoded into numeric indicator columns.
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    # dtype=float keeps the indicators numeric; newer pandas versions default
    # to bool, which would turn the mixed feature matrix into an object array.
    train_dummies = pd.get_dummies(train_df[i], dtype=float)
    # Align the test indicators to the training categories so both tables
    # always expose the same feature columns in the same order. Categories
    # missing from the test set become all-zero columns; categories unseen in
    # training are dropped.
    test_dummies = pd.get_dummies(test_df[i], dtype=float).reindex(
        columns=train_dummies.columns, fill_value=0.0
    )
    train_df = pd.concat([train_df, train_dummies], axis=1)
    test_df = pd.concat([test_df, test_dummies], axis=1)

print('Done.')

Done.


In [32]:
import torch
import pandas as pd
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader, random_split

In [126]:
class CustomDataset(Dataset):
    """Dataset serving (features, label) pairs taken from a dataframe.

    The features are every column except the identifier, timestamp, target
    and raw categorical columns; the label is the ``Y_Class`` column.

    Args:
        dataframe: Table containing ``Y_Class`` plus the columns dropped
            below; all remaining columns must be convertible to float.
    """

    def __init__(self, dataframe):
        df = dataframe
        features = df.drop(
            columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality', 'LINE', "PRODUCT_CODE"]
        )
        # Cast explicitly: a frame mixing float columns with bool indicator
        # columns would otherwise yield an object array, which torch cannot
        # convert to a tensor.
        self.x = features.to_numpy(dtype='float32')
        self.y = df['Y_Class'].to_numpy(dtype='int64')
        self.length = len(df)

    def __getitem__(self, index):
        """Return the float32 feature vector and a one-element int64 label tensor."""
        x = torch.tensor(self.x[index], dtype=torch.float32)
        # The label keeps its shape of (1,), so batches come out as (batch, 1).
        y = torch.tensor([self.y[index]], dtype=torch.long)
        return x, y

    def __len__(self):
        """Return the number of rows in the source dataframe."""
        return self.length

In [127]:
# Wrap the training table and derive an 80% / 10% / remainder partition.
dataset = CustomDataset(train_df)
dataset_size = len(dataset)
train_size, validation_size = (
    int(dataset_size * fraction) for fraction in (0.8, 0.1)
)
# Whatever is left after truncation goes to the held-out test split.
test_size = dataset_size - (train_size + validation_size)

In [128]:
# Use a dedicated, seeded generator so the train/validation/test membership
# is identical on every run; without it the split draws from torch's global
# RNG and the reported accuracies are not comparable between runs.
split_generator = torch.Generator().manual_seed(37)
train_dataset, validation_dataset, test_dataset = random_split(
    dataset,
    [train_size, validation_size, test_size],
    generator=split_generator,
)

print(f"Training Data Size : {len(train_dataset)}")
print(f"Validation Data Size : {len(validation_dataset)}")
print(f"Testing Data Size : {len(test_dataset)}")

Training Data Size : 478
Validation Data Size : 59
Testing Data Size : 61


In [129]:
# Training batches are reshuffled every epoch. The incomplete last batch is
# dropped so the BatchNorm layers of the model never see a batch of one
# sample while in training mode.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, drop_last=True)
# Evaluation loaders must cover every sample in a stable order: dropping the
# last partial batch would silently exclude samples from the accuracy.
validation_dataloader = DataLoader(validation_dataset, batch_size=4, shuffle=False, drop_last=False)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False, drop_last=False)

In [130]:
# t = (train_df-train_df.mean())/(train_df.std()+1E-7)

## Classification Model Fit

In [131]:
class CustomModel(nn.Module):
    """Fully connected classifier: Linear -> BatchNorm -> ReLU (x2) -> Linear.

    Args:
        in_features: Width of the input feature vector. Defaults to 2884,
            the feature count used in this notebook.
        num_classes: Number of output logits. Defaults to 3.
    """

    def __init__(self, in_features=2884, num_classes=3):
        super().__init__()
        self.layer1 = nn.Linear(in_features, 512, bias=True)
        self.layer2 = nn.Linear(512, 128, bias=True)
        self.layer3 = nn.Linear(128, num_classes, bias=True)
        self.relu = nn.ReLU()
        # NOTE: defined but not applied in forward(); kept so the attribute
        # stays available to existing code.
        self.dropout = nn.Dropout()
        self.bn1 = nn.BatchNorm1d(512)
        self.bn2 = nn.BatchNorm1d(128)

    def forward(self, x):
        """Map a (batch, in_features) tensor to (batch, num_classes) raw logits."""
        x = self.layer1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.layer2(x)
        x = self.bn2(x)
        x = self.relu(x)
        # No softmax here: the logits are returned as-is.
        x = self.layer3(x)
        return x

In [197]:
# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the network and the loss, then move both to the chosen device.
model = CustomModel()
model = model.to(device)
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

# Plain SGD with a small learning rate and light momentum.
optimizer = optim.SGD(model.parameters(), momentum=0.1, lr=0.0001)

In [198]:
for epoch in range(100):
    # Re-enter training mode every epoch: the evaluation cells call
    # model.eval(), so re-running this cell afterwards would otherwise train
    # with BatchNorm stuck in inference mode.
    model.train()
    cost = 0.0

    for x, y in train_dataloader:
        x = x.to(device)
        y = y.to(device)

        output = model(x)
        # Labels arrive as (batch, 1); flatten them to the (batch,) shape that
        # CrossEntropyLoss expects. reshape(-1) stays 1-D for any batch size,
        # unlike squeeze(), which collapses a single-sample batch to a scalar.
        loss = criterion(output, y.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate a plain Python float so no autograd history is kept
        # alive across the batches of an epoch.
        cost += loss.item()

    # Mean loss over the batches of this epoch.
    cost = cost / len(train_dataloader)
    print("cost :", cost)

cost : 1.0909390449523926
cost : 1.0263177156448364
cost : 1.0279629230499268
cost : 1.0005724430084229
cost : 0.9837640523910522
cost : 0.9871833920478821
cost : 0.9657143950462341
cost : 0.9579184055328369
cost : 0.962017297744751
cost : 0.976588249206543
cost : 0.9367454648017883
cost : 0.9495082497596741
cost : 0.909460186958313
cost : 0.9174975752830505
cost : 0.9399195909500122
cost : 0.9240788817405701
cost : 0.9124826788902283
cost : 0.9090798497200012
cost : 0.887914776802063
cost : 0.8863229751586914
cost : 0.87555992603302
cost : 0.9076559543609619
cost : 0.8673915266990662
cost : 0.8741575479507446
cost : 0.8733793497085571
cost : 0.8676433563232422
cost : 0.8756645917892456
cost : 0.8666645288467407
cost : 0.8557595610618591
cost : 0.8623166084289551
cost : 0.8302825093269348
cost : 0.845623254776001
cost : 0.8495845794677734
cost : 0.8458868861198425
cost : 0.8311006426811218
cost : 0.8569970726966858
cost : 0.8596088886260986
cost : 0.8392742276191711
cost : 0.8237660527

In [199]:
# Measure accuracy on the validation split: count predictions that match the
# labels and divide by the number of samples seen.
model.eval()
success, total = 0, 0
with torch.no_grad():
    for x, y in validation_dataloader:
        x, y = x.to(device), y.to(device)

        outputs = model(x)
        # The predicted class is the index of the largest logit.
        success += int(outputs.argmax(dim=-1).eq(y.squeeze()).sum())
        total += len(x)
        print(f"X : {x}")
        print(f"Y : {y}")
        print(f"Outputs : {outputs}")
        print("--------------------")
print("ratio :",success/total)

X : tensor([[  2., 100.,   0.,  ...,   0.,   0.,   1.],
        [  0.,   0.,   0.,  ...,   1.,   0.,   0.],
        [  0.,   0.,   0.,  ...,   1.,   0.,   0.],
        [  2., 100.,   0.,  ...,   0.,   0.,   1.]], device='cuda:0')
Y : tensor([[1],
        [0],
        [0],
        [1]], device='cuda:0')
Outputs : tensor([[-0.7092,  0.7557, -0.3775],
        [-0.7092,  0.7557, -0.3775],
        [-0.7149, -2.2040,  0.1174],
        [-0.7092,  0.7557, -0.3775]], device='cuda:0')
--------------------
X : tensor([[0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.]], device='cuda:0')
Y : tensor([[1],
        [0],
        [1],
        [1]], device='cuda:0')
Outputs : tensor([[-0.7149, -2.2040,  0.1174],
        [ 0.3700,  0.7466, -0.4396],
        [ 0.3700,  0.7466, -0.4396],
        [ 0.3700,  0.7466, -0.4396]], device='cuda:0')
--------------------
X : tensor([[ 1., 95.,  0.,  ...,  0.,  0.,  1

In [200]:
# Same accuracy measurement as for validation, on the held-out test split.
model.eval()
success = total = 0
with torch.no_grad():
    for x, y in test_dataloader:
        x = x.to(device)
        y = y.to(device)

        outputs = model(x)
        predicted_classes = outputs.argmax(dim=-1)
        matches = predicted_classes == y.squeeze()
        success += matches.sum().item()
        total += x.size(0)
        for label, value in (("X", x), ("Y", y), ("Outputs", outputs)):
            print(f"{label} : {value}")
        print("--------------------")
print("ratio :",success/total)

X : tensor([[ 2., 95.,  0.,  ...,  0.,  0.,  1.],
        [ 2., 98.,  0.,  ...,  0.,  0.,  1.],
        [ 0.,  0.,  0.,  ...,  1.,  0.,  0.],
        [ 2., 89.,  0.,  ...,  0.,  0.,  1.]], device='cuda:0')
Y : tensor([[1],
        [2],
        [2],
        [1]], device='cuda:0')
Outputs : tensor([[-0.7092,  0.7557, -0.3775],
        [-0.7092,  0.7557, -0.3775],
        [-0.7149, -2.2040,  0.1174],
        [-0.7092,  0.7557, -0.3775]], device='cuda:0')
--------------------
X : tensor([[ 0.,  0.,  0.,  ...,  1.,  0.,  0.],
        [ 0.,  0.,  0.,  ...,  1.,  0.,  0.],
        [ 0.,  0.,  0.,  ...,  1.,  0.,  0.],
        [ 2., 98.,  0.,  ...,  0.,  0.,  1.]], device='cuda:0')
Y : tensor([[1],
        [1],
        [0],
        [1]], device='cuda:0')
Outputs : tensor([[-0.7092,  0.7557, -0.3775],
        [-0.7149, -2.2040,  0.1174],
        [ 0.3700,  0.7466, -0.4396],
        [-0.7092,  0.7557, -0.3775]], device='cuda:0')
--------------------
X : tensor([[  2., 100.,   0.,  ...,   0.,   0

## Inference

In [184]:
# Feature columns exactly as the model saw them during training.
feature_columns = train_df.drop(
    columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality', 'LINE', "PRODUCT_CODE"]
).columns

# Align the test features to the training layout (same columns, same order).
# Columns absent from the test table are filled with 0, so the input width
# always matches the network's first layer.
final_features = (
    test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'LINE', "PRODUCT_CODE"])
    .reindex(columns=feature_columns, fill_value=0)
)

# Cast explicitly to float32: a frame mixing float and bool indicator columns
# would otherwise produce an object array that torch cannot convert.
final_x = torch.tensor(final_features.to_numpy(dtype='float32'))
final_x.shape

torch.Size([310, 2884])

In [186]:
# Predict a class for every test row in a single forward pass.
model.eval()
with torch.no_grad():
    outputs = model(final_x.to(device))
# The predicted class is the index of the largest logit in each row.
result = outputs.argmax(dim=-1)
print(f"Outputs : {result}")
print("--------------------")

Outputs : tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 2,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1,
        1, 1, 1, 1, 2, 2, 1, 1

## Submit

In [188]:
# Load the submission template; its Y_Class column is overwritten with the
# model's predictions further down.
submit = pd.read_csv('./sample_submission.csv')
submit

Unnamed: 0,PRODUCT_ID,Y_Class
0,TEST_000,0
1,TEST_001,0
2,TEST_002,0
3,TEST_003,0
4,TEST_004,0
...,...,...
305,TEST_305,0
306,TEST_306,0
307,TEST_307,0
308,TEST_308,0


In [194]:
# Convert the predictions to a NumPy integer array before handing them to
# pandas. Assigning a raw torch tensor relies on pandas' implicit conversion,
# which, depending on the versions involved, may store tensor objects
# instead of plain integers.
submit['Y_Class'] = result.cpu().numpy()

In [196]:
# Write the filled-in submission next to the notebook; index=False keeps the
# dataframe's row index out of the CSV.
submit.to_csv('./submission.csv', index=False)