In [2]:
import random
from datetime import datetime
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from tqdm import tqdm

In [150]:
def data_organize_columns(data, replace_dict):
     
    # remove 'voted' if trainset
    if 'voted' in data.columns:
        data = data.drop(['voted'], axis=1)
    
    # replace dict
    data = data.astype(replace_dict)
    # expanse colums
    data = pd.get_dummies(data)
   
    print('* Oganize - data type: pd, data shape: ', data.shape)
    return data

def data_value_tansform(data):
    data_np = data.to_numpy()
    
    data_np[:, :41] = (data_np[:, :41] - 3.) / 2. # Q
    data_np[:, 41] = (data_np[:, 41] - 5.) / 5. # familysize
    data_np[:, 43:53] = (data_np[:, 43:53] - 3.5) / 3.5 # tp
    
    data_tf = pd.DataFrame(data_np, columns=data.columns)

    print('* Val transform - data type: pd, data shape: ', data_tf.shape)
    return data_tf

def data_drop(data, drop_list):
            
    data = data.drop(drop_list, axis=1)
    print('* Drop columns - data type: pd, data shape: ', data.shape)

    return data


def make_x(data, replace_dict, drop_list):
    data = data_organize_columns(data, replace_dict)
    data = data_value_tansform(data)
    data = data_drop(data, drop_list)
    data = data.to_numpy()
    
    print('==output x==', data.shape)
    return data
    
    
def make_y(data):
    y = data['voted']
    y = 2 - y.to_numpy()
    
    print('==output y===', y.shape)
    return y

# 1. 데이터 준비

In [151]:
train_data = pd.read_csv('../0-Data/org/train.csv').drop([379, 24598], axis=0)
test_data = pd.read_csv('../0-Data/org/test_x.csv')

replace_dict = {'education': str, 'engnat': str, 'married': str, 'urban': str}
drop_list = ['QaE', 'QbE', 'QcE', 'QdE', 'QeE',
             'QfE', 'QgE', 'QhE', 'QiE', 'QjE',
             'QkE', 'QlE', 'QmE', 'QnE', 'QoE',
             'QpE', 'QqE', 'QrE', 'QsE', 'QtE',
             'index', 'hand']

In [157]:
x_train = make_x(train_data, replace_dict, drop_list)
y_train = make_y(train_data)

* Oganize - data type: pd, data shape:  (45530, 113)
* Val transform - data type: pd, data shape:  (45530, 113)
* Drop columns - data type: pd, data shape:  (45530, 91)
==output x== (45530, 91)
==output y=== (45530,)


In [173]:
x_test = make_x(test_data, replace_dict, drop_list)

* Oganize - data type: pd, data shape:  (11383, 113)
* Val transform - data type: pd, data shape:  (11383, 113)
* Drop columns - data type: pd, data shape:  (11383, 91)
==output x== (11383, 91)


In [174]:
train_x = torch.tensor(x_train, dtype=torch.float32)
train_y = torch.tensor(y_train, dtype=torch.float32)
test_x = torch.tensor(x_test, dtype=torch.float32)

In [175]:
from sklearn.model_selection import train_test_split
train_x, valid_x, train_y, valid_y  = train_test_split(train_x, train_y, test_size=0.2, random_state=123)

In [176]:
train_x.shape, train_y.shape

(torch.Size([36424, 91]), torch.Size([36424]))

# 2. 모델 훈련

In [177]:
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'


N_MODEL = 10
N_EPOCH = 20
BATCH_SIZE = 64
LOADER_PARAM = {
    'batch_size': BATCH_SIZE,
    'num_workers': 4,
    'pin_memory': True
}

prediction_val = np.zeros((len(valid_x), 1), dtype=np.float32)
prediction = np.zeros((len(test_x), 1), dtype=np.float32)


for no in range(N_MODEL):

    train_loader = DataLoader(TensorDataset(train_x, train_y),
                              shuffle=True, drop_last=True, **LOADER_PARAM)
    
    valid_loader = DataLoader(TensorDataset(valid_x, torch.zeros((len(valid_x),), dtype=torch.float32)),
                              shuffle=False, drop_last=False, **LOADER_PARAM)
    
    test_loader = DataLoader(TensorDataset(test_x, torch.zeros((len(test_x),), dtype=torch.float32)),
                             shuffle=False, drop_last=False, **LOADER_PARAM)
    model = nn.Sequential(
        nn.Dropout(0.05),
        nn.Linear(91, 96, bias=False),
        nn.LeakyReLU(0.05, inplace=True),
        
        nn.Dropout(0.5),
        nn.Linear(96, 36, bias=False),
        nn.ReLU(inplace=True),
        
        nn.Dropout(0.5),
        nn.Linear(36, 12, bias=False),
        nn.ReLU(inplace=True),
        
        nn.Linear(12, 1)
    ).to(DEVICE)
    
    
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.20665], device=DEVICE))
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=4e-2)
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=N_EPOCH // 4, eta_min=1.2e-5)

    model.train()
    for epoch in tqdm(range(N_EPOCH), desc='{:02d}/{:02d}'.format(no + 1, N_MODEL)):
        for idx, (xx, yy) in enumerate(train_loader):
            optimizer.zero_grad()
            xx, yy = xx.to(DEVICE), yy.to(DEVICE)
            pred = model(xx).squeeze()
            loss = criterion(pred, yy)
            loss.backward()
            optimizer.step()
            scheduler.step(epoch + idx / len(train_loader))
        
    model.eval()
    with torch.no_grad():
        for idx, (xx, _) in enumerate(valid_loader):
            xx = xx.to(DEVICE)
            pred_val = (torch.sigmoid(model(xx).detach().to('cpu'))).numpy()
            prediction_val[BATCH_SIZE * idx:min(BATCH_SIZE * (idx + 1), len(prediction_val)), :] \
                += pred_val[:, :] / N_MODEL
        
        for idx, (xx, _) in enumerate(test_loader):
            xx = xx.to(DEVICE)
            pred = (2. - torch.sigmoid(model(xx).detach().to('cpu'))).numpy()
            prediction[BATCH_SIZE * idx:min(BATCH_SIZE * (idx + 1), len(prediction)), :] \
                += pred[:, :] / N_MODEL
            

01/10: 100%|██████████| 20/20 [00:19<00:00,  1.02it/s]
02/10:   5%|▌         | 1/20 [00:01<00:35,  1.86s/it]


KeyboardInterrupt: 

# 3. AUC, ACC 예측

In [162]:
from sklearn.metrics import accuracy_score
from sklearn import metrics

def get_auc_acc(y_true, y_pred):
    y_pred = y_pred.reshape(len(y_pred))
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred, pos_label=1)
    roc_auc = metrics.auc(fpr, tpr)
    accuracy_score(y_tre, np.round(y_pred))
    
    return roc_auc, accuracy_score

In [17]:
get_auc

0.6982209532176586

# 결과 저장

In [12]:
df = pd.read_csv('../0-Data/org/sample_submission.csv')
df.iloc[:, 1:] = prediction
df.to_csv('./results/{}.csv'.format(datetime.now().strftime('%m%d-%H%M')), index=False)