In [1]:
import os
import glob
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import KFold, GroupKFold
import warnings
# Silence all warnings for the rest of the notebook (keeps training logs readable).
warnings.filterwarnings('ignore')
# Use the fully qualified option name: the abbreviated 'max_columns' is matched as a
# pattern and becomes ambiguous (-> OptionError) on pandas versions that also define
# 'styler.render.max_columns'.
pd.set_option('display.max_columns', 300)
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
import sys
sys.path.append('..')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset


from utils import create_save_folder, EarlyStopping

# Expose only the GPU with index 2 to CUDA for this process.
# NOTE(review): this only takes effect if it is set before CUDA is first
# initialised in the kernel — confirm no earlier cell touches the GPU.
os.environ["CUDA_VISIBLE_DEVICES"]="2"

# create_save_folder is a project helper from utils; its return value is used
# below as the directory prefix for checkpoints and saved model weights.
fm_path = create_save_folder('nb012')

In [3]:
class denoising_model(nn.Module):
    """Denoising autoencoder with an auxiliary regression head.

    Architecture (all hidden activations are in-place SiLU):
        encoder:      num_columns -> 256 -> 128
        decoder:      128 -> 256 -> num_columns
        label_output: num_columns -> 256 -> 64 -> 1, applied to the
                      *reconstruction*, not to the 128-d latent code.

    The layer order inside each ``nn.Sequential`` determines the
    ``state_dict`` keys, so it must not be changed if previously saved
    checkpoints are to remain loadable.

    NOTE(review): the decoder ends with SiLU, whose output is bounded below
    (about -0.28), so strongly negative standardised inputs cannot be
    reconstructed exactly — confirm this is intended.

    Args:
        num_columns: number of input features per sample.
    """

    def __init__(self, num_columns):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(num_columns, 256),
            nn.SiLU(True),
            nn.Linear(256, 128),
            nn.SiLU(True),
        )

        self.decoder = nn.Sequential(
            nn.Linear(128, 256),
            nn.SiLU(True),
            nn.Linear(256, num_columns),
            nn.SiLU(True),
        )

        self.label_output = nn.Sequential(
            nn.Linear(num_columns, 256),
            nn.SiLU(True),
            nn.Linear(256, 64),
            nn.SiLU(True),
            nn.Linear(64, 1),
        )

    def forward(self, x, noise=None):
        """Reconstruct ``x`` from a (optionally) corrupted copy and predict the label.

        Args:
            x: input batch whose last dimension is ``num_columns``.
            noise: tensor added to ``x`` before encoding. ``None`` (default)
                feeds ``x`` through unchanged, which is what clean inference
                needs; passing a tensor keeps the original behaviour.

        Returns:
            Tuple ``(reconstruction, output)`` where ``reconstruction`` has the
            same shape as ``x`` and ``output`` has a trailing dimension of 1.
        """
        if noise is not None:
            # Out-of-place add: the caller's tensor is never modified.
            x = x + noise
        x = self.encoder(x)
        x = self.decoder(x)
        output = self.label_output(x)
        return x, output

    def encode(self, x, noise=None):
        """Return the 128-d latent code of ``x``.

        Args:
            x: input batch whose last dimension is ``num_columns``.
            noise: optional tensor added to ``x`` before encoding; ``None``
                (default) encodes ``x`` as-is.
        """
        if noise is not None:
            x = x + noise
        return self.encoder(x)

In [3]:
class myDataSet(Dataset):
    """Map-style dataset that pairs row ``i`` of ``data`` with row ``i`` of ``y``."""

    def __init__(self, data, y):
        # Both containers are stored as given; rows are looked up lazily.
        self.y = y
        self.data = data

    def __len__(self):
        # The number of samples is the size of the first axis of ``data``.
        n_rows = self.data.shape[0]
        return n_rows

    def __getitem__(self, index):
        features = self.data[index]
        label = self.y[index]
        return features, label



In [4]:
# NOTE(review): absolute, machine-specific path — the notebook only runs where this
# file exists. pickle.load can execute arbitrary code, so only load trusted files.
TRAIN_PICKLE_PATH = '/home/yoshikawa/work/kaggle/OPVP/output/feature_model/20210920/lgbm_group_kfold_time_id/train.pkl'

with open(TRAIN_PICKLE_PATH, 'rb') as train_file:
    df_train = pickle.load(train_file)

In [5]:
# Work on a new frame without the identifier column; df_train itself is not modified.
train = df_train.drop(columns=['row_id'])

# Impute missing values column by column with that column's own mean.
for col in train.columns.to_list():
    column_mean = train[col].mean()
    train[col] = train[col].fillna(column_mean)

# Every column except stock_id is standardised.
# NOTE(review): this list also contains time_id and target, so both are scaled too
# (target is later used as the regression label, time_id as the CV group key).
scales = train.columns.drop("stock_id").to_list()

scaler = StandardScaler()
train[scales] = scaler.fit_transform(train[scales])

# stock_id is re-coded to consecutive integer labels.
le = LabelEncoder()
train["stock_id"] = le.fit_transform(train["stock_id"])

In [10]:
# Debug aid: makes CUDA kernel launches synchronous so errors surface at the
# offending call. It slows training; remove once the loop is stable.
os.environ["CUDA_LAUNCH_BLOCKING"]="1"

criterion = nn.MSELoss()
epochs = 10000
n_splits = 5

# Features are every column except the group key and the label.
train_data = torch.tensor(train.drop(['time_id', 'target'], axis=1).values.astype(np.float32))
y_data = torch.tensor(train[['target']].values.astype(np.float32))


def _reconstruction_loss(model, loader, loss_fn):
    """Return the sample-weighted mean reconstruction loss of `model` over `loader`.

    Inputs are corrupted with fresh standard-normal noise, exactly as during
    training, so the value is stochastic. Runs without autograd bookkeeping
    and leaves the model in eval mode.
    """
    model.eval()
    total = 0.0
    with torch.no_grad():
        for val_data, _ in loader:
            val_data = val_data.cuda()
            val_noise = torch.randn_like(val_data)
            recon, _ = model(val_data, val_noise)
            total += loss_fn(recon, val_data).item() * val_data.shape[0]
    return total / len(loader.dataset)


kf = GroupKFold(n_splits=n_splits)
group = train['time_id']
cv = 0
models = []
checkpoint_path = fm_path+'/checkpoint.pth'
for fold, (train_idx, val_idx) in enumerate(kf.split(train_data, groups=group)):
    print('fold: ', fold)
    print('='*100)

    train_dataset = myDataSet(train_data[train_idx], y_data[train_idx])
    val_dataset = myDataSet(train_data[val_idx], y_data[val_idx])
    train_loader = DataLoader(train_dataset, 4096, shuffle=True)
    val_loader = DataLoader(val_dataset, 4096)

    model = denoising_model(train_data.shape[1]).cuda()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    earlystopping = EarlyStopping(patience=20, verbose=True, path=checkpoint_path)

    for i in range(epochs):
        model.train()
        train_loss = 0.0

        for data, y in train_loader:
            data = data.cuda()
            y = y.cuda()
            # Sample the corruption directly on the batch's device instead of
            # generating it on the CPU and copying it over every step.
            noise = torch.randn_like(data)
            recon, output = model(data, noise)
            # Training objective: reconstruction error + label regression error.
            loss = criterion(recon, data) + criterion(output, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * data.shape[0]

        train_loss /= len(train_dataset)

        # Early stopping monitors the reconstruction term only (the label term
        # is excluded), so train_loss and val_loss are not directly comparable.
        val_loss = _reconstruction_loss(model, val_loader, criterion)
        if (i+1) % 10 == 0:
            print(i+1, " epoch - train_loss: ", round(train_loss, 4), ", val_loss: ", round(val_loss, 4))
        earlystopping(val_loss, model)
        if earlystopping.early_stop:
            print("Early Stopping!!")
            break

    # Restore the checkpointed weights and score *those*: the last epoch's
    # val_loss belongs to a model up to `patience` epochs past the checkpoint.
    model.load_state_dict(torch.load(checkpoint_path))
    cv += _reconstruction_loss(model, val_loader, criterion)
    models.append(model)
cv /= n_splits
print("cv: ", round(cv, 4))

fold:  0
Validation loss decreased (inf --> 0.643870).  Saving model ...
Validation loss decreased (0.643870 --> 0.553839).  Saving model ...
Validation loss decreased (0.553839 --> 0.510465).  Saving model ...
Validation loss decreased (0.510465 --> 0.482257).  Saving model ...
Validation loss decreased (0.482257 --> 0.464825).  Saving model ...
Validation loss decreased (0.464825 --> 0.451863).  Saving model ...
Validation loss decreased (0.451863 --> 0.443781).  Saving model ...
Validation loss decreased (0.443781 --> 0.435625).  Saving model ...
Validation loss decreased (0.435625 --> 0.432111).  Saving model ...
10  epoch - train_loss:  0.5838 , val_loss:  0.4272
Validation loss decreased (0.432111 --> 0.427196).  Saving model ...
Validation loss decreased (0.427196 --> 0.424668).  Saving model ...
Validation loss decreased (0.424668 --> 0.420094).  Saving model ...
EarlyStopping counter: 1 out of 20
Validation loss decreased (0.420094 --> 0.417768).  Saving model ...
Validation l

In [17]:
# Placeholder for fold-averaged 128-d encodings; the accumulation step is
# currently disabled, so this tensor stays all zeros.
output = torch.zeros((train_data.shape[0], 128))
for i, model in enumerate(models):
    # state_dict() returns an OrderedDict, which has no .cpu(); move each tensor
    # individually. The dict is a fresh container, so replacing its entries does
    # not move the model itself off its current device.
    cpu_state = model.state_dict()
    for name, tensor in cpu_state.items():
        cpu_state[name] = tensor.cpu()
    torch.save(cpu_state, fm_path+'/DNAEmodel-'+str(i))


In [7]:
# Re-save each fold's weights as CPU tensors so they can be loaded without a GPU.
for i in range(5):
    # Feature count = all columns of df_train minus row_id, time_id and target,
    # which are dropped before the data reaches the network.
    model = denoising_model(df_train.shape[1]-3)
    weights_path = fm_path+'/DNAEmodel-'+str(i)
    # map_location='cpu' lets this succeed even when the file holds CUDA tensors
    # and the device they were saved from is not available.
    model.load_state_dict(torch.load(weights_path, map_location='cpu'))
    torch.save(model.cpu().state_dict(), weights_path)
