In [6]:
import os
import glob
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')
pd.set_option('max_columns', 300)
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
import sys
sys.path.append('..')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn.functional as F
from torch.utils.data import DataLoader

from utils import create_save_folder, EarlyStopping

os.environ["CUDA_VISIBLE_DEVICES"]="0"

fm_path = create_save_folder('nb012')

In [7]:
class denoising_model(nn.Module):
    def __init__(self, num_columns):
        super(denoising_model,self).__init__()
        self.encoder=nn.Sequential(
            nn.Linear(num_columns,256),
            # nn.BatchNorm1d(256),
            nn.SiLU(True),
            nn.Linear(256,128),
            # nn.BatchNorm1d(128),
            nn.SiLU(True),
        )
        
        self.decoder=nn.Sequential(
            nn.Linear(128,256),
            # nn.BatchNorm1d(256),
            nn.SiLU(True),
            nn.Linear(256, num_columns),
            # nn.BatchNorm1d(num_columns),
            nn.SiLU(True),
        )

    def forward(self,x):
        x=self.encoder(x)
        x=self.decoder(x)
        
        return x
    
    def encode(self, x):
        return self.encoder(x)

In [8]:
class DataSet:
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        # noise = torch.randn(self.data.shape[1]).cuda()
        # clean = self.data[index]
        # dirty = self.data[index] + noise
        # return clean, dirty
        return self.data[index]


In [9]:
with open('/home/yoshikawa/work/kaggle/OPVP/output/feature_model/20210824/0/train.pkl', 'rb') as f:
    df_train = pickle.load(f)

In [10]:
train = df_train.drop(['row_id', 'target'], axis=1)
for col in train.columns.to_list():
    train[col] = train[col].fillna(train[col].mean())

scales = train.drop(["stock_id"], axis = 1).columns.to_list()

scaler = StandardScaler()
scaler.fit(train[scales])
train[scales] = scaler.transform(train[scales])
le = LabelEncoder()
le.fit(train["stock_id"])
train["stock_id"] = le.transform(train["stock_id"])

In [11]:
train_data = torch.tensor(train.values.astype(np.float32)).cuda()
train_data.shape[1]

230

In [12]:
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

criterion = nn.MSELoss()
kf = KFold(n_splits=5, shuffle=True, random_state=55)
epochs = 1

cv = 0
models = []
for fold, (train_idx, val_idx) in enumerate(kf.split(train_data)):
    print('fold: ', fold)
    print('='*100)
    train_dataset = DataSet(train_data[train_idx].cuda())
    val_dataset = DataSet(train_data[val_idx].cuda())
    train_loader = DataLoader(train_dataset, 4096, shuffle=True)
    val_loader = DataLoader(val_dataset, 4096)
    
    model = denoising_model(train_data.shape[1]).cuda()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    earlystopping = EarlyStopping(patience=10, verbose=True, path=fm_path+'/checkpoint.pth')

    for i in range(epochs):
        model.train()
        train_loss, val_loss = 0, 0

        for j, data in enumerate(train_loader):
            noise = torch.randn(data.shape).cuda()
            dirty = data + noise
            output = model(dirty)
            loss = criterion(output, data)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * data.shape[0]
        
        train_loss /= len(train_dataset)
        
        for j, data in enumerate(val_loader):
            noise = torch.randn(data.shape).cuda()
            dirty = data + noise
            output = model(dirty)
            loss = criterion(output, data)
            val_loss += loss.item() * data.shape[0]
        
        val_loss /= len(val_dataset)
        if (i+1) % 10 == 0:
            print(i+1, " epoch - train_loss: ", round(train_loss, 4), ", val_loss: ", round(val_loss, 4))
        earlystopping(val_loss, model)
        if earlystopping.early_stop:
            print("Early Stopping!!")
            break
    cv += val_loss
    model.load_state_dict(torch.load(fm_path+'/checkpoint.pth'))
    models.append(model)
cv /= 5
print("cv: ", round(cv, 4))

fold:  0
Validation loss decreased (inf --> 0.422317).  Saving model ...
fold:  1
Validation loss decreased (inf --> 0.435681).  Saving model ...
fold:  2
Validation loss decreased (inf --> 0.433549).  Saving model ...
fold:  3
Validation loss decreased (inf --> 0.420706).  Saving model ...
fold:  4
Validation loss decreased (inf --> 0.422224).  Saving model ...
cv:  0.4269


In [13]:
output = torch.zeros((train_data.shape[0], 128))
for i, model in enumerate(models):
    # train_dataset = DataSet(train_data)
    # train_loader = DataLoader(train_dataset, 4096, shuffle=False)
    # for j, data in enumerate(train_loader):
    output += model.encode(train_data).cpu() / 5
    torch.save(model.state_dict(), fm_path+'/model-'+str(i))


In [14]:
output

tensor([[4.8065e-02, 1.1818e-01, 4.6098e-02,  ..., 1.7241e-01, 9.0036e-02,
         2.3407e-01],
        [1.6834e-01, 2.8846e-01, 4.4912e-01,  ..., 2.0677e-01, 3.2622e-03,
         1.4922e-01],
        [5.6044e-02, 2.6877e-01, 3.5047e-01,  ..., 1.4037e-01, 3.0234e-03,
         1.3634e-01],
        ...,
        [7.4567e+00, 7.5820e+00, 9.1783e+00,  ..., 8.6905e+00, 7.7450e+00,
         3.4075e+00],
        [7.4372e+00, 7.5487e+00, 9.2856e+00,  ..., 8.6053e+00, 7.9176e+00,
         2.9307e+00],
        [7.3039e+00, 7.4992e+00, 9.4639e+00,  ..., 8.5865e+00, 7.2663e+00,
         2.7058e+00]], grad_fn=<AddBackward0>)

In [15]:
output.shape

torch.Size([428932, 128])

In [16]:
df_output = pd.DataFrame(output.detach().numpy())
df_output.columns = ['DAE_'+ str(i) for i in df_output.columns]
df_output.describe()

In [None]:
df = pd.concat([df_train, df_output], axis=1)
pickle.dump(df, open(os.path.join(fm_path, "train.pkl"), 'wb'))