In [1]:
import os
import cv2
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader, Subset
import torchvision.models as models
import albumentations as A
from albumentations.pytorch import ToTensor

import torch.nn.functional as F
import torch.nn as nn
from efficientnet_pytorch import EfficientNet

In [2]:
# reproduce
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(42)

In [3]:
df_train = pd.read_csv("../features/train_features.csv")
df_test =pd.read_csv("../features/test_features.csv")

print(f"train shape is {df_train.shape}, test shape is {df_test.shape}")

use_cols = [col for col in df_train.columns if col != "target"]
df_train.head()

train shape is (400, 513), test shape is (600, 513)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,target
0,0.295882,0.635077,1.254051,0.728852,0.781536,0.007389,0.442527,0.369949,1.356158,1.426878,...,0.253921,0.020955,1.257562,0.0,0.204086,0.486144,0.233851,0.423551,0.143036,2
1,0.512975,1.434803,0.714547,2.781812,0.586925,0.294715,0.111564,0.0,1.036614,0.812411,...,0.304124,0.0,0.967409,0.041328,0.128031,0.026276,1.282383,0.662024,0.075688,2
2,0.006773,0.157841,0.083455,0.699392,0.469634,0.0,0.317674,1.343657,1.384265,1.111747,...,0.93876,0.748225,0.645609,1.826061,0.903259,0.39536,0.444958,0.230835,0.195431,1
3,0.46354,1.747067,0.770353,1.479718,1.32218,0.137715,0.460434,0.0,0.00218,1.05262,...,0.57936,0.0,0.860838,0.024148,0.079608,0.252127,1.008662,0.050912,0.034879,2
4,0.0,0.279994,0.745123,0.397075,0.269316,0.131174,0.351435,0.243813,0.504081,0.908447,...,0.072723,0.841074,0.235101,1.518828,0.309512,0.164434,0.06339,2.291218,0.416894,1


In [4]:
class UmdFeaturesDataset(Dataset):
    def __init__(self, df, use_cols):
        self.df = df
        self.use_cols = use_cols

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        return self.df.loc[index, self.use_cols].values

In [5]:
class autoencoder(nn.Module):
    def __init__(self):
        super(autoencoder, self).__init__()
        self.encoder = nn.Sequential(nn.Linear(512, 256), #nn.ReLU(True),
                                     nn.Linear(256, 128),
                                     nn.Linear(128, 16), #nn.ReLU(True),
                                    )
        self.decoder = nn.Sequential(nn.Linear(16, 128), #nn.ReLU(True),
                                     nn.Linear(128, 256),
                                     nn.Linear(256, 512),
                                    )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [6]:
dataset = UmdFeaturesDataset(df_train, use_cols)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

In [7]:
model = autoencoder().cuda()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

In [8]:
num_epochs = 30
for epoch in range(num_epochs):
    for data in dataloader:
        img = data.cuda()
        output = model(img.float())
        loss = criterion(output, img.float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print('epoch [{}/{}], loss:{:.4f}'.format(epoch + 1, num_epochs, loss.cpu().detach().numpy()))

epoch [1/30], loss:0.1895
epoch [2/30], loss:0.0683
epoch [3/30], loss:0.0479
epoch [4/30], loss:0.0435
epoch [5/30], loss:0.0412
epoch [6/30], loss:0.0395
epoch [7/30], loss:0.0385
epoch [8/30], loss:0.0378
epoch [9/30], loss:0.0374
epoch [10/30], loss:0.0380
epoch [11/30], loss:0.0372
epoch [12/30], loss:0.0359
epoch [13/30], loss:0.0350
epoch [14/30], loss:0.0347
epoch [15/30], loss:0.0345
epoch [16/30], loss:0.0339
epoch [17/30], loss:0.0339
epoch [18/30], loss:0.0332
epoch [19/30], loss:0.0328
epoch [20/30], loss:0.0326
epoch [21/30], loss:0.0323
epoch [22/30], loss:0.0323
epoch [23/30], loss:0.0329
epoch [24/30], loss:0.0323
epoch [25/30], loss:0.0321
epoch [26/30], loss:0.0321
epoch [27/30], loss:0.0322
epoch [28/30], loss:0.0324
epoch [29/30], loss:0.0330
epoch [30/30], loss:0.0323


In [9]:
dataset_test = UmdFeaturesDataset(df_test, use_cols)
dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False)

In [10]:
ae_features_train = []
for data in dataloader:
    img = data.cuda()
    output = model.encoder(img.float())
    ae_features_train.append(output.cpu().detach().numpy().squeeze(0))
    
ae_features_train = np.array(ae_features_train)

print(ae_features_train.shape)

(400, 16)


In [11]:
ae_features_test = []
for data in dataloader_test:
    img = data.cuda()
    output = model.encoder(img.float())
    ae_features_test.append(output.cpu().detach().numpy().squeeze(0))
    
ae_features_test = np.array(ae_features_test)

ae_features_test.shape

(600, 16)

In [12]:
df_train_ae_features = pd.DataFrame(ae_features_train)
df_train_ae_features["target"] = df_train["target"]
print(df_train_ae_features.shape)
df_train_ae_features.head()

(400, 17)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,target
0,0.610736,-0.337333,-0.657294,2.08737,2.248818,-2.758557,-0.477977,-0.291118,0.279418,-0.490819,-2.145393,1.615029,1.509418,0.902009,1.764596,-3.440548,2
1,-0.104558,-1.90869,0.44953,1.983959,2.63269,0.89019,-0.891675,1.987986,-0.010579,3.598642,-1.927087,-1.10896,0.926038,-2.564234,3.072474,-2.779855,2
2,-3.321125,-1.206319,-1.886718,-0.795273,1.225427,0.464719,2.555891,1.271495,-1.897769,-0.590961,1.271029,0.492107,2.180928,-0.623522,-0.874578,-4.934771,1
3,0.864346,-0.525374,1.131916,-0.42388,2.942007,-1.277189,-2.156355,2.198337,0.505102,0.657256,-0.859956,0.764554,-1.164456,-1.445969,1.973658,-3.386367,2
4,-1.783989,-2.076374,-1.387144,0.463143,1.802365,1.201836,0.579067,0.384077,-1.308805,1.131906,-0.44763,0.586131,1.097174,0.447768,-3.553014,-3.482644,1


In [13]:
df_test_ae_features = pd.DataFrame(ae_features_test)
df_test_ae_features["target"] = df_test["target"]
print(df_test_ae_features.shape)
df_test_ae_features.head()

(600, 17)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,target
0,0.238027,-2.38396,-2.846423,-0.592817,1.990361,0.352761,-0.023752,0.515686,-1.128931,2.77207,-3.112388,1.887645,0.39404,1.497338,-3.048116,-1.653428,1
1,-2.622615,-1.786341,-1.356155,0.288921,1.586998,1.186679,2.464078,0.482057,-2.443752,-0.735624,3.831003,-0.730379,2.168365,-1.138161,-1.342934,-4.938066,1
2,-0.26331,-0.517751,-0.508087,2.223155,0.067676,0.630786,0.874048,-1.291025,-0.191419,-1.280884,-1.312767,0.456427,-0.367177,-1.10928,0.067458,-1.688941,3
3,0.4196,-0.412565,-0.019293,3.223381,0.995891,-1.453975,-0.350824,1.531036,-0.768052,0.988011,-0.431069,-0.017008,-0.273338,-3.196411,0.390245,-2.910585,2
4,-0.354733,-1.380823,-0.148537,1.919893,2.08299,1.007159,0.786581,2.188686,-0.767764,3.679501,-2.045792,-1.935513,0.67411,-3.419502,3.092063,-2.315326,2


In [14]:
# df_train_ae_features.to_csv("../features/train_ae_features.csv", index=False)
# df_test_ae_features.to_csv("../features/test_ae_features.csv", index=False)

In [15]:
df_blend =pd.read_csv("../features/blend38_features.csv")
dataset_blend = UmdFeaturesDataset(df_blend, use_cols)
dataloader_blend = DataLoader(dataset_blend, batch_size=1, shuffle=False)

In [16]:
ae_features_blend = []
for data in dataloader_blend:
    img = data.cuda()
    output = model.encoder(img.float())
    ae_features_blend.append(output.cpu().detach().numpy().squeeze(0))
    
ae_features_blend = np.array(ae_features_blend)
df_blend_ae_features = pd.DataFrame(ae_features_blend)
df_blend_ae_features["target"] = df_blend["target"]
print(df_blend_ae_features.shape)
df_blend_ae_features.head()

(400, 17)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,target
0,-2.122239,-0.970497,0.778513,1.485298,3.37186,-0.73869,1.637035,-3.159073,-2.139673,-1.315056,1.716063,0.457103,-0.374935,-1.167396,0.455828,-1.742553,12
1,-1.924364,-1.391297,-0.782174,2.911233,4.035865,-1.197251,-0.997677,-1.900941,-2.90727,0.294716,3.252317,-1.174236,-1.514958,-1.563086,-2.997107,-0.658274,12
2,-3.488093,-2.192553,0.890934,1.798642,3.527046,1.043523,1.999142,-2.812357,-2.267051,-2.196436,0.457524,-1.196792,-0.276024,-2.36649,-1.050874,-2.223322,12
3,-1.293548,-0.522503,-1.259846,0.187979,2.973392,-1.441883,-0.100572,-1.060828,-2.816274,1.584322,3.410616,-0.441557,0.82726,-0.671347,-2.025939,-1.877917,12
4,-2.646029,-1.040226,-0.132654,1.021124,2.923583,0.380751,0.402618,0.268545,-2.276425,-0.511504,0.484493,0.165615,-1.930479,-1.476078,-1.357735,-0.890898,12


In [17]:
# df_blend_ae_features.to_csv("../features/blend38_ae_features.csv", index=False)