In [2]:
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.utils.data import DataLoader
from options import args
from models.base import BaseModel
from trainers.base import AbstractTrainer
from models import model_factory
from dataloaders import dataloader_factory
from trainers import trainer_factory
from utils import *
from tqdm import tqdm

In [30]:
args.dataloader_code = 'ae'
args.dataset_code = 'ml-1m'
train_loader, valid_loader, test_loader = dataloader_factory(args)

Already preprocessed. Skip preprocessing


In [31]:
x = train_loader.dataset.data.transpose(0,1)
valid = valid_loader.dataset.input_data.transpose(0,1)
test = test_loader.dataset.input_data.transpose(0,1)

In [32]:
train_loader = torch.utils.data.DataLoader(x, batch_size=args.train_batch_size,
                                           shuffle=True, pin_memory=True)

In [36]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.hidden_dim = None
        self.latent_dim = None
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, latent_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, input_dim),
            nn.Sigmoid()
        )
        self.encoder.apply(self.weight_init)
        self.decoder.apply(self.weight_init)
        
    def weight_init(self, m):
        if isinstance(m, nn.Linear):
            nn.init.kaiming_normal_(m.weight)
            m.bias.data.normal_(0.0, 0.001)
    
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [35]:
train_loader.dataset.data.shape

torch.Size([3525, 6034])

In [37]:

num_samples = train_loader.dataset.data.shape[0]
input_dim = train_loader.dataset.data.shape[1]
hidden_dim = 0
latent_dim = 256
learning_rate = args.lr
num_epochs = 100

model = Autoencoder(input_dim,hidden_dim,latent_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [38]:
#Training
from tqdm import tqdm
train_loader = tqdm(train_loader)
for epoch in range(num_epochs):
    for batch_index, batch in enumerate(train_loader):
        # batch_size = batch[0].size(0)
        batch = [x.to(args.device) for x in batch]
        input_x = torch.stack(batch)
        decoded = model(input_x)
        loss = criterion(decoded, input_x)
        optimizer.zero_grad()
        loss.backward()
        
        optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

100%|██████████| 56/56 [00:06<00:00,  8.22it/s]


Epoch [1/100], Loss: 0.1851
Epoch [2/100], Loss: 0.1026
Epoch [3/100], Loss: 0.0620
Epoch [4/100], Loss: 0.0407
Epoch [5/100], Loss: 0.0349
Epoch [6/100], Loss: 0.0520
Epoch [7/100], Loss: 0.0217
Epoch [8/100], Loss: 0.0864
Epoch [9/100], Loss: 0.0817
Epoch [10/100], Loss: 0.1479
Epoch [11/100], Loss: 0.0512
Epoch [12/100], Loss: 0.0973
Epoch [13/100], Loss: 0.0367
Epoch [14/100], Loss: 0.0136
Epoch [15/100], Loss: 0.0320
Epoch [16/100], Loss: 0.0893
Epoch [17/100], Loss: 0.0342
Epoch [18/100], Loss: 0.0242
Epoch [19/100], Loss: 0.0381
Epoch [20/100], Loss: 0.0585
Epoch [21/100], Loss: 0.0389
Epoch [22/100], Loss: 0.0378
Epoch [23/100], Loss: 0.0763
Epoch [24/100], Loss: 0.0527
Epoch [25/100], Loss: 0.0299
Epoch [26/100], Loss: 0.0220
Epoch [27/100], Loss: 0.0336
Epoch [28/100], Loss: 0.0271
Epoch [29/100], Loss: 0.0119
Epoch [30/100], Loss: 0.0199
Epoch [31/100], Loss: 0.0563
Epoch [32/100], Loss: 0.0898
Epoch [33/100], Loss: 0.0238
Epoch [34/100], Loss: 0.0065
Epoch [35/100], Loss: 0

In [39]:
encoder = model.encoder
encoded_train_data = encoder(x).detach().numpy()

In [40]:
encoded_train_data

array([[0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [2.2702747e+01, 6.4518142e-01, 0.0000000e+00, ..., 0.0000000e+00,
        2.5275168e+00, 1.0268612e+00],
       [2.0505822e+00, 3.3102605e+00, 4.4197354e-01, ..., 0.0000000e+00,
        8.7024975e-01, 1.5305116e+00],
       ...,
       [1.9408367e+00, 1.2079253e+00, 4.8965599e-02, ..., 1.4051836e+00,
        7.2019815e-01, 5.3642499e-01],
       [1.2958812e+00, 2.2203176e+00, 1.9255825e+00, ..., 1.3615280e+00,
        3.0987117e+00, 6.4951622e-01],
       [7.4866928e-02, 8.0792179e+00, 1.0717744e-02, ..., 0.0000000e+00,
        1.6658572e+00, 9.7944188e-01]], dtype=float32)

In [48]:
decoded  = model(x)
# import numpy as np 
from sklearn.metrics import roc_auc_score
roc_auc_score(np.array(x),np.where(decoded.detach().numpy() > 0.5, 1., 0.))

KeyboardInterrupt: 

In [47]:
np.where(decoded.detach().numpy() > 0.5, 1., 0.)

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [60]:
encoded = model.encoder(x)
u,s,v = torch.svd(encoded)
u.shape,s.shape,v.shape

(torch.Size([6034, 256]), torch.Size([256]), torch.Size([256, 256]))

In [89]:
torch.save(v,'item_v.pt')