In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, TensorDataset, random_split
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# define the autoencoder (with more layers this time)
class Autoencoder(nn.Module):
  """Fully connected autoencoder with a symmetric encoder/decoder pair.

  The encoder maps input_size -> first_hidden_layer_size ->
  second_hidden_layer_size -> latent_size; the decoder walks the same widths
  in reverse. Every linear layer, including the latent and the output layer,
  is followed by a ReLU, so reconstructions are never negative.

  Args:
    input_size: number of features per sample.
    first_hidden_layer_size: width of the outermost hidden layer.
    second_hidden_layer_size: width of the inner hidden layer.
    latent_size: width of the bottleneck representation.
  """

  def __init__(self, input_size, first_hidden_layer_size, second_hidden_layer_size, latent_size):
    super().__init__()
    # widths from the input down to the bottleneck
    widths = [input_size, first_hidden_layer_size, second_hidden_layer_size, latent_size]
    # encoder is built before the decoder so parameters are created in the same order
    self.encoder = self._linear_relu_stack(widths)
    self.decoder = self._linear_relu_stack(widths[::-1])

  @staticmethod
  def _linear_relu_stack(widths):
    """Return a Sequential of Linear+ReLU pairs connecting consecutive widths."""
    layers = []
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
      layers.append(nn.Linear(fan_in, fan_out))
      layers.append(nn.ReLU())
    return nn.Sequential(*layers)

  def forward(self, x):
    """Encode x to the latent space and return its reconstruction."""
    latent = self.encoder(x)
    return self.decoder(latent)

In [3]:
# load the raw (unnormalized) feature matrix from a comma-separated file;
# `data` starts out as a second name for the same array as `data_np`
data_np = data = np.loadtxt(fname='feature_matrix_path.csv', delimiter=',')  # TODO: change path

In [4]:
# preview the raw (unnormalized) feature matrix
data

array([[ 21.,   0.,   0., ...,   0.,   0.,  12.],
       [ 10.,   3.,   0., ...,   1.,   0.,   9.],
       [  8.,   0.,   0., ...,   1.,   1.,   5.],
       ...,
       [ 30.,   1.,   0., ...,   1.,   1.,  11.],
       [618.,   1.,   3., ...,   1.,   0.,  21.],
       [ 20.,   2.,   0., ...,   0.,   0.,  18.]])

In [5]:
# normalize feature matrix by row (sample): divide each row by its own maximum
max_values = data.max(axis=1)
# a row whose maximum is 0 would be divided by zero and turn into NaNs,
# which would then propagate into the training loss; divide such rows by 1
# instead so they are left unchanged
safe_max_values = np.where(max_values == 0, 1.0, max_values)
data = data / safe_max_values[:, None]

data

array([[2.47058824e-01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.41176471e-01],
       [5.55555556e-02, 1.66666667e-02, 0.00000000e+00, ...,
        5.55555556e-03, 0.00000000e+00, 5.00000000e-02],
       [4.54545455e-02, 0.00000000e+00, 0.00000000e+00, ...,
        5.68181818e-03, 5.68181818e-03, 2.84090909e-02],
       ...,
       [3.12500000e-01, 1.04166667e-02, 0.00000000e+00, ...,
        1.04166667e-02, 1.04166667e-02, 1.14583333e-01],
       [3.63614968e-02, 5.88373735e-05, 1.76512120e-04, ...,
        5.88373735e-05, 0.00000000e+00, 1.23558484e-03],
       [1.35135135e-01, 1.35135135e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.21621622e-01]])

In [8]:
# check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)  # to run on GPU on colab, go to runtime > change runtime type > T4 GPU

# define dimensions
input_size = data.shape[1]
first_hidden_layer_size = 1500
second_hidden_layer_size = 300
latent_size = 100

# make model, instance of autoencoder
model = Autoencoder(input_size, first_hidden_layer_size, second_hidden_layer_size, latent_size).to(device)

# mean squared error between the reconstruction and the input
# (default reduction: mean over every element of the batch)
loss_fn = nn.MSELoss()

# optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# convert to a float32 tensor on the target device so it can be inputted;
# as_tensor also accepts an existing tensor, so re-running this cell does not
# trigger the copy-construct warning that torch.tensor(tensor) emits
data = torch.as_tensor(data, dtype=torch.float, device=device)

# create dataset
dataset = TensorDataset(data, data)  # the first arg is the input, the second is the target. for autoencoder they're the same

# split data into training and validation sets (80% train, 20% val);
# a seeded generator keeps the split identical across re-runs of this cell
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
split_generator = torch.Generator().manual_seed(0)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_generator)

# make dataloaders for training and validation
batch_size = 64
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# per-epoch mean losses, one entry appended per epoch
train_losses = []
val_losses = []


# train model
num_epochs = 100
for epoch in range(num_epochs):
    model.train()  # set model to training mode
    epoch_loss = 0.0
    for batch_data, _ in train_dataloader:
        batch_data = batch_data.to(device)

        # forward pass
        outputs = model(batch_data)
        loss = loss_fn(outputs, batch_data)
        # loss is a per-batch mean; weight it by the batch size so the
        # (usually shorter) final batch is not over-counted in the epoch mean
        epoch_loss += loss.item() * batch_data.size(0)

        # backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # mean loss over all training samples seen this epoch
    train_losses.append(epoch_loss / len(train_dataset))

    # validation loss
    model.eval()  # set model to evaluation mode
    val_loss = 0.0
    with torch.no_grad():  # no gradient calculation for validation
        for batch_data, _ in val_dataloader:
            batch_data = batch_data.to(device)

            # forward pass
            outputs = model(batch_data)
            loss = loss_fn(outputs, batch_data)
            val_loss += loss.item() * batch_data.size(0)

    # mean loss over all validation samples
    val_losses.append(val_loss / len(val_dataset))

    # print epoch loss and validation loss every 10th epoch
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}] - '
              f'Training Loss: {train_losses[-1]} - '
              f'Validation Loss: {val_losses[-1]}')


In [9]:
# plot the per-epoch training and validation loss curves on one set of axes
fig, ax = plt.subplots(figsize=(10, 6))
epochs = range(1, num_epochs + 1)
ax.plot(epochs, train_losses, label='Training Loss')
ax.plot(epochs, val_losses, label='Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss over Epochs')
ax.legend()
ax.grid(True)
plt.show()

In [10]:
# baseline: reconstruct every sample as the per-feature mean of the whole
# matrix and score it with the same loss function used for the model
mean_values = torch.mean(data, dim=0)
mean_predictions = mean_values.expand(data.shape)
mean_loss = loss_fn(mean_predictions, data)
print('MSE loss for predicting average value:', mean_loss.item())

MSE loss for predicting average value: 9.174252772936597e-05


In [10]:
# switch the model to evaluation mode; the call returns the module, so the
# cell output is its layer summary
model.eval()

Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=532446, out_features=1500, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1500, out_features=300, bias=True)
    (3): ReLU()
    (4): Linear(in_features=300, out_features=100, bias=True)
    (5): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=100, out_features=300, bias=True)
    (1): ReLU()
    (2): Linear(in_features=300, out_features=1500, bias=True)
    (3): ReLU()
    (4): Linear(in_features=1500, out_features=532446, bias=True)
    (5): ReLU()
  )
)

In [19]:
# torch.save(model.state_dict(), 'autoencoder_15_1000.pth') #TODO: change path (if necessary)