In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, TensorDataset, random_split
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# define the autoencoder (with more layers this time)
class Autoencoder(nn.Module):
    """Fully connected autoencoder with mirrored encoder and decoder stacks.

    The encoder maps ``input_size -> first_hidden_layer_size ->
    second_hidden_layer_size -> latent_size``; the decoder walks the same
    widths in reverse. Every ``nn.Linear`` (including the last one of each
    stack) is followed by ``nn.ReLU``, so both the latent code and the
    reconstruction are non-negative.

    Args:
        input_size: Number of features in each input vector.
        first_hidden_layer_size: Width of the layer next to the input/output.
        second_hidden_layer_size: Width of the layer next to the latent code.
        latent_size: Width of the latent code produced by the encoder.
    """

    def __init__(self, input_size, first_hidden_layer_size, second_hidden_layer_size, latent_size):
        super().__init__()
        widths = (input_size, first_hidden_layer_size, second_hidden_layer_size, latent_size)
        # Encoder is built before the decoder so parameter creation order
        # (and therefore seeded initialisation) is encoder first.
        self.encoder = self._linear_relu_stack(widths)
        self.decoder = self._linear_relu_stack(widths[::-1])

    @staticmethod
    def _linear_relu_stack(widths):
        """Return an ``nn.Sequential`` of Linear+ReLU pairs joining consecutive widths."""
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        return nn.Sequential(*layers)

    def forward(self, x):
        """Encode ``x`` to the latent code and return its reconstruction."""
        return self.decoder(self.encoder(x))

In [3]:
# Load the unnormalized feature matrix from a comma-separated file.
# `data_np` and `data` start out bound to the same array object.
data = data_np = np.loadtxt('feature_matrix_path.csv', delimiter=',') #TODO: change path

In [4]:
# Display the loaded feature matrix (still unnormalized at this point).
data

array([[ 21.,   0.,   0., ...,   0.,   0.,  12.],
       [ 10.,   3.,   0., ...,   1.,   0.,   9.],
       [  8.,   0.,   0., ...,   1.,   1.,   5.],
       ...,
       [ 30.,   1.,   0., ...,   1.,   1.,  11.],
       [618.,   1.,   3., ...,   1.,   0.,  21.],
       [ 20.,   2.,   0., ...,   0.,   0.,  18.]])

In [5]:
# normalize feature matrix by row (sample): divide every row by its own maximum.
# Read from `data_np` (the untouched loaded matrix) rather than `data`, so this
# cell gives the same result when re-run, even after `data` has been rebound.
max_values = data_np.max(axis=1)
# A row whose maximum is 0 would be divided 0/0 and become all-NaN, which would
# then propagate into the embeddings. Dividing such rows by 1 keeps them as zeros.
safe_max_values = np.where(max_values == 0, 1.0, max_values)
data = data_np / safe_max_values[:, None]

data

array([[2.47058824e-01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.41176471e-01],
       [5.55555556e-02, 1.66666667e-02, 0.00000000e+00, ...,
        5.55555556e-03, 0.00000000e+00, 5.00000000e-02],
       [4.54545455e-02, 0.00000000e+00, 0.00000000e+00, ...,
        5.68181818e-03, 5.68181818e-03, 2.84090909e-02],
       ...,
       [3.12500000e-01, 1.04166667e-02, 0.00000000e+00, ...,
        1.04166667e-02, 1.04166667e-02, 1.14583333e-01],
       [3.63614968e-02, 5.88373735e-05, 1.76512120e-04, ...,
        5.88373735e-05, 0.00000000e+00, 1.23558484e-03],
       [1.35135135e-01, 1.35135135e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.21621622e-01]])

In [9]:
# load the model we're using
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Layer sizes must match the tensor shapes stored in the checkpoint, otherwise
# load_state_dict raises on the mismatch.
input_size = data.shape[1]
first_hidden_layer_size = 1500
second_hidden_layer_size = 300
latent_size = 100
model = Autoencoder(input_size, first_hidden_layer_size, second_hidden_layer_size, latent_size)
# map_location="cpu" lets a checkpoint saved from a GPU run load on a CPU-only
# machine, and avoids holding a second copy of the weights on the GPU; the
# model is moved to `device` right after.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
model.load_state_dict(torch.load('autoencoder_15_1000.pth', map_location="cpu")) #TODO: change path (if necessary)
model.to(device)
# eval() puts the module in inference mode and returns it, so the cell displays the architecture.
model.eval()

Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=532446, out_features=1500, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1500, out_features=300, bias=True)
    (3): ReLU()
    (4): Linear(in_features=300, out_features=100, bias=True)
    (5): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=100, out_features=300, bias=True)
    (1): ReLU()
    (2): Linear(in_features=300, out_features=1500, bias=True)
    (3): ReLU()
    (4): Linear(in_features=1500, out_features=532446, bias=True)
    (5): ReLU()
  )
)

In [10]:
# convert to tensor so it can be inputted
# The full matrix stays in host memory; only the current batch is moved to
# `device` inside the loop, so the dataset never has to fit in GPU memory next
# to the model weights. `as_tensor` also lets this cell be re-run when `data`
# is already a tensor, without the copy-construct warning torch.tensor emits.
data = torch.as_tensor(data, dtype=torch.float)

# create dataset
dataset = TensorDataset(data, data)  # the first arg is the input, the second is the target. for autoencoder they're the same
# shuffle=False keeps the embeddings in the same order as the rows of `data`
embedddings_dataloarder = DataLoader(dataset, batch_size=1, shuffle=False)
embeddings = []
with torch.no_grad():  # inference only: no gradients needed
    for inputs, _ in embedddings_dataloarder:
        inputs = inputs.to(device)
        # batch_size is 1, so [0] drops the batch dimension and leaves a single latent vector
        encoded_data = model.encoder(inputs)[0]
        # collect on the CPU so the list can be converted to a DataFrame afterwards
        embeddings.append(encoded_data.cpu())

print(len(embeddings))

1683


In [12]:
# embeddings

In [None]:
# Build a table with one row per sample and one column per latent dimension.
embedding_rows = [vector.tolist() for vector in embeddings]
embeddings_df = pd.DataFrame(embedding_rows)

# The Dim_1..Dim_N labels are applied only to the written file; the DataFrame
# itself keeps its default integer column labels.
column_names = [f"Dim_{i+1}" for i in range(embeddings_df.shape[1])]
embeddings_df.to_csv(
    'embeddings_output_path.csv',  #TODO: change path
    header=column_names,
    index=False,
)