In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [None]:
import gensim
from gensim.models import Word2Vec
from gensim import corpora, models, similarities, downloader
import nltk
from nltk.tokenize import word_tokenize

Define an auto-encoder architecture with skip connections between the encoder and the decoder.

In [None]:
class Encoder(nn.Module):
    """MLP encoder that returns its latent code together with both hidden activations.

    The network is a stack of three linear layers (3000 -> 2400 -> 1500 ->
    ``compression_size``) with ReLU after the first two. The hidden
    activations are returned alongside the code so that a decoder can use
    them as skip connections.

    Args:
        input_shape: Accepted for interface symmetry but currently NOT used;
            the input width is fixed at 10 tokens x 300 embedding dims = 3000.
        compression_size: Width of the final (latent) layer.
    """

    def __init__(self, input_shape, compression_size):
        super().__init__()

        # The sentence layout is hard-coded: every example is assumed to be
        # 10 tokens, each represented by a 300-dimensional embedding.
        token_count, embed_dim = 10, 300
        flat_width = int(embed_dim * token_count)

        self.layer1 = nn.Linear(flat_width, 2400)
        self.layer2 = nn.Linear(2400, 1500)
        self.layer3 = nn.Linear(1500, compression_size)

        self.relu = nn.ReLU()

        # Collapses every dimension after the batch dimension into one.
        self.flatten = nn.Flatten()

    def forward(self, features):
        """Encode a batch of inputs.

        Args:
            features: Tensor whose non-batch dimensions flatten to 3000 values.

        Returns:
            Tuple ``(code, hidden_mid, hidden_wide)``: the output of the last
            layer (no activation), the post-ReLU output of the 1500-unit
            layer, and the post-ReLU output of the 2400-unit layer.
        """
        flat_input = self.flatten(features)

        hidden_wide = self.relu(self.layer1(flat_input))
        hidden_mid = self.relu(self.layer2(hidden_wide))
        code = self.layer3(hidden_mid)

        # Deepest representation first, shallowest last.
        return code, hidden_mid, hidden_wide


In [None]:
class Decoder(nn.Module):
    """MLP decoder with additive skip connections.

    The network is a stack of three linear layers (``input_size`` -> 1500 ->
    2400 -> 3000) with ReLU after the first two. Before the second and third
    layers an externally supplied activation is added to the running hidden
    state.

    Args:
        input_size: Width of the latent code fed to the first layer.
        output_shape: Accepted for interface symmetry but currently NOT used;
            the output is always reshaped to ``(batch, 10, 300)``.
    """

    def __init__(self, input_size, output_shape):
        super().__init__()

        # Output layout is hard-coded: 10 tokens x 300 embedding dimensions.
        token_count, embed_dim = 10, 300
        flat_width = int(embed_dim * token_count)

        self.layer1 = nn.Linear(input_size, 1500)
        self.layer2 = nn.Linear(1500, 2400)
        self.layer3 = nn.Linear(2400, flat_width)

        self.relu = nn.ReLU()

    def forward(self, x1, x2, x3):
        """Decode a latent code, mixing in two skip activations.

        Args:
            x1: Latent code, last dimension ``input_size``.
            x2: Skip activation added to the 1500-wide hidden state.
            x3: Skip activation added to the 2400-wide hidden state.

        Returns:
            Tensor of shape ``(batch, 10, 300)``.
        """
        hidden_mid = self.relu(self.layer1(x1))

        # First skip connection: add before expanding to 2400 units.
        hidden_wide = self.relu(self.layer2(hidden_mid + x2))

        # Second skip connection: add before the output projection.
        flat_out = self.layer3(hidden_wide + x3)

        # Restore the (tokens, embedding) layout: 10 words x 300 dimensions.
        return flat_out.view(flat_out.size(0), 10, 300)

In [None]:
class Autoencoder(nn.Module):
    """Skip-connection auto-encoder built from ``Encoder`` and ``Decoder``.

    Args:
        input_shape: Forwarded to ``Encoder`` (as ``input_shape``) and to
            ``Decoder`` (as ``output_shape``); also stored as ``input_size``.
        compression_size: Width of the latent code between encoder and decoder.
    """

    def __init__(self, input_shape, compression_size):
        super().__init__()

        self.input_size = input_shape

        self.encoder = Encoder(input_shape, compression_size)
        self.decoder = Decoder(compression_size, input_shape)
        self.relu = nn.ReLU()

    def forward(self, features):
        """Encode ``features`` and reconstruct them through the decoder.

        Args:
            features: Input batch, passed unchanged to the encoder.

        Returns:
            Tuple ``(reconstruction, encoded)`` where ``reconstruction`` is the
            decoder output and ``encoded`` is the encoder's raw return value,
            i.e. the tuple ``(latent, skip_mid, skip_wide)``.
        """
        # The encoder returns three tensors: the latent code plus the two
        # hidden activations that act as skip connections. They have to be
        # unpacked -- ReLU cannot be applied to the tuple itself, and the
        # decoder takes the three tensors as separate arguments.
        encoded = self.encoder(features)
        latent, skip_mid, skip_wide = encoded

        # Only the latent code needs the activation here; the skip tensors
        # are already post-ReLU when they leave the encoder.
        reconstruction = self.decoder(self.relu(latent), skip_mid, skip_wide)

        # TODO: this is a plain (deterministic) auto-encoder. A variational
        # version would additionally have to return the mean and variance so
        # the loss function can compute the KL-divergence term.
        return reconstruction, encoded

In [None]:
def autoencoder_training(model, loss_function, optimizer, train_data, n_epochs, update_interval):
    """Train ``model`` to reconstruct its own input.

    Args:
        model: Callable module. It may return either the reconstruction
            tensor directly or a tuple/list whose first element is the
            reconstruction (any further elements are ignored here).
        loss_function: Callable ``loss_function(prediction, target)`` that
            returns a scalar tensor.
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_data: Iterable of input batches; it is iterated once per epoch,
            and each batch is used as both input and target.
        n_epochs: Number of passes over ``train_data``.
        update_interval: A loss value is recorded on every batch whose index
            within the epoch is a multiple of this number.

    Returns:
        Tuple ``(model, losses)`` where ``losses`` is the list of recorded
        loss values, each rounded to two decimals.
    """
    # The file-level `import tqdm` binds the *module*, which is not callable,
    # so the progress-bar callable is imported here. The bar is purely
    # cosmetic, so training still proceeds if tqdm is not installed.
    try:
        from tqdm.auto import tqdm as progress_bar
    except ImportError:
        def progress_bar(iterable, **_unused):
            return iterable

    losses = []

    for n in range(n_epochs):
        for i, batch in enumerate(progress_bar(train_data, desc=f"epoch {n + 1}/{n_epochs}")):

            optimizer.zero_grad()

            # A model may return extra values next to the reconstruction
            # (e.g. encoder activations); only the first is the prediction.
            output = model(batch)
            ex_pred = output[0] if isinstance(output, (tuple, list)) else output

            # Auto-encoding: the target is the input itself.
            ex_label = batch

            loss = loss_function(ex_pred, ex_label)

            loss.backward()

            optimizer.step()

            if i % update_interval == 0:
                losses.append(round(loss.item(), 2))  # collected for plotting

    return model, losses

In [None]:
# Build the model, loss and optimizer, then run the training loop.
# The dataset pre-processing that makes the data model-ready belongs before this point.
# NOTE(review): `lr`, `train_data`, `n_epochs` and `update_interval` are not
# defined in this cell -- confirm they are set earlier in the notebook.

model = Autoencoder(input_shape=3000, compression_size=900)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

model, losses = autoencoder_training(
    model=model,
    loss_function=loss_function,
    optimizer=optimizer,
    train_data=train_data,
    n_epochs=n_epochs,
    update_interval=update_interval,
)

# testing and validation later on.

Points to Consider:

- Pre-process the dataset so that each text is converted into a NumPy array of M words × N dimensions, where N depends on which word2vec model we use (see word_embedding.ipynb).

- Once the model is trained, pass the encodings from the different encoder layers into the corresponding decoder layers and plot how the resulting loss compares with the loss of the complete process.

- Finally, once the model is trained, define a function that takes the reshaped (words × dimensions) output embedding and converts it into the most similar words using word2vec's built-in similarity methods.