In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [3]:
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim import corpora, models, similarities, downloader

In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/taru/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
print("Downloading word2vec model")
word2vec_model = gensim.downloader.load('word2vec-google-news-300')

Downloading word2vec model


In [14]:
from nltk.tokenize import word_tokenize

In [15]:
def tokenize(text):
    text = text.lower()
    tokens = word_tokenize(text)
    return tokens

In [16]:
def wordToVector(word, model):
    try:
        return model[word]
    except KeyError:
        # Handle out-of-vocabulary words
        return np.zeros(model.vector_size)  # Return zero vector for OOV words

In [17]:
def textToVectors(text, model):
    tokens = tokenize(text)  # Tokenize the text
    vectors = [wordToVector(token, model) for token in tokens]  # Convert words to vectors
    vectors = np.array(vectors)
    return vectors

Creating a skip connection auto-encoder architecture. 

In [28]:
class SkipConnectionEncoder(nn.Module):

    def __init__(self, input_shape, compression_size):

        super().__init__()

        # Assuming the embeddings are of dimensions 300 and each sentence has 10 words/tokens
        num_of_words = input_shape[0]
        embedding_dim = input_shape[1]

        # Calculate the input size for the MLP
        mlp_input_size = embedding_dim * num_of_words

        # Define layers for the encoder
        self.layer1 = nn.Linear(mlp_input_size, 2400)
        self.layer2 = nn.Linear(2400, 1500)
        self.layer3 = nn.Linear(1500, compression_size)
        
        # Activation function
        self.relu = nn.ReLU()
        


    def forward(self, features):

        # Flatten the input features
        features_flat = features.flatten()

        features_flat = torch.tensor(features_flat, dtype=torch.float32)
        
        # Pass through the first layer
        out1 = self.layer1(features_flat)
        a1 = self.relu(out1)
        
        # Pass through the second layer
        out2 = self.layer2(a1)
        a2 = self.relu(out2)

        # Pass through the third layer
        out3 = self.layer3(a2)
    
        # Return the outputs of each layer
        return out3, out2, out1


In [32]:
class SkipConnectionDecoder(nn.Module):

    def __init__(self, input_size, output_shape):
        super().__init__()

        # Assuming the embeddings are of dimensions 300 and each sentence has 10 words/tokens
        num_of_words = output_shape[0]
        embedding_dim = output_shape[1]

        # Calculate the output size for the MLP
        mlp_output_size = embedding_dim * num_of_words

        # Define layers for the decoder
        self.layer1 = nn.Linear(input_size, 1500)
        self.layer2 = nn.Linear(1500, 2400)
        self.layer3 = nn.Linear(2400, mlp_output_size)
        
        # Activation function
        self.relu = nn.ReLU()


    def forward(self, x1, x2, x3):

        # Pass through the first layer
        out1 = self.layer1(x1)
        a1 = self.relu(out1)

        # Skip connection: add input from previous layer and pass through the second layer
        out2 = self.layer2(a1 + x2)
        a2 = self.relu(out2)

        # Skip connection: add input from previous layer and pass through the third layer
        out3 = self.layer3(a2 + x3)

        # Reshape the final output into a 300-dimensional embedding for 10 words
        out = out3.view(13, 300)

        return out

In [33]:
class Autoencoder(nn.Module):
    
    def __init__(self, input_shape, compression_size):
        super().__init__()
        
        self.input_size = input_shape

        self.encoder = SkipConnectionEncoder(input_shape, compression_size)
        self.decoder = SkipConnectionDecoder(compression_size, input_shape)
        self.relu = nn.ReLU()
        
        

    def forward(self, features):

        # implementing auto-encoder.
        
        encoded, _ , _ = self.encoder(features)
        activatedEncoded = self.relu(encoded)
        # in order to implement variational auto-encoders, the forward pass should return the mean and the variance as well.
        # this will in turn be passed to the loss function which uses them to calculate the kl divergence.
        decoded = self.decoder(activatedEncoded, 0, 0)

        # out1 is the output for the encoder, out2 is the output for the decoder.
        
        return decoded, encoded

In [74]:
# testing that the autoencoder works.

dummy_vector = textToVectors("The quick brown fox jumped. Over the lazy, brown Dog!", word2vec_model)

print(dummy_vector.shape)

autoencoder = Autoencoder(dummy_vector.shape, 900)
decoded, encoded = autoencoder(torch.tensor(dummy_vector).float())

print(decoded)
print(decoded.shape)

print(encoded)
print(encoded.shape)

reshaped_encoded = encoded.view(3, 300)
print(reshaped_encoded.shape)

# pass the reshaped_encoded to a vector to word to see what words get produced in the compression layer.



(13, 300)
tensor([[-0.0283,  0.0053, -0.0251,  ...,  0.0180, -0.0127,  0.0172],
        [-0.0123, -0.0030, -0.0252,  ...,  0.0038,  0.0143, -0.0114],
        [-0.0005,  0.0066,  0.0160,  ..., -0.0108, -0.0147,  0.0153],
        ...,
        [ 0.0019, -0.0104,  0.0061,  ..., -0.0144, -0.0100, -0.0185],
        [ 0.0156, -0.0135, -0.0190,  ..., -0.0001,  0.0024, -0.0048],
        [-0.0111, -0.0171,  0.0168,  ..., -0.0294, -0.0014,  0.0034]],
       grad_fn=<ViewBackward0>)
torch.Size([13, 300])
tensor([ 2.8519e-02, -2.1042e-02, -1.8555e-02, -5.5807e-03,  1.2468e-02,
         2.2315e-03,  2.0965e-03, -1.2499e-03,  6.0861e-03,  1.5911e-02,
         4.1068e-02, -1.8818e-02, -2.7332e-02, -5.0532e-02,  4.3839e-02,
        -1.9399e-02,  2.3222e-02, -1.2101e-02,  2.7841e-03,  1.4913e-03,
        -1.2275e-03,  3.0557e-02, -1.7421e-02,  2.2639e-02,  1.0563e-02,
         2.1042e-02,  6.2973e-04, -1.8636e-02,  1.7675e-02, -6.6244e-03,
         1.8061e-02, -6.6992e-03,  6.7019e-03,  1.3053e-02,  1.7

  features_flat = torch.tensor(features_flat, dtype=torch.float32)


In [58]:
def vectorsToText(embeddings, word2vec_model):
    """
    Converts a list of embeddings back into words using the Word2Vec model.
    
    Args:
    - embeddings (list of numpy arrays): List of vector embeddings.
    - word2vec_model (Word2Vec): Gensim Word2Vec model.
    
    Returns:
    - words (list of str): List of words corresponding to the embeddings.
    """
    words = []
    for vector in embeddings:
        vector_np = vector.detach().numpy()
        # Normalize the vector to have unit length
        normalized_vector = vector_np / np.linalg.norm(vector_np)
        try:
            # Find the most similar word to the given normalized vector
            similar_word = word2vec_model.most_similar(positive=[normalized_vector], topn=1)[0][0]
            words.append(similar_word)
        except KeyError:
            # If the normalized vector doesn't correspond to any word, append a placeholder
            words.append("UNKNOWN_WORD")
    return words


In [59]:
words = vectorsToText(reshaped_encoded, word2vec_model)

print(words)

['Meiya', 'Leezza', 'Shiias']


In [60]:
import datasets
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [62]:
dataset = datasets.load_dataset("cnn_dailymail", '3.0.0')

Downloading readme: 100%|██████████| 15.6k/15.6k [00:00<00:00, 13.8MB/s]
Downloading data: 100%|██████████| 257M/257M [00:11<00:00, 22.6MB/s] 
Downloading data: 100%|██████████| 257M/257M [00:12<00:00, 20.6MB/s] 
Downloading data: 100%|██████████| 259M/259M [00:13<00:00, 19.8MB/s] 
Downloading data: 100%|██████████| 34.7M/34.7M [00:01<00:00, 20.1MB/s]
Downloading data: 100%|██████████| 30.0M/30.0M [00:01<00:00, 18.3MB/s]
Generating train split: 100%|██████████| 287113/287113 [00:01<00:00, 273195.07 examples/s]
Generating validation split: 100%|██████████| 13368/13368 [00:00<00:00, 260027.44 examples/s]
Generating test split: 100%|██████████| 11490/11490 [00:00<00:00, 260773.75 examples/s]


In [70]:
class MyDataset(Dataset):
    def __init__(self, dataset):
        self.articles = dataset['train']['article']
        self.summaries = dataset['train']['highlights']

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        # Extract the article and summary from the dataset
        article = self.articles[idx]
        summary = self.summaries[idx]
        return article, summary

# Instantiate your custom dataset using only the training set
custom_dataset = MyDataset(dataset)

# Create a PyTorch DataLoader
batch_size = 1
dataloader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)

print("Length of DataLoader:", len(dataloader))
print("Length of Custom Dataset:", len(custom_dataset))

Length of DataLoader: 287113
Length of Custom Dataset: 287113


In [71]:
from sklearn.decomposition import PCA

In [72]:
def compressVectorsPCA(data, target_count):
    pca = PCA(n_components=target_count)
    compressed_data = pca.fit_transform(data.T).T
    return compressed_data

In [95]:
def autoencoder_training(model, loss_function, optimizer, train_data, n_epochs, update_interval):
    """
    Function for training an autoencoder model.

    Args:
    - model (nn.Module): Autoencoder model.
    - loss_function: Loss function.
    - optimizer: Optimizer.
    - train_data (DataLoader): DataLoader for the training data.
    - n_epochs (int): Number of epochs for training.
    - update_interval (int): Interval for logging and updating losses.

    Returns:
    - model (nn.Module): Trained autoencoder model.
    - losses (list): List of losses during training.
    """
    losses = []

    for epoch in range(n_epochs):

        epoch_loss = 0.0
        batch_count = 0

        for articles, summaries in train_data:
            
            optimizer.zero_grad()

            # Convert articles to embeddings
            article_embedding = textToVectors(articles[0], word2vec_model)

            compressed_vector = compressVectorsPCA(article_embedding, 13)

            decoded, encoded = model(torch.tensor(compressed_vector).float())

            # Calculate loss
            loss = loss_function(decoded, torch.tensor(compressed_vector).float())

            # Backpropagation
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            batch_count += 1

            # Logging and updating losses
            if batch_count % update_interval == 0:
                losses.append(epoch_loss / batch_count)

        epoch_loss /= len(train_data)
        print(f"Epoch {epoch+1}/{n_epochs}, Loss: {epoch_loss:.4f}")

    return model, losses


In [92]:
from torch.utils.data import SubsetRandomSampler

In [93]:
subset_sampler = SubsetRandomSampler(range(100))

# Create a PyTorch DataLoader for the subset
subset_dataloader = DataLoader(custom_dataset, batch_size=batch_size, sampler=subset_sampler)

print("Length of Subset DataLoader:", len(subset_dataloader))

Length of Subset DataLoader: 100


In [96]:
autoencoder = Autoencoder((13,300), 900)

# Define the loss function and optimizer
loss_function = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)

# Train the autoencoder
n_epochs = 1
update_interval = 100
trained_autoencoder, losses = autoencoder_training(autoencoder, loss_function, optimizer, subset_dataloader, n_epochs, update_interval)

  features_flat = torch.tensor(features_flat, dtype=torch.float32)
  features_flat = torch.tensor(features_flat, dtype=torch.float32)
  features_flat = torch.tensor(features_flat, dtype=torch.float32)
  features_flat = torch.tensor(features_flat, dtype=torch.float32)
  features_flat = torch.tensor(features_flat, dtype=torch.float32)
  features_flat = torch.tensor(features_flat, dtype=torch.float32)
  features_flat = torch.tensor(features_flat, dtype=torch.float32)
  features_flat = torch.tensor(features_flat, dtype=torch.float32)
  features_flat = torch.tensor(features_flat, dtype=torch.float32)
  features_flat = torch.tensor(features_flat, dtype=torch.float32)
  features_flat = torch.tensor(features_flat, dtype=torch.float32)
  features_flat = torch.tensor(features_flat, dtype=torch.float32)
  features_flat = torch.tensor(features_flat, dtype=torch.float32)
  features_flat = torch.tensor(features_flat, dtype=torch.float32)
  features_flat = torch.tensor(features_flat, dtype=torch.floa

Epoch 1/1, Loss: 0.3930


  features_flat = torch.tensor(features_flat, dtype=torch.float32)


In [104]:
dummy_vector = textToVectors("The quick brown fox jumped. Over the lazy, brown Dog!", word2vec_model)

compressed_vector = compressVectorsPCA(dummy_vector, 13)

print(compressed_vector.shape)

print(vectorsToText(torch.tensor(compressed_vector), word2vec_model))

decoded, encoded = trained_autoencoder(torch.tensor(dummy_vector).float())

print(decoded)
print(decoded.shape)

print(encoded)
print(encoded.shape)

reshaped_encoded = encoded.view(3, 300)
print(reshaped_encoded.shape)

words = vectorsToText(reshaped_encoded, word2vec_model)

print(words)

(13, 300)


  normalized_vector = vector_np / np.linalg.norm(vector_np)
  features_flat = torch.tensor(features_flat, dtype=torch.float32)


['brown', '----------_-----------------------------------------------_GS##', 'dog', 'lazy', 'fox', 'quick', 'ERNEST_DOROSZUK_QMI_AGENCY', 'over', 'Mark_Kornblau_spokesman', 'mythological', 'parapsychologist', 'unleashing_torrents', '</s>']
tensor([[ 0.0326,  0.0605,  0.0561,  ..., -0.0821,  0.0632,  0.0150],
        [-0.0026, -0.0166,  0.0062,  ...,  0.0215,  0.0145,  0.0138],
        [-0.0053,  0.0044, -0.0231,  ...,  0.0009, -0.0107,  0.0188],
        ...,
        [ 0.0173,  0.0100, -0.0143,  ...,  0.0020,  0.0114,  0.0013],
        [ 0.0068,  0.0026,  0.0118,  ...,  0.0084, -0.0032,  0.0110],
        [-0.0324, -0.0039, -0.0061,  ...,  0.0069, -0.0196,  0.0074]],
       grad_fn=<ViewBackward0>)
torch.Size([13, 300])
tensor([-6.7036e-02, -1.5886e-01, -4.2261e-02, -6.0529e-02, -1.5233e-01,
        -7.3305e-02, -8.8234e-02, -1.1837e-01, -1.3985e-01, -7.4195e-02,
        -1.9743e-02, -8.4526e-02, -1.1526e-01, -7.7245e-02,  4.2092e-02,
        -1.1828e-01, -5.8712e-02, -1.0940e-01, -1.351

In [121]:
# testing skip connections

encoder = trained_autoencoder.encoder
decoder = trained_autoencoder.decoder

encoded, out2, out1 = encoder(torch.tensor(dummy_vector).float())

print(encoded.shape)
print(out2.shape)
print(out1.shape)

print(vectorsToText(out2.view(5, 300), word2vec_model))
print(vectorsToText(out1.view(8, 300), word2vec_model))

# passing skipped values to the encoder and seeing if the output is the same as the original encoded value.

result1 = decoder(encoded, 0, 0)
print(result1.shape)
print(vectorsToText(result1, word2vec_model))

zer1 = torch.zeros_like(encoded)
zer2 = torch.zeros_like(out2)
zer3 = torch.zeros_like(out1)

result2 = decoder(zer1, out2, zer3)
print(result2.shape)
print(vectorsToText(result2, word2vec_model))

result3 = decoder(zer1, zer2, out1)
print(result3.shape)
print(vectorsToText(result3, word2vec_model))



  features_flat = torch.tensor(features_flat, dtype=torch.float32)


torch.Size([900])
torch.Size([1500])
torch.Size([2400])
['Korangi', 'BSF_Jawans', 'st_Airborne_Division', 'Korangi', 'Korangi']
['Laflèche', 'wallet_sized_passcards', 'judge_Caroline_Goulborn', 'Joe_Pytka', 'ChopHouse', 'nonhierarchical', 'greywacke', 'Taylor_Housewright']
torch.Size([13, 300])
['K.Kahne_###-###', 'Kerrick_Alumbaugh_pleaded', 'Aftel', 'gnc.com_bodybuilding.com_amazon.com', ',4', 'SHIPPINGPORT_Pa.', 'told', 'Cianci_Corrente', 'Maanzo', 'eating_Veggie_Booty', 'Unique_Motorcars', 'thismonth', 'By_ANNIE_YOUDERIAN']
torch.Size([13, 300])
['henry', 'M.Martin_###-###', 'julia', 'los_Estados_Unidos', 'inks_pact', 'Corp._nasdaq_SYMC', 'Miss_Subasi', 'constitutional_infirmity', 'want', 'Acute_Respiratory_Infections', 'Balou', 'suffering_brain_aneurysm', 'been']
torch.Size([13, 300])
['bolton', 'See_Minn._Stat', 'mart.com', 'supporting_TI_DSPs', 'progra_mme', 'Etsuko_Nomura_mother', 'lingering_animosity', 'BRONWYN_BISHOP', 'Hamdi_Shaqqura', 'Salmonellosis', 'underlay', 'generous_

Points to Consider:

-) Pre-processing the dataset so that we have converted the texts into a numpy array of M words times N dimensions based on which word2vector training model we are trying to use - utilize word_embedding.ipynb .

-) Once we have our trained model, we can pass the encodings of different layers into the appropriate decoding layers and plot how the loss compares relative to the complete process.

-) Finally , once we have trained our model, defining a function that takes our reshaped output words * dimensions vector embedding and converts it into the most similar words using word2vector pre-defined methods.