# Fetch Assessment

In [1]:
from utils import *
from dataset import *

In [2]:
dataset = TaskDataset(data=None)

Using cache found in C:\Users\yashv/.cache\torch\hub\huggingface_pytorch-transformers_main


# Task 1
Sentence Embedding Model, architecture:
 - Embedding Layer 
 - Positional Encoding
 - Transformer Encoder Layer
 - Mean Pooling
 - Linear Layer (generates embedding)

In [3]:
# initialize models
n_layers = 1 # The number of transformer encoder layers
d_model = 512 # The input embedding dimension
embed_size = 300 # The output embedding size
nhead = 8 # The number of attention heads in encoder layer
task_type = "se" # The type of task (sentence embedding or sentiment classification)
vocab_size = dataset.vocab_size # The size of the vocabulary (using hugging face bert tokenizer)
base_model, se_task_model = init_model(vocab_size, n_layers, d_model, nhead, embed_size, task_type)

In [4]:
# Generate sample input
input_data = torch.randint(0, dataset.vocab_size, (2, 100))

# Forward pass
base_output = base_model(input_data)
task_output = se_task_model(base_output)

print("Base model output:", base_output.shape)
print("Task model output:", task_output.shape)

# Generate input based on text
text1 = "How, are you?"
text2 = "I am good"

input_data1 = dataset.encode(text1)
input_data2 = dataset.encode(text2)

# Forward pass
base_output1 = base_model(input_data1)
base_output2 = base_model(input_data2)
task_output1 = se_task_model(base_output1)
task_output2 = se_task_model(base_output2)

print("Base model output 1:", base_output1.shape)
print("Base model output 2:", base_output2.shape)
print("Task model output 1:", task_output1.shape)   
print("Task model output 2:", task_output2.shape)
print("Task model output 2:", task_output2)

Base model output: torch.Size([2, 100, 512])
Task model output: torch.Size([2, 300])
Base model output 1: torch.Size([1, 7, 512])
Base model output 2: torch.Size([1, 5, 512])
Task model output 1: torch.Size([1, 300])
Task model output 2: torch.Size([1, 300])
Task model output 2: tensor([[ 2.9115e-02,  3.7576e-03,  1.6875e-02, -4.3329e-02, -2.3840e-02,
         -3.0817e-02,  4.4260e-02,  4.8819e-02, -5.7285e-02, -5.8882e-02,
         -4.9020e-02, -1.0446e-01,  1.0870e-03, -1.1216e-02, -3.4832e-02,
         -4.2444e-02, -1.3106e-02, -6.3622e-03, -6.6139e-02,  2.4911e-02,
         -2.6597e-02,  8.5434e-02,  8.7531e-02,  6.6500e-02,  6.1909e-02,
         -9.4715e-02,  6.7242e-02, -5.1707e-02, -5.5728e-02, -5.6037e-03,
         -3.4221e-02,  8.2079e-02, -3.9183e-03, -5.8341e-02, -2.9514e-02,
          2.6526e-02,  8.2539e-02,  9.0273e-02,  6.7931e-03,  2.2955e-02,
          3.4709e-02,  7.3479e-02, -7.0017e-02,  4.3275e-02,  9.1688e-02,
         -6.9128e-02,  1.9761e-02, -7.2405e-02,  8.211

# Task 2
Sentiment Classification Model, architecture:
 - Embedding Layer
 - Positional Encoding
 - Transformer Encoder Layer
 - Mean Pooling
 - Linear Layer (3 classes, Positive, Negative, Neutral) ( can be trained using cross entropy loss)

In [5]:
task_type = "sc"
base_model, sc_task_model = init_model(vocab_size, n_layers, d_model, nhead, embed_size, task_type)

In [6]:
# Generate sample input
input_data = torch.randint(0, dataset.vocab_size, (2, 100))

# Forward pass
base_output = base_model(input_data)
task_output = sc_task_model(base_output)
task_output = nn.Softmax(dim=1)(task_output).detach().numpy()

print("Base model output:", base_output.shape)
print("Task model output:", task_output.shape)
print("Task model output:", task_output)

# Generate input based on text
text1 = "How, are you?"
text2 = "I am good"

input_data1 = dataset.encode(text1)
input_data2 = dataset.encode(text2)

# Forward pass
base_output1 = base_model(input_data1)
base_output2 = base_model(input_data2)
task_output1 = sc_task_model(base_output1)
task_output2 = sc_task_model(base_output2)

task_output1 = nn.Softmax(dim=1)(task_output1).detach().numpy()
task_output2 = nn.Softmax(dim=1)(task_output2).detach().numpy()

print("Base model output 1:", base_output1.shape)
print("Base model output 2:", base_output2.shape)
print("Task model output 1:", task_output1.shape)   
print("Task model output 2:", task_output2.shape)

Base model output: torch.Size([2, 100, 512])
Task model output: (2, 3)
Task model output: [[0.32383433 0.38363796 0.2925277 ]
 [0.32741138 0.39954418 0.27304447]]
Base model output 1: torch.Size([1, 7, 512])
Base model output 2: torch.Size([1, 5, 512])
Task model output 1: (1, 3)
Task model output 2: (1, 3)


# Task 4

Assuming we are training the sentence embedding model, and the sentiment classification model, simulatenously. Then each model can have different learning rates, as well for embedding in transformer will add different learning rate.

* Embedding Layer: Set a learning rate of 1e-3.
  
  -  This is a low-level layer responsible for capturing foundational features, so a finer learning rate helps prevent drastic changes that could drift away from good initial representations.
* Transformer Layer: Use a learning rate of 5e-3.
   
   - As a middle layer, this part of the model focuses on generalizing patterns from the dataset. A slightly higher learning rate allows it to adapt faster, helping it develop better generalizations over time.
* Sentence Embedding and Sentiment Classification Layers: Set a learning rate of 1e-2.
  - These layers capture task-specific, high-level features crucial for task understanding. A higher learning rate here allows faster learning of these details, which is important to avoid long convergence times.

In [7]:
print(base_model.named_parameters)

<bound method Module.named_parameters of BaseModel(
  (embed): Embedding(30522, 512)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
    )
    (linear1): Linear(in_features=512, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=512, bias=True)
    (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)


In [8]:
import torch.optim as optim

In [9]:
parameters = []
base_model_transformer_lr = 5e-3
base_model_embedding_lr = 1e-3

for name, param in base_model.named_parameters():
    if "embed" in name:
        parameters.append({"params": param, "lr": base_model_embedding_lr})
    else:
        parameters.append({"params": param, "lr": base_model_transformer_lr})

se_lr = 1e-2
sc_lr = 1e-2

parameters.append(
    {"params" : se_task_model.parameters(), "lr" : se_lr}
)

parameters.append(
    {"params" : sc_task_model.parameters(), "lr" : sc_lr}
)

optimizer = optim.Adam(parameters)