In [1]:
import os
# Run the notebook from the repository root (presumably so that `from src import ...`
# and relative paths such as cache_dir="data" resolve against it — confirm).
# The location can be overridden with the RECSYS_BENCHMARKS_ROOT environment
# variable; the original hard-coded path stays as the fallback so existing
# setups keep working unchanged.
os.chdir(
    os.environ.get(
        "RECSYS_BENCHMARKS_ROOT",
        "/Users/yenchenchou/Documents/GitHub/recsys-benchmarks",
    )
)

In [2]:
import random
import polars as pl
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets

from src import EnvInit

In [5]:
# EnvInit is a project helper imported from `src`; its implementation is not
# visible in this notebook. Presumably fix_seed seeds the random number
# generators for reproducibility and returns the seed it used — TODO confirm.
env_init = EnvInit()
seed = env_init.fix_seed(12345)

In [7]:
class MyRNN(nn.Module):
    """Single-step Elman RNN cell built from raw ``nn.Parameter`` tensors.

    One call computes ``h_1 = tanh(x @ W_ih^T + b_ih + h_0 @ W_hh^T + b_hh)``.
    There is no loop over time inside ``forward``: to process a sequence the
    caller feeds the returned hidden state back in as ``h_0`` for the next step.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Number of features in the hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # Weights are drawn from a standard normal; biases start at zero.
        self.w_ih = nn.Parameter(torch.randn(hidden_size, input_size))
        self.b_ih = nn.Parameter(torch.zeros(hidden_size))
        self.w_hh = nn.Parameter(torch.randn(hidden_size, hidden_size))
        self.b_hh = nn.Parameter(torch.zeros(hidden_size))

    def forward(self, x, h_0):
        """Apply one recurrent step.

        Args:
            x: Input tensor whose last dimension is ``input_size``.
            h_0: Previous hidden state whose last dimension is ``hidden_size``;
                its projection must broadcast against the projected input.

        Returns:
            Tuple ``(output, h_1)``. For a single step both entries are the
            new hidden state ``tanh(input projection + recurrent projection)``.
        """
        input_proj = x @ self.w_ih.T + self.b_ih
        hidden_proj = h_0 @ self.w_hh.T + self.b_hh
        h_1 = torch.tanh(input_proj + hidden_proj)
        # Return the activated state as the hidden state. Previously only the
        # recurrent projection of h_0 was returned, which does not depend on x,
        # so feeding it back never carried any input information forward.
        return h_1, h_1


class MyRNNV2(nn.Module):
    """Single-step Elman RNN cell expressed with two bias-free ``nn.Linear`` layers.

    One call computes ``h_1 = tanh(ih(x) + hh(h_0))``. As with a hand-written
    cell, there is no loop over time inside ``forward``; the caller passes the
    returned hidden state back in as ``h_0`` for the next step.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Number of features in the hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.ih = nn.Linear(input_size, hidden_size, bias=False)
        self.hh = nn.Linear(hidden_size, hidden_size, bias=False)

    def forward(self, x, h_0):
        """Apply one recurrent step.

        Args:
            x: Input tensor whose last dimension is ``input_size``.
            h_0: Previous hidden state whose last dimension is ``hidden_size``;
                its projection must broadcast against the projected input.

        Returns:
            Tuple ``(output, h_1)``. For a single step both entries are the
            new hidden state ``tanh(ih(x) + hh(h_0))``.
        """
        input_proj = self.ih(x)
        hidden_proj = self.hh(h_0)
        h_1 = torch.tanh(input_proj + hidden_proj)
        # Return the activated state as the hidden state. Previously only
        # hh(h_0) was returned, which does not depend on x, so feeding it back
        # never carried any input information forward.
        return h_1, h_1


class MyRNNV3(nn.Module):
    """Thin wrapper that delegates the whole recurrence to ``nn.RNN``."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size)

    def forward(self, x, h_0):
        """Run ``self.rnn`` on ``x`` starting from ``h_0``.

        Returns the ``(output, final hidden state)`` pair produced by
        ``nn.RNN`` without modification.
        """
        return self.rnn(x, h_0)


# Build the three variants with the same dimensions. Keep this construction
# order: each constructor draws its initial weights from the global RNG.
input_size = 2
hidden_size = 4
model_v3 = MyRNNV3(input_size, hidden_size)
model_v2 = MyRNNV2(input_size, hidden_size)
model = MyRNN(input_size, hidden_size)

# Give MyRNN the same weights as MyRNNV2 (MyRNN's biases are already zero and
# MyRNNV2 has no biases). Copy in place under no_grad rather than rebinding
# `.data`, matching how MyModule loads its weights. model_v3 keeps its own
# nn.RNN initialisation, so its output is not expected to match the other two.
with torch.no_grad():
    model.w_ih.copy_(model_v2.ih.weight)
    model.w_hh.copy_(model_v2.hh.weight)

# Constant all-ones input of shape (seq_length, batch, input_size) and an
# all-zeros initial hidden state of shape (1, batch, hidden_size).
x = torch.ones(2, 3, input_size)
h_0 = torch.zeros(1, 3, hidden_size)
output_v3, _ = model_v3(x, h_0)
output_v2, _ = model_v2(x, h_0)
output, _ = model(x, h_0)

print("Output from MyRNNV3:", output_v3)
print("Output from MyRNNV2:", output_v2)
print("Output from MyRNN:", output)

Output from MyRNNV3: tensor([[[-0.7878, -0.0865,  0.0545,  0.1137],
         [-0.7878, -0.0865,  0.0545,  0.1137],
         [-0.7878, -0.0865,  0.0545,  0.1137]],

        [[-0.8129,  0.2851, -0.1545,  0.3939],
         [-0.8129,  0.2851, -0.1545,  0.3939],
         [-0.8129,  0.2851, -0.1545,  0.3939]]], grad_fn=<StackBackward0>)
Output from MyRNNV2: tensor([[[ 0.1696,  0.7143,  0.5727, -0.7502],
         [ 0.1696,  0.7143,  0.5727, -0.7502],
         [ 0.1696,  0.7143,  0.5727, -0.7502]],

        [[ 0.1696,  0.7143,  0.5727, -0.7502],
         [ 0.1696,  0.7143,  0.5727, -0.7502],
         [ 0.1696,  0.7143,  0.5727, -0.7502]]], grad_fn=<TanhBackward0>)
Output from MyRNN: tensor([[[ 0.1696,  0.7143,  0.5727, -0.7502],
         [ 0.1696,  0.7143,  0.5727, -0.7502],
         [ 0.1696,  0.7143,  0.5727, -0.7502]],

        [[ 0.1696,  0.7143,  0.5727, -0.7502],
         [ 0.1696,  0.7143,  0.5727, -0.7502],
         [ 0.1696,  0.7143,  0.5727, -0.7502]]], grad_fn=<TanhBackward0>)


In [14]:
# An attribute created with nn.Parameter is reported as torch.nn.parameter.Parameter.
type(model.w_hh)

torch.nn.parameter.Parameter

In [12]:
# The Parameter repr shows the values together with requires_grad=True.
model.w_hh

Parameter containing:
tensor([[-0.2846,  0.2336, -0.4755, -0.2369],
        [ 0.1993,  0.4679,  0.3254,  0.2068],
        [-0.3531,  0.1518, -0.1821,  0.0290],
        [-0.0640,  0.4679,  0.3262,  0.0523]], requires_grad=True)

In [15]:
# `.data` exposes the values as a plain torch.Tensor rather than a Parameter.
type(model.w_hh.data)

torch.Tensor

In [13]:
# Same values as the Parameter, but the repr no longer carries requires_grad.
model.w_hh.data

tensor([[-0.2846,  0.2336, -0.4755, -0.2369],
        [ 0.1993,  0.4679,  0.3254,  0.2068],
        [-0.3531,  0.1518, -0.1821,  0.0290],
        [-0.0640,  0.4679,  0.3262,  0.0523]])

In [16]:
class MyModule(nn.Module):
    """Single linear layer whose weight matrix is initialised from ``weights``.

    ``weights`` is read as ``(out_features, in_features)``; the bias keeps the
    initial value that ``nn.Linear`` gives it.
    """

    def __init__(self, weights):
        super().__init__()

        out_features, in_features = weights.shape[0], weights.shape[1]
        self.linear = nn.Linear(in_features, out_features)
        # Overwrite the freshly initialised weight in place with autograd disabled.
        with torch.no_grad():
            self.linear.weight.copy_(weights)

    def forward(self, x):
        """Apply the linear layer to ``x``."""
        return self.linear(x)

# Build a 10x10 linear module from random weights and attach a plain SGD optimizer.
weights = torch.randn(10, 10)
model = MyModule(weights)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# One forward/backward pass on a random input populates `.grad` on each parameter.
out = model(torch.randn(1, 10))
out.mean().backward()

# Show the gradient stored on every parameter.
for name, param in model.named_parameters():
    print(name, param.grad)

# Snapshot the weight before and after a single optimizer step; the printed
# difference is the update that the step applied to the weight.
w0 = model.linear.weight.clone()
optimizer.step()
w1 = model.linear.weight.clone()
print(w1 - w0)

linear.weight tensor([[-0.0881,  0.1248,  0.0115,  0.0387,  0.2470,  0.0856,  0.0598,  0.0848,
          0.1373,  0.0611],
        [-0.0881,  0.1248,  0.0115,  0.0387,  0.2470,  0.0856,  0.0598,  0.0848,
          0.1373,  0.0611],
        [-0.0881,  0.1248,  0.0115,  0.0387,  0.2470,  0.0856,  0.0598,  0.0848,
          0.1373,  0.0611],
        [-0.0881,  0.1248,  0.0115,  0.0387,  0.2470,  0.0856,  0.0598,  0.0848,
          0.1373,  0.0611],
        [-0.0881,  0.1248,  0.0115,  0.0387,  0.2470,  0.0856,  0.0598,  0.0848,
          0.1373,  0.0611],
        [-0.0881,  0.1248,  0.0115,  0.0387,  0.2470,  0.0856,  0.0598,  0.0848,
          0.1373,  0.0611],
        [-0.0881,  0.1248,  0.0115,  0.0387,  0.2470,  0.0856,  0.0598,  0.0848,
          0.1373,  0.0611],
        [-0.0881,  0.1248,  0.0115,  0.0387,  0.2470,  0.0856,  0.0598,  0.0848,
          0.1373,  0.0611],
        [-0.0881,  0.1248,  0.0115,  0.0387,  0.2470,  0.0856,  0.0598,  0.0848,
          0.1373,  0.0611],
     

In [18]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Args:
        model_output: Indexable model output whose element 0 holds the token
            embeddings.
        attention_mask: Mask with one entry per token; it is expanded to the
            embedding shape and cast to float before use.

    Returns:
        The masked sum over dimension 1 divided by the per-feature mask count,
        with the count clamped to at least 1e-9 to avoid division by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = (token_embeddings * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_count


# Input sentences to embed
sentences = ['This is an example sentence', 'Each sentence is converted']

# Fetch the pretrained tokenizer and encoder from the HuggingFace Hub,
# caching the downloaded files under the relative "data" directory.
checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir="data")
model = AutoModel.from_pretrained(checkpoint, cache_dir="data")

# Tokenize as one padded/truncated batch of PyTorch tensors
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Forward pass without gradient tracking to obtain token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Mask-aware mean over tokens, then L2-normalise each sentence vector
pooled = mean_pooling(model_output, encoded_input['attention_mask'])
sentence_embeddings = F.normalize(pooled, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Sentence embeddings:
tensor([[ 6.7657e-02,  6.3496e-02,  4.8713e-02,  7.9305e-02,  3.7448e-02,
          2.6528e-03,  3.9375e-02, -7.0984e-03,  5.9361e-02,  3.1537e-02,
          6.0098e-02, -5.2905e-02,  4.0607e-02, -2.5931e-02,  2.9843e-02,
          1.1268e-03,  7.3515e-02, -5.0382e-02, -1.2239e-01,  2.3703e-02,
          2.9727e-02,  4.2477e-02,  2.5634e-02,  1.9952e-03, -5.6919e-02,
         -2.7160e-02, -3.2904e-02,  6.6025e-02,  1.1901e-01, -4.5879e-02,
         -7.2621e-02, -3.2584e-02,  5.2341e-02,  4.5055e-02,  8.2530e-03,
          3.6702e-02, -1.3941e-02,  6.5392e-02, -2.6427e-02,  2.0637e-04,
         -1.3664e-02, -3.6281e-02, -1.9504e-02, -2.8974e-02,  3.9427e-02,
         -8.8409e-02,  2.6243e-03,  1.3671e-02,  4.8306e-02, -3.1157e-02,
         -1.1733e-01, -5.1169e-02, -8.8529e-02, -2.1896e-02,  1.4299e-02,
          4.4417e-02, -1.3482e-02,  7.4339e-02,  2.6638e-02, -1.9876e-02,
          1.7919e-02, -1.0605e-02, -9.0426e-02,  2.1327e-02,  1.4120e-01,
         -6.4718e

In [21]:
# Display the raw encoder output; its repr lists last_hidden_state and pooler_output.
model_output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.0366, -0.0162,  0.1682,  ...,  0.0554, -0.1644, -0.2967],
         [ 0.7239,  0.6399,  0.1888,  ...,  0.5946,  0.6206,  0.4897],
         [ 0.0064,  0.0203,  0.0448,  ...,  0.3464,  1.3170, -0.1670],
         ...,
         [ 0.1479, -0.0643,  0.1457,  ...,  0.8837, -0.3316,  0.2975],
         [ 0.5212,  0.6563,  0.5607,  ..., -0.0399,  0.0412, -1.4036],
         [ 1.0824,  0.7140,  0.3986,  ..., -0.2301,  0.3243, -1.0313]],

        [[ 0.2802,  0.1165, -0.0418,  ...,  0.2711, -0.1685, -0.2961],
         [ 0.8729,  0.4545, -0.1091,  ...,  0.1365,  0.4580, -0.2042],
         [ 0.4752,  0.5731,  0.6304,  ...,  0.6526,  0.5612, -1.3268],
         ...,
         [ 0.6113,  0.7920, -0.4685,  ...,  0.0854,  1.0592, -0.2983],
         [ 0.4115,  1.0946,  0.2385,  ...,  0.8984,  0.3684, -0.7333],
         [ 0.1374,  0.5555,  0.2678,  ...,  0.5426,  0.4665, -0.5284]]]), pooler_output=tensor([[ 1.3429e-02,  4.0036e-02,  3.

In [23]:
# The encoder returns a transformers output object rather than a bare tensor.
type(model_output)

transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions

In [30]:
# Print the shape of every bias parameter. named_parameters() yields dotted
# names (e.g. "linear.bias"), so compare only the final component: an exact
# match against "bias" never succeeds and the loop would print nothing.
for name, param in model.named_parameters():
    if name.rsplit('.', 1)[-1] == 'bias':
        print(param.size())

In [28]:
# Inspect the output object's fields as a plain dict keyed by field name.
model_output.__dict__

{'last_hidden_state': tensor([[[ 0.0366, -0.0162,  0.1682,  ...,  0.0554, -0.1644, -0.2967],
          [ 0.7239,  0.6399,  0.1888,  ...,  0.5946,  0.6206,  0.4897],
          [ 0.0064,  0.0203,  0.0448,  ...,  0.3464,  1.3170, -0.1670],
          ...,
          [ 0.1479, -0.0643,  0.1457,  ...,  0.8837, -0.3316,  0.2975],
          [ 0.5212,  0.6563,  0.5607,  ..., -0.0399,  0.0412, -1.4036],
          [ 1.0824,  0.7140,  0.3986,  ..., -0.2301,  0.3243, -1.0313]],
 
         [[ 0.2802,  0.1165, -0.0418,  ...,  0.2711, -0.1685, -0.2961],
          [ 0.8729,  0.4545, -0.1091,  ...,  0.1365,  0.4580, -0.2042],
          [ 0.4752,  0.5731,  0.6304,  ...,  0.6526,  0.5612, -1.3268],
          ...,
          [ 0.6113,  0.7920, -0.4685,  ...,  0.0854,  1.0592, -0.2983],
          [ 0.4115,  1.0946,  0.2385,  ...,  0.8984,  0.3684, -0.7333],
          [ 0.1374,  0.5555,  0.2678,  ...,  0.5426,  0.4665, -0.5284]]]),
 'pooler_output': tensor([[ 1.3429e-02,  4.0036e-02,  3.0797e-03,  7.7095e-03, 

In [27]:
# Display the raw encoder output object again (same repr as the earlier display cell).
model_output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.0366, -0.0162,  0.1682,  ...,  0.0554, -0.1644, -0.2967],
         [ 0.7239,  0.6399,  0.1888,  ...,  0.5946,  0.6206,  0.4897],
         [ 0.0064,  0.0203,  0.0448,  ...,  0.3464,  1.3170, -0.1670],
         ...,
         [ 0.1479, -0.0643,  0.1457,  ...,  0.8837, -0.3316,  0.2975],
         [ 0.5212,  0.6563,  0.5607,  ..., -0.0399,  0.0412, -1.4036],
         [ 1.0824,  0.7140,  0.3986,  ..., -0.2301,  0.3243, -1.0313]],

        [[ 0.2802,  0.1165, -0.0418,  ...,  0.2711, -0.1685, -0.2961],
         [ 0.8729,  0.4545, -0.1091,  ...,  0.1365,  0.4580, -0.2042],
         [ 0.4752,  0.5731,  0.6304,  ...,  0.6526,  0.5612, -1.3268],
         ...,
         [ 0.6113,  0.7920, -0.4685,  ...,  0.0854,  1.0592, -0.2983],
         [ 0.4115,  1.0946,  0.2385,  ...,  0.8984,  0.3684, -0.7333],
         [ 0.1374,  0.5555,  0.2678,  ...,  0.5426,  0.4665, -0.5284]]]), pooler_output=tensor([[ 1.3429e-02,  4.0036e-02,  3.

In [31]:
# Integer index 0 yields the token-embedding tensor; for these two sentences its shape is (2, 7, 384).
model_output[0], model_output[0].shape

(tensor([[[ 0.0366, -0.0162,  0.1682,  ...,  0.0554, -0.1644, -0.2967],
          [ 0.7239,  0.6399,  0.1888,  ...,  0.5946,  0.6206,  0.4897],
          [ 0.0064,  0.0203,  0.0448,  ...,  0.3464,  1.3170, -0.1670],
          ...,
          [ 0.1479, -0.0643,  0.1457,  ...,  0.8837, -0.3316,  0.2975],
          [ 0.5212,  0.6563,  0.5607,  ..., -0.0399,  0.0412, -1.4036],
          [ 1.0824,  0.7140,  0.3986,  ..., -0.2301,  0.3243, -1.0313]],
 
         [[ 0.2802,  0.1165, -0.0418,  ...,  0.2711, -0.1685, -0.2961],
          [ 0.8729,  0.4545, -0.1091,  ...,  0.1365,  0.4580, -0.2042],
          [ 0.4752,  0.5731,  0.6304,  ...,  0.6526,  0.5612, -1.3268],
          ...,
          [ 0.6113,  0.7920, -0.4685,  ...,  0.0854,  1.0592, -0.2983],
          [ 0.4115,  1.0946,  0.2385,  ...,  0.8984,  0.3684, -0.7333],
          [ 0.1374,  0.5555,  0.2678,  ...,  0.5426,  0.4665, -0.5284]]]),
 torch.Size([2, 7, 384]))

In [26]:
# Attribute access to the token embeddings; the displayed values match model_output[0].
model_output.last_hidden_state

tensor([[[ 0.0366, -0.0162,  0.1682,  ...,  0.0554, -0.1644, -0.2967],
         [ 0.7239,  0.6399,  0.1888,  ...,  0.5946,  0.6206,  0.4897],
         [ 0.0064,  0.0203,  0.0448,  ...,  0.3464,  1.3170, -0.1670],
         ...,
         [ 0.1479, -0.0643,  0.1457,  ...,  0.8837, -0.3316,  0.2975],
         [ 0.5212,  0.6563,  0.5607,  ..., -0.0399,  0.0412, -1.4036],
         [ 1.0824,  0.7140,  0.3986,  ..., -0.2301,  0.3243, -1.0313]],

        [[ 0.2802,  0.1165, -0.0418,  ...,  0.2711, -0.1685, -0.2961],
         [ 0.8729,  0.4545, -0.1091,  ...,  0.1365,  0.4580, -0.2042],
         [ 0.4752,  0.5731,  0.6304,  ...,  0.6526,  0.5612, -1.3268],
         ...,
         [ 0.6113,  0.7920, -0.4685,  ...,  0.0854,  1.0592, -0.2983],
         [ 0.4115,  1.0946,  0.2385,  ...,  0.8984,  0.3684, -0.7333],
         [ 0.1374,  0.5555,  0.2678,  ...,  0.5426,  0.4665, -0.5284]]])