In [1]:
from transformers import T5Tokenizer, T5Model
import torch
from torch import nn

# Inisialisasi tokenizer dan model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5Model.from_pretrained('t5-small')

# Token ID yang ingin dicek
token_ids = torch.tensor([510, 511, 511, 508, 510, 508, 508, 510, 511, 511]).unsqueeze(0)  # Tambahkan batch dimension

# Dapatkan embedding
with torch.no_grad():
    outputs = model.encoder.embed_tokens(token_ids)
    print("Embedding outputs:")
    print(outputs)  # Menampilkan embedding untuk token IDs

print("=====================================")
# Proyeksi ke dimensi 4
linear_projection = nn.Linear(outputs.size(-1), 4)
projected_embeddings = linear_projection(outputs)

print("Projected embeddings:")
print(projected_embeddings)  # Embedding dengan dimensi 4


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Embedding outputs:
tensor([[[ -9.0625,  -3.7656,  -2.5156,  ...,  -7.6562,  15.9375,  20.1250],
         [ 11.6875,   3.5781,  18.8750,  ..., -11.2500,  -9.8125,   3.2344],
         [ 11.6875,   3.5781,  18.8750,  ..., -11.2500,  -9.8125,   3.2344],
         ...,
         [ -9.0625,  -3.7656,  -2.5156,  ...,  -7.6562,  15.9375,  20.1250],
         [ 11.6875,   3.5781,  18.8750,  ..., -11.2500,  -9.8125,   3.2344],
         [ 11.6875,   3.5781,  18.8750,  ..., -11.2500,  -9.8125,   3.2344]]])
Projected embeddings:
tensor([[[16.4911, -7.7782, -7.2378,  6.0241],
         [ 0.3229,  0.2022, -3.6999, -7.6459],
         [ 0.3229,  0.2022, -3.6999, -7.6459],
         [ 6.4316,  0.1444, -4.9361, -2.1015],
         [16.4911, -7.7782, -7.2378,  6.0241],
         [ 6.4316,  0.1444, -4.9361, -2.1015],
         [ 6.4316,  0.1444, -4.9361, -2.1015],
         [16.4911, -7.7782, -7.2378,  6.0241],
         [ 0.3229,  0.2022, -3.6999, -7.6459],
         [ 0.3229,  0.2022, -3.6999, -7.6459]]], grad_fn=<

In [3]:
# Fungsi Positional Encoding dengan dimensi 4

import numpy as np


def positional_encoding(position, d_model):
    angle_rads = np.arange(position)[:, np.newaxis] / np.power(10000, (2 * (np.arange(d_model)[np.newaxis, :] // 2)) / np.float32(d_model))
    sines = np.sin(angle_rads[:, 0::2])
    cosines = np.cos(angle_rads[:, 1::2])
    pos_encoding = np.concatenate([sines, cosines], axis=-1)
    return torch.tensor(pos_encoding[:, :d_model], dtype=torch.float32)  # Hanya ambil dimensi 4

# Contoh penggunaan
seq_length = token_ids.shape[1]
pos_enc = positional_encoding(position=seq_length, d_model=4)

print("Positional encoding (dim 4):")
print(pos_enc)

Positional encoding (dim 4):
tensor([[ 0.0000,  0.0000,  1.0000,  1.0000],
        [ 0.8415,  0.0100,  0.5403,  0.9999],
        [ 0.9093,  0.0200, -0.4161,  0.9998],
        [ 0.1411,  0.0300, -0.9900,  0.9996],
        [-0.7568,  0.0400, -0.6536,  0.9992],
        [-0.9589,  0.0500,  0.2837,  0.9988],
        [-0.2794,  0.0600,  0.9602,  0.9982],
        [ 0.6570,  0.0699,  0.7539,  0.9976],
        [ 0.9894,  0.0799, -0.1455,  0.9968],
        [ 0.4121,  0.0899, -0.9111,  0.9960]])


In [4]:
import torch
import torch.nn.functional as F

# Tambahkan Positional Encoding ke Projected Embeddings
projected_embeddings_with_pos = projected_embeddings + pos_enc.unsqueeze(0)

batch_size, seq_length, d_model = projected_embeddings_with_pos.size()

# Inisialisasi bobot untuk Q, K, V (dimensi 4x4)
w_q = nn.Linear(d_model, d_model, bias=False)
w_k = nn.Linear(d_model, d_model, bias=False)
w_v = nn.Linear(d_model, d_model, bias=False)

# Hitung Q, K, V
Q = w_q(projected_embeddings_with_pos)
K = w_k(projected_embeddings_with_pos)
V = w_v(projected_embeddings_with_pos)

# Scaled Dot-Product Attention
def scaled_dot_product_attention(Q, K, V):
    d_k = Q.size(-1)
    scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
    attention_weights = F.softmax(scores, dim=-1)
    output = torch.matmul(attention_weights, V)
    return output, attention_weights

# Hitung Self-Attention
attention_output, attention_weights = scaled_dot_product_attention(Q, K, V)

print("Attention Output:")
print(attention_output)

print("Attention Weights:")
print(attention_weights)


Attention Output:
tensor([[[-1.0876,  1.6066, -2.7136, -0.2860],
         [-0.9635,  1.7421, -2.6983, -0.0204],
         [-1.0080,  1.6875, -2.7078, -0.1422],
         [-1.0568,  1.6246, -2.7222, -0.2939],
         [-1.1092,  1.5945, -2.7068, -0.2760],
         [-1.0172,  1.6491, -2.7314, -0.2973],
         [-1.0090,  1.6545, -2.7333, -0.2973],
         [-1.0970,  1.6014, -2.7107, -0.2815],
         [-0.9976,  1.6998, -2.7058, -0.1156],
         [-1.0151,  1.6806, -2.7077, -0.1522]]], grad_fn=<UnsafeViewBackward0>)
Attention Weights:
tensor([[[2.8517e-28, 6.6965e-02, 1.3702e-01, 8.8998e-08, 6.0971e-27,
          2.8756e-07, 4.1631e-08, 1.1199e-28, 1.0769e-01, 6.8833e-01],
         [3.3426e-06, 1.0159e-01, 1.7548e-01, 2.3967e-02, 3.0431e-05,
          4.9891e-02, 1.1903e-02, 1.5449e-06, 1.3139e-01, 5.0574e-01],
         [3.2668e-07, 9.3034e-02, 1.7083e-01, 1.2669e-02, 3.9210e-06,
          2.9606e-02, 5.8738e-03, 1.3566e-07, 1.2350e-01, 5.6448e-01],
         [1.2360e-13, 8.5434e-02, 1.6

In [5]:
import torch
import torch.nn.functional as F

# Contoh input manual untuk projected_embeddings (batch_size=1, seq_length=10, d_model=4)
manual_projected_embeddings = torch.tensor([[
    [-0.54163,	0.05354,	0.87951,	-0.21727],
    [0.64158,	0.73192,	-0.20439,	0.38998],
    [0.64158,	0.73192,	-0.20439,	0.38998],
    [-0.37968,	0.56734,	0.96329,	0.34976],
    [-0.54163,	0.05354,	0.87951,	-0.21727],
    [-0.37968,	0.56734,	0.96329,	0.34976],
    [-0.37968,	0.56734,	0.96329,	0.34976],
    [-0.54163,	0.05354,	0.87951,	-0.21727]
]], dtype=torch.float32)

# Contoh input manual untuk pos_enc (seq_length=10, d_model=4)
manual_pos_enc = torch.tensor([
    [0.0, 1.0, 0.0, 1.0],
    [0.8415, 0.5403, 0.01, 0.99995],
    [0.9093, -0.4161, 0.02, 0.9998],
    [0.1411, -0.99, 0.03, 0.9995],
    [-0.7568, -0.6536, 0.04, 0.9992],
    [-0.9589, 0.2837, 0.05, 0.99875],
    [-0.2794, 0.9602, 0.06, 0.9982],
    [0.657, 0.7539, 0.07, 0.99755]
], dtype=torch.float32)

# Tambahkan Positional Encoding ke Projected Embeddings
projected_embeddings_with_pos = manual_projected_embeddings + manual_pos_enc.unsqueeze(0)

# Inisialisasi bobot untuk Q, K, V (d_model=4)
w_q = torch.nn.Linear(4, 4, bias=False)
w_k = torch.nn.Linear(4, 4, bias=False)
w_v = torch.nn.Linear(4, 4, bias=False)

# Random inisialisasi bobot (untuk mereplikasi perhitungan manual, tetapkan nilai spesifik)
torch.manual_seed(0)
w_q.weight.data.uniform_(-0.1, 0.1)
w_k.weight.data.uniform_(-0.1, 0.1)
w_v.weight.data.uniform_(-0.1, 0.1)

# Hitung Q, K, V
Q = w_q(projected_embeddings_with_pos)
K = w_k(projected_embeddings_with_pos)
V = w_v(projected_embeddings_with_pos)

# Scaled Dot-Product Attention
def scaled_dot_product_attention(Q, K, V):
    d_k = Q.size(-1)
    scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
    attention_weights = F.softmax(scores, dim=-1)
    output = torch.matmul(attention_weights, V)
    return output, attention_weights

# Hitung Self-Attention
attention_output, attention_weights = scaled_dot_product_attention(Q, K, V)

print("Attention Output:")
print(attention_output)

print("Attention Weights:")
print(attention_weights)


Attention Output:
tensor([[[-0.1749,  0.1441, -0.1272,  0.0869],
         [-0.1751,  0.1439, -0.1273,  0.0864],
         [-0.1749,  0.1439, -0.1271,  0.0868],
         [-0.1745,  0.1442, -0.1270,  0.0875],
         [-0.1744,  0.1443, -0.1270,  0.0877],
         [-0.1749,  0.1442, -0.1272,  0.0871],
         [-0.1750,  0.1441, -0.1273,  0.0867],
         [-0.1748,  0.1440, -0.1271,  0.0870]]], grad_fn=<UnsafeViewBackward0>)
Attention Weights:
tensor([[[0.1249, 0.1256, 0.1253, 0.1248, 0.1241, 0.1249, 0.1254, 0.1250],
         [0.1252, 0.1265, 0.1259, 0.1242, 0.1233, 0.1240, 0.1252, 0.1258],
         [0.1250, 0.1257, 0.1255, 0.1248, 0.1242, 0.1244, 0.1250, 0.1254],
         [0.1245, 0.1245, 0.1247, 0.1257, 0.1252, 0.1257, 0.1253, 0.1244],
         [0.1244, 0.1242, 0.1245, 0.1258, 0.1255, 0.1261, 0.1254, 0.1241],
         [0.1246, 0.1254, 0.1251, 0.1250, 0.1241, 0.1255, 0.1257, 0.1245],
         [0.1249, 0.1260, 0.1255, 0.1246, 0.1235, 0.1249, 0.1256, 0.1251],
         [0.1249, 0.1254, 0.1

In [6]:
import torch
import torch.nn.functional as F

# Contoh nilai manual untuk Q, K, dan V (batch_size=1, seq_length=10, d_model=4)
manual_Q = torch.tensor([[
    [-0.8271741358,	-0.4473737201	,-0.1990115814	,0.9793545112],
    [-2.100874688,	-2.485958248	,-0.565244542	,1.721133579],
    [-1.230773659,	-1.938179401	,-0.9498562648	,1.496905623],
    [0.5144164915,	0.1266056896	,-1.04429923	,1.116860739],
    [1.317814715	 ,   1.362316101	,-0.2696524529	,0.2781227762],
    [0.0940285007,	0.3925437502	,0.1478104924	,1.056014863],
    [-1.063412928,	-0.7184953314	,-0.108434038	,1.490215993],
    [-1.09268619	 ,   -0.9507493102	,-0.8114003037	,1.185122719]
]], dtype=torch.float32)

manual_K = torch.tensor([[
    [0.0377749364	,-0.1641118435	,0.3524523188	,1.074776842 ],
    [2.17950715	    ,-0.7541049303	,-0.0753102742	,0.9171164008],
    [2.029081955	    ,-1.06142199	,0.8538343315	,0.3785391614],
    [-0.1718228584	,-0.5201197879	,2.255573135	,0.5279247974],
    [-0.773297439	,-0.2821082053	,2.240955121	,0.1969217446],
    [-0.6138309323	,0.3659621834	,1.344578588	,1.309497533],
    [-0.0784432281	,0.2364181243	,0.4556403745	,1.660645486],
    [0.3330865066	,-0.5524778762	,0.3919968475	,0.931139843]
]], dtype=torch.float32)

manual_V = torch.tensor([[
    [-1.194609282	,0.4435478073	,0.0269536717	,-0.7814393057],
    [0.6912948894	,0.4351609871	,-0.6019309471	,0.1599279749],
    [0.5374930809	,-0.4033286063	,-0.8513755243	,0.3570918432],
    [-1.562628739	,-0.7370527331	,-0.5455516675	,-0.0945005508],
    [-2.097548046	,-0.3853457318	,0.2409020575	,-0.9469289598],
    [-2.097679945	,1.090307844	,0.5265317096	,-0.9391367871],
    [-1.479237209	,1.134933923	,0.1383826489	,-0.6422500574],
    [-0.8159628809	,-0.3007714649	,-0.5737610414	,-0.3254933775]
]], dtype=torch.float32)

# Scaled Dot-Product Attention
def scaled_dot_product_attention(Q, K, V):
    d_k = Q.size(-1)
    scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
    attention_weights = F.softmax(scores, dim=-1)
    output = torch.matmul(attention_weights, V)
    return output, attention_weights

# Hitung Attention dengan input manual
attention_output, attention_weights = scaled_dot_product_attention(manual_Q, manual_K, manual_V)

print("Attention Output:")
print(attention_output)

print("Attention Weights:")
print(attention_weights)


Attention Output:
tensor([[[-1.2809,  0.3143, -0.0739, -0.5383],
         [-1.3620,  0.2744, -0.0682, -0.5714],
         [-1.0980,  0.3086, -0.1497, -0.4849],
         [-0.5932,  0.3605, -0.2805, -0.2971],
         [-0.4532,  0.2758, -0.3395, -0.2153],
         [-1.1137,  0.3325, -0.1227, -0.4674],
         [-1.3515,  0.3571, -0.0406, -0.5681],
         [-1.1968,  0.3628, -0.0942, -0.5277]]])
Attention Weights:
tensor([[[0.1496, 0.0680, 0.0543, 0.1119, 0.1159, 0.1769, 0.1893, 0.1341],
         [0.1683, 0.0364, 0.0302, 0.1191, 0.1258, 0.1596, 0.1858, 0.1747],
         [0.1826, 0.0943, 0.0599, 0.0789, 0.0713, 0.1214, 0.1964, 0.1955],
         [0.1423, 0.2723, 0.1171, 0.0360, 0.0262, 0.0845, 0.1862, 0.1354],
         [0.1056, 0.3003, 0.1806, 0.0517, 0.0392, 0.0891, 0.1375, 0.0960],
         [0.1302, 0.1144, 0.0862, 0.1037, 0.0886, 0.1707, 0.1924, 0.1138],
         [0.1477, 0.0532, 0.0410, 0.1126, 0.1113, 0.1948, 0.2093, 0.1301],
         [0.1787, 0.0795, 0.0498, 0.0793, 0.0813, 0.1524, 0.

In [15]:
# Embedding untuk token IDs & positional encoding

# Contoh input manual untuk projected_embeddings (batch_size=1, seq_length=10, d_model=4)
manual_projected_embeddings = torch.tensor([[
    [-0.54163,	0.05354,	0.87951,	-0.21727],
    [0.64158,	0.73192,	-0.20439,	0.38998],
    [0.64158,	0.73192,	-0.20439,	0.38998],
    [-0.37968,	0.56734,	0.96329,	0.34976],
    [-0.54163,	0.05354,	0.87951,	-0.21727],
    [-0.37968,	0.56734,	0.96329,	0.34976],
    [-0.37968,	0.56734,	0.96329,	0.34976],
    [-0.54163,	0.05354,	0.87951,	-0.21727]
]], dtype=torch.float32)

# Contoh input manual untuk pos_enc (seq_length=10, d_model=4)
manual_pos_enc = torch.tensor([
    [0.0, 1.0, 0.0, 1.0],
    [0.8415, 0.5403, 0.01, 0.99995],
    [0.9093, -0.4161, 0.02, 0.9998],
    [0.1411, -0.99, 0.03, 0.9995],
    [-0.7568, -0.6536, 0.04, 0.9992],
    [-0.9589, 0.2837, 0.05, 0.99875],
    [-0.2794, 0.9602, 0.06, 0.9982],
    [0.657, 0.7539, 0.07, 0.99755]
], dtype=torch.float32)

# Tambahkan Positional Encoding ke Projected Embeddings
projected_embeddings_with_pos = manual_projected_embeddings + manual_pos_enc.unsqueeze(0)

print("Projected Embeddings with Positional Encoding:", projected_embeddings_with_pos)

Projected Embeddings with Positional Encoding: tensor([[[-0.5416,  1.0535,  0.8795,  0.7827],
         [ 1.4831,  1.2722, -0.1944,  1.3899],
         [ 1.5509,  0.3158, -0.1844,  1.3898],
         [-0.2386, -0.4227,  0.9933,  1.3493],
         [-1.2984, -0.6001,  0.9195,  0.7819],
         [-1.3386,  0.8510,  1.0133,  1.3485],
         [-0.6591,  1.5275,  1.0233,  1.3480],
         [ 0.1154,  0.8074,  0.9495,  0.7803]]])


In [16]:
import torch
import torch.nn.functional as F


# Inisialisasi bobot dan bias untuk Q, K, V secara manual (d_model=4)
manual_weights_q = torch.tensor([
    [-0.73925	,-0.96466	,-0.23601	,0.29734 ],
    [-0.99099	,-0.64466	,-0.1603	,-0.10428],
    [-0.70426	,0.34239    ,-0.93681	,-0.03499],
    [0.36942	    ,0.265	    ,0.4271	    ,0.64976]
], dtype=torch.float32)



manual_bias_q = torch.tensor([-0.23643,-0.08234,-0.08986,0.01603], dtype=torch.float32)

manual_weights_k = torch.tensor([
    [0.60899	    ,0.19185	,-0.82426	,-0.03031],
    [-0.47491	,0.28685	,-0.06882	,0.38688],
    [-0.32036	,-0.99314	,0.11239	,0.92019],
    [-0.05038	,0.56384	,0.41918	,0.44664]
], dtype=torch.float32)

manual_bias_k = torch.tensor([0.91417,-0.96584,0.40614,-0.26481], dtype=torch.float32)

manual_weights_v = torch.tensor([
    [0.7139	,   0.2054	    ,-0.58277	,-0.37923],
    [-0.73308,	0.81554	    ,-0.86505	,0.60696],
    [-0.75809,	0.19903	    ,-0.7651    ,0.06726],
    [0.59016	,   -0.15989	,0.42657    ,0.3945]
], dtype=torch.float32)

manual_bias_v = torch.tensor([-0.21495,-0.52698,0.02693,-0.9773], dtype=torch.float32)

# Buat fungsi untuk menghitung Q, K, V secara manual
def linear_transform(x, weight, bias):
    return torch.matmul(x, weight.transpose(-2, -1)) + bias

# Hitung Q, K, V dengan nilai manual
Q = linear_transform(projected_embeddings_with_pos, manual_weights_q, manual_bias_q)
K = linear_transform(projected_embeddings_with_pos, manual_weights_k, manual_bias_k)
V = linear_transform(projected_embeddings_with_pos, manual_weights_v, manual_bias_v)

# Scaled Dot-Product Attention
def scaled_dot_product_attention(Q, K, V):
    d_k = Q.size(-1)
    scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
    attention_weights = F.softmax(scores, dim=-1)
    output = torch.matmul(attention_weights, V)
    return output, attention_weights

# Hitung Self-Attention
attention_output, attention_weights = scaled_dot_product_attention(Q, K, V)

print("Attention Output:")
print(attention_output)

print("Attention Weights:")
print(attention_weights)


Attention Output:
tensor([[[-1.2809,  0.3143, -0.0739, -0.5383],
         [-1.3620,  0.2744, -0.0683, -0.5714],
         [-1.0981,  0.3086, -0.1497, -0.4849],
         [-0.5932,  0.3605, -0.2805, -0.2971],
         [-0.4533,  0.2758, -0.3396, -0.2153],
         [-1.1137,  0.3325, -0.1227, -0.4674],
         [-1.3515,  0.3571, -0.0406, -0.5681],
         [-1.1968,  0.3628, -0.0942, -0.5277]]])
Attention Weights:
tensor([[[0.1496, 0.0680, 0.0543, 0.1119, 0.1159, 0.1769, 0.1893, 0.1341],
         [0.1683, 0.0364, 0.0302, 0.1191, 0.1258, 0.1596, 0.1858, 0.1748],
         [0.1825, 0.0942, 0.0599, 0.0789, 0.0713, 0.1214, 0.1964, 0.1955],
         [0.1423, 0.2723, 0.1171, 0.0360, 0.0262, 0.0845, 0.1862, 0.1354],
         [0.1056, 0.3003, 0.1806, 0.0517, 0.0392, 0.0891, 0.1375, 0.0960],
         [0.1302, 0.1144, 0.0862, 0.1037, 0.0886, 0.1707, 0.1924, 0.1138],
         [0.1477, 0.0532, 0.0410, 0.1126, 0.1113, 0.1948, 0.2093, 0.1301],
         [0.1787, 0.0795, 0.0498, 0.0793, 0.0813, 0.1524, 0.

In [21]:
import torch
import torch.nn.functional as F

# 1. Residual Connection
residual_output = attention_output + projected_embeddings_with_pos
print("Residual Connection Output:")
print(residual_output)

# 2. Layer Normalization
# Menggunakan LayerNorm dengan epsilon kecil seperti pada perhitungan manual
# Pastikan normalized_shape adalah (4,) karena kita ingin menghitung rata-rata per fitur
layer_norm = torch.nn.LayerNorm(normalized_shape=(4,), eps=1e-10)

# Apply Layer Normalization
normalized_output = layer_norm(residual_output)
print("Layer Normalization Output:")
print(normalized_output)


Residual Connection Output:
tensor([[[-1.8225,  1.3678,  0.8056,  0.2444],
         [ 0.1211,  1.5466, -0.2627,  0.8185],
         [ 0.4528,  0.6244, -0.3341,  0.9049],
         [-0.8318, -0.0621,  0.7128,  1.0522],
         [-1.7517, -0.3243,  0.5800,  0.5667],
         [-2.4523,  1.1835,  0.8906,  0.8812],
         [-2.0106,  1.8847,  0.9827,  0.7799],
         [-1.0814,  1.1703,  0.8553,  0.2525]]])
Layer Normalization Output:
tensor([[[-1.6353,  1.0112,  0.5448,  0.0793],
         [-0.6293,  1.4339, -1.1847,  0.3801],
         [ 0.0887,  0.4618, -1.6220,  1.0715],
         [-1.4413, -0.3843,  0.6798,  1.1458],
         [-1.5982, -0.0967,  0.8544,  0.8405],
         [-1.7263,  0.7083,  0.5122,  0.5058],
         [-1.6601,  1.0123,  0.3934,  0.2543],
         [-1.6005,  1.0099,  0.6447, -0.0541]]],
       grad_fn=<NativeLayerNormBackward0>)


In [1]:
import torch
import torch.nn.functional as F
from torch import nn

# Data input dari Layer Normalization dengan dimensi 4 (contoh data dari gambar)
normalized_output = torch.tensor([
    [-1.6353,  1.0112,  0.5448,  0.0793],
    [-0.6293,  1.4339, -1.1847,  0.3801],
    [ 0.0887,  0.4618, -1.6220,  1.0715],
    [-1.4413, -0.3843,  0.6798,  1.1458],
    [-1.5982, -0.0967,  0.8544,  0.8405],
    [-1.7263,  0.7083,  0.5122,  0.5058],
    [-1.6601,  1.0123,  0.3934,  0.2543],
    [-1.6005,  1.0099,  0.6447, -0.0541]
], dtype=torch.float32)

# Inisialisasi Linear Layer dengan dimensi sesuai dari gambar
linear1 = nn.Linear(in_features=4, out_features=3, bias=True)  # Input 4, output 3
linear2 = nn.Linear(in_features=3, out_features=4, bias=True)  # Input 3, output 4

# Menetapkan bobot dan bias untuk Linear Layer 1 dari gambar
manual_weights1 = torch.tensor([
    [-0.28604, -0.29261, -0.2221, 0.30325],
    [-0.261, 0.04918, 0.18259, -0.23019],
    [0.19887, -0.38437, -0.32183, -0.47807]
], dtype=torch.float32)

manual_bias1 = torch.tensor([-0.36804, -0.39214, 0.07934], dtype=torch.float32)

linear1.weight.data = manual_weights1
linear1.bias.data = manual_bias1

# Menetapkan bobot dan bias untuk Linear Layer 2 dari gambar
manual_weights2 = torch.tensor([
    [0.19303	,0.11128	,   -0.47795],
    [-0.20505,	-0.10522,	0.44491],
    [-0.38888,	-0.41619,	0.09184],
    [0.48059	,-0.18764	,0.12954]
], dtype=torch.float32)

manual_bias2 = torch.tensor([-0.43819, 0.3698, -0.30859, -0.0521], dtype=torch.float32)

linear2.weight.data = manual_weights2
linear2.bias.data = manual_bias2

# Definisikan Feed Forward Network (FFN) dengan nn.Linear()
def feed_forward_network(x):
    x = F.relu(linear1(x))  # Linear Layer 1 + ReLU
    x = linear2(x)          # Linear Layer 2
    return x

# Mengaplikasikan FFN ke normalized_output
ffn_output = feed_forward_network(normalized_output)

print("FFN Output (Final Output):")
print(ffn_output)


FFN Output (Final Output):
tensor([[-0.4198,  0.3524, -0.3775, -0.0832],
        [-0.4382,  0.3698, -0.3086, -0.0521],
        [-0.4080,  0.3377, -0.3695,  0.0232],
        [-0.3700,  0.2974, -0.4459,  0.1176],
        [-0.4030,  0.3324, -0.3796,  0.0356],
        [-0.4304,  0.3624, -0.3379, -0.0653],
        [-0.4266,  0.3588, -0.3520, -0.0717],
        [-0.4153,  0.3482, -0.3941, -0.0906]], grad_fn=<AddmmBackward0>)
