#Download and import necessary libraries

In [1]:
!pip install datasets torchtext torch==2.3.0 transformers

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting torchtext
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting torch==2.3.0
  Downloading torch-2.3.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.3.0)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.3.0)
  Downloading nvidia_c

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#import libraries
import collections
import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm
from torch.utils.data import random_split, DataLoader, Dataset
import pandas as pd
import math
from transformers import AutoTokenizer
import torch.nn.functional as F
import os
import json
from datasets import load_dataset

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Config

In [5]:
#text
max_token_length = 128

#model
d_model = 512
num_layer = 6
factor = 4
n_head = 8

#optim
learning_rate = 2e-5
weight_decay = 1e-3

#training
batch_size = 32
dropout = 0.1

#dim of feed forward layer
d_ff = 2048

seed = 1234

#Load dataset

In [55]:
#load dataset
file_path = "/content/drive/My Drive/Colab Notebooks/DL final"
df = pd.read_excel(file_path + "/final_final_result.xlsx")

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77510 entries, 0 to 77509
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   comment    77510 non-null  object
 1   sentiment  77492 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


#Data Preprocessing

##Comment field

In [37]:
df['comment'] = df['comment'].astype(str)

In [38]:
import re

def remove_tags(string):
    removelist = ''# Add any characters you'd like to keep
    # Remove HTML tags

    result = re.sub(r'<[^>]+>', '', string)

    # Convert to lowercase

    result = result.lower()

    # Remove URLs

    result = re.sub(r'https?://\S+', '', result)

    # Remove non-alphanumeric characters (except for those in the removelist)

    result = re.sub(r'[^a-zA-Z0-9' + removelist + r'\s]', ' ', result)

    return result


In [39]:
# Function to remove rows where comment is only numbers or whitespace
def remove_empty_or_numeric_rows(df):
    df['cleaned_comment'] = df['comment'].apply(remove_tags)
    df = df[df['cleaned_comment'].str.strip().str.match(r'^[0-9\s]+$') == False]
    df = df.drop(columns=['cleaned_comment'])

    return df

In [40]:
def clean_comments_column(df, column_name):
    df[column_name] = df[column_name].astype(str)
    cleaned_df = df[(df[column_name] != 'nan') & (df[column_name] != '')]

    return cleaned_df

In [41]:
# Apply Function to Remove HTML Tags in our Dataset Colum Review.

df['comment'] = df['comment'].apply(remove_tags)

In [42]:
df = remove_empty_or_numeric_rows(df)

In [43]:
df = df.loc[df['comment'].str.strip() != '']

In [44]:
df = clean_comments_column(df, 'comment')

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 115716 entries, 0 to 115718
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   comment    115716 non-null  object
 1   sentiment  115716 non-null  object
dtypes: object(2)
memory usage: 2.6+ MB


In [46]:
df = df.reset_index(drop=True)

##Sentiment field

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77510 entries, 0 to 77509
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   comment    77510 non-null  object
 1   sentiment  77492 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


In [58]:
df['sentiment'].dtype

dtype('O')

In [59]:
df['sentiment'].unique()

array(['neutral', 'positive', 'negative', nan, 'mixed'], dtype=object)

In [60]:
#delete rows have label which is unidentified and have comment unlabeled
df = df[(df['sentiment'] != 'mixed') & (df['sentiment'].notna())]

In [61]:
df['sentiment'].unique()

array(['neutral', 'positive', 'negative'], dtype=object)

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 77490 entries, 0 to 77509
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   comment    77490 non-null  object
 1   sentiment  77490 non-null  object
dtypes: object(2)
memory usage: 1.8+ MB


In [63]:
label_counts = df['sentiment'].value_counts()

print(label_counts)

sentiment
negative    38573
neutral     24635
positive    14282
Name: count, dtype: int64


## Augment dataset

In [64]:
yelp_dataset = load_dataset("yelp_review_full")

In [65]:
yelp_reviews = yelp_dataset['train'].to_pandas()

In [66]:
def map_to_sentiment(stars):
    if stars == 3:
        return 'neutral'
    elif stars > 3:
        return 'positive'
    elif stars < 3:
        return 'negative'

In [67]:
yelp_reviews['sentiment'] = yelp_reviews['label'].apply(map_to_sentiment)

In [68]:
yelp_reviews = yelp_reviews.rename(columns={'text': 'comment'})[['comment', 'sentiment']]

In [69]:
negative_count = df[df['sentiment'] == 'negative'].shape[0]
neutral_count = df[df['sentiment'] == 'neutral'].shape[0]
positive_count = df[df['sentiment'] == 'positive'].shape[0]

required_neutral = negative_count - neutral_count
required_positive = negative_count - positive_count

print(required_neutral)
print(required_positive)

13938
24291


In [70]:
neutral_samples = yelp_reviews[yelp_reviews['sentiment'] == 'neutral']
positive_samples = yelp_reviews[yelp_reviews['sentiment'] == 'positive']

In [71]:
neutral_to_add = neutral_samples.sample(required_neutral, random_state=42)
positive_to_add = positive_samples.sample(required_positive, random_state=42)

In [72]:
balanced_df = pd.concat([df, neutral_to_add, positive_to_add], ignore_index=True)

In [73]:
balanced_df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
neutral,38573
positive,38573
negative,38573


In [74]:
df = balanced_df

##Tokenize

In [121]:
#Train: 64%
#Validation: 16%
#Test: 20%
from sklearn.model_selection import train_test_split
temp_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['sentiment'])
train_df, val_df = train_test_split(temp_df, test_size=0.2, random_state=42, stratify=temp_df['sentiment'])


In [122]:
print("Training set size:", len(train_df))
print("Validation set size:", len(val_df))
print("Testing set size:", len(test_df))

Training set size: 74060
Validation set size: 18515
Testing set size: 23144


In [123]:
#save if needed
train_df.to_excel(file_path + "/Data/train.xlsx", index=False)
val_df.to_excel(file_path + "/Data/val.xlsx", index=False)
test_df.to_excel(file_path + "/Data/test.xlsx", index=False)

In [124]:
#load pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [125]:
#convert df splits to dictionary-like format
dataset = {
    "train": {"sentence": train_df["comment"].tolist(), "label": train_df["sentiment"].tolist()},
    "validation": {"sentence": val_df["comment"].tolist(), "label": val_df["sentiment"].tolist()},
    "test": {"sentence": test_df["comment"].tolist(), "label": test_df["sentiment"].tolist()},
}

In [126]:
print({key: dataset["train"][key][:5] for key in dataset["train"].keys()})

{'sentence': ['Great staff and burgers. You gotta get the double double, fries, and a black and white shake.', '&quot;Follow the math&quot; when bill is 42 and guy received 50 but somehow it means $7 change', "This is the best Mexican food in York County. Best queso dip I have ever had. Great flavors and service. I go at least once a week. I love how it's right across from Neighborhood.", 'Stayed here for my friends \\"Dirty Thirty\\" birthday weekend. We got one of the suites and it had a \\"sweet\\" view - hahaha! Excuse the lame joke... \\n\\nThe hotel is located at the far end of the strip next to Excalibur and New York New York. There is a tram that takes you over to New York New York but thats as far as it goes. The hotel offers a buffet which was actually pretty good. We took advantage of the Sunday Champagne Brunch. On our last day the hotel front desk was able to give us a late check out of 1:00 PM which was super helpful. Overall the space was clean, beds were comfortable and

In [127]:
def preprocess_function(examples):
    #tokenize the sentences
    tokenized_examples = tokenizer(examples["sentence"], padding='max_length', truncation=True, max_length=max_token_length)

    #map labels to numerical values
    sentiment_mapping = {"negative": 0, "neutral": 1, "positive": 2}
    tokenized_examples["label"] = [sentiment_mapping[label] for label in examples["label"]]
    return tokenized_examples

In [128]:
#tokenize train, validation, and test sets
tokenized_train = preprocess_function(dataset["train"])
tokenized_val = preprocess_function(dataset["validation"])
tokenized_test = preprocess_function(dataset["test"])

In [129]:
print(tokenized_train.keys())

dict_keys(['input_ids', 'attention_mask', 'label'])


In [130]:
print(tokenized_train["label"][:5])

[2, 1, 2, 1, 1]


In [131]:
#calculate vocab size and number of classes
tmp_data = np.array(tokenized_train["input_ids"])
vocab_size = np.max(tmp_data) + 1
tmp_data = None

tmp_data = np.array(dataset["train"]["label"])
num_classes = len(set(tmp_data))
tmp_data = None

In [132]:
#updata customdataset to work with tokenized data
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data["input_ids"]
        self.label = data["label"]
        self.length = len(self.label)

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        torch_data = torch.tensor(self.data[index], dtype=torch.int64)
        torch_label = torch.tensor(self.label[index], dtype=torch.int64)
        return (torch_data, torch_label)

In [133]:
#wrap tokenized data in CustomDataset
torch_dataset_train = CustomDataset(tokenized_train)
torch_dataset_val = CustomDataset(tokenized_val)
torch_dataset_test = CustomDataset(tokenized_test)

In [134]:
#dataLoaders
torch_train_loader = DataLoader(torch_dataset_train, batch_size=batch_size, shuffle=True)
torch_val_loader = DataLoader(torch_dataset_val, batch_size=batch_size, shuffle=True)
torch_test_loader = DataLoader(torch_dataset_test, batch_size=batch_size, shuffle=True)

In [135]:
print(f"Vocabulary size: {vocab_size}")
print(f"Number of classes: {num_classes}")

Vocabulary size: 30258
Number of classes: 3


In [136]:
#verify the first batch of data
for data, label in torch_train_loader:
    print(data.shape)
    print(label.shape)
    break

torch.Size([32, 128])
torch.Size([32])


#Model

In [137]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        #check if d_model must be divisible by n_head
        assert d_model % n_head == 0

        self.d_model = d_model
        self.n_head = n_head
        self.d_k = d_model // n_head

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)

        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output

    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.n_head, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

        output = self.W_o(self.combine_heads(attn_output))
        return output

In [138]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [139]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]


In [140]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_head, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, n_head)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [141]:
class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, d_model, n_head, num_layers, d_ff, max_seq_length, dropout):
        super(TransformerEncoder, self).__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_head = n_head
        self.num_layers = num_layers
        self.d_ff = d_ff
        self.max_seq_length = max_seq_length
        self.dropout = nn.Dropout(dropout)

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.encoder = nn.ModuleList([EncoderLayer(d_model, n_head, d_ff, dropout) for _ in range(num_layers)])

        self.positional_embedding = PositionalEncoding(d_model, max_seq_length)

    def masking(self, x):
        x_mask = (x != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2).to(x.device)
        return x_mask

    def forward(self, x):
        x_mask = self.masking(x)

        x = self.embedding(x) * math.sqrt(self.d_model)
        x = self.dropout(self.positional_embedding(x))

        for layer in self.encoder:
          x = layer(x, x_mask)

        x = x.reshape(x.shape[0], -1)

        return x

x = torch.randint(size=(32, 10), low=0, high=1000)

net = TransformerEncoder(vocab_size=1000, d_model=512, n_head=8, num_layers=4, d_ff=2048, max_seq_length=10, dropout=0.1)

print(net(x).shape)

torch.Size([32, 5120])


In [142]:
class TransformerEncoderClassification(nn.Module):
    def __init__(self, vocab_size=1000, d_model=512, n_head=8, num_layers=4, d_ff=2048, max_seq_length=64, dropout=0.1, num_classes=3):
        super(TransformerEncoderClassification, self).__init__()
        self.transformers_encoder = TransformerEncoder(vocab_size, d_model, n_head, num_layers, d_ff, max_seq_length, dropout)
        self.fc1 = nn.Linear(max_seq_length * d_model, d_model)
        self.fc2 = nn.Linear(d_model, 128)
        self.fc3 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.transformers_encoder(x)

        print(x.shape)

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

x =  torch.randint(size=(32, 10), low=1, high=100)
net = TransformerEncoderClassification(vocab_size=1000, d_model=512, n_head=8, num_layers=4, d_ff=2048, max_seq_length=10, dropout=0.1, num_classes=3)
a = net(x)
print(a.shape)

torch.Size([32, 5120])
torch.Size([32, 3])


#Train

In [143]:
BASE_PATH = "/content/drive/My Drive/Colab Notebooks/DL final"

In [144]:
MODEL_SAVE_PATH = os.path.join(BASE_PATH, "encoder_attention.pt")
JSON_SAVE_PATH = os.path.join(BASE_PATH, "encoder_attention.json")

In [145]:
def init_model():
    model = TransformerEncoderClassification(vocab_size=vocab_size,
                                             d_model=d_model,
                                             n_head=n_head,
                                             num_layers=num_layer,
                                             d_ff=d_ff,
                                             max_seq_length=max_token_length,
                                             dropout=dropout,
                                             num_classes=num_classes).to(device=device)

    criterion = nn.CrossEntropyLoss().to(device=device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    return model, criterion, optimizer


In [146]:
# Save the model and optimizer state
def save_model(model, optimizer, epoch, path):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, path)

In [147]:
# Load the model and optimizer state
def load(model, optimizer, path):
    checkpoint = torch.load(path, map_location=torch.device(device))
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    epoch = checkpoint["epoch"]
    return model, optimizer, epoch


In [148]:
# Evaluate the model
def summary(loader, model, criterion):
    num_correct = 0
    num_samples = 0
    total_loss = 0
    loss_epoch = 0

    model.eval()

    with torch.no_grad():
        for index, (data, label) in enumerate(loader):
            data = data.to(device=device)
            label = label.to(device=device)

            prob = model(data)
            pred = torch.argmax(prob, dim=1)

            num_correct += (pred == label).sum().item()
            num_samples += pred.shape[0]

            loss = criterion(prob, label)
            loss_epoch += loss.item()

    acc = (num_correct / num_samples) * 100.0
    loss_avg = loss_epoch / len(loader)
    return acc, loss_avg

In [149]:
# Main training loop
def train(train_loader, val_loader, num_epochs, batch_print=50):
    import os
    import matplotlib.pyplot as plt

    train_acc_list = []
    train_loss_list = []
    val_acc_list = []
    val_loss_list = []

    cur_epoch = -1

    # Initialize model, criterion, and optimizer
    model, criterion, optimizer = init_model()

    # Load saved model if exists
    if os.path.exists(MODEL_SAVE_PATH):
        model, optimizer, cur_epoch = load(model, optimizer, path=MODEL_SAVE_PATH)

    for epoch in range(num_epochs):
        # Skip already completed epochs
        if cur_epoch >= epoch:
            continue

        correct_samples = 0
        total_samples = 0
        loss_epoch = 0

        print(f"Epoch {epoch + 1}/{num_epochs}")
        print("----------------------------------------")

        model.train()

        for batch_idx, (data, label) in enumerate(train_loader):
            data = data.to(device=device)
            label = label.to(device=device)

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass
            prob = model(data)
            pred = torch.argmax(prob, dim=1)

            # Update metrics
            correct_samples += (pred == label).sum().item()
            total_samples += pred.size(0)

            # Compute loss
            loss = criterion(prob, label)
            loss.backward()

            # Clip gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)

            # Step optimizer
            optimizer.step()

            # Accumulate batch loss
            loss_epoch += loss.item()

            # Print metrics every `batch_print` batches
            if (batch_idx + 1) % batch_print == 0:
                batch_accuracy = (correct_samples / total_samples) * 100.0
                print(f"Batch {batch_idx + 1}/{len(train_loader)}: "
                      f"Accuracy: {batch_accuracy:.2f}% | Loss: {loss.item():.4f}")

        # Validation summary after each epoch
        val_acc, val_loss = summary(val_loader, model, criterion)

        # Record metrics
        train_acc_list.append((correct_samples / total_samples) * 100.0)
        train_loss_list.append(loss_epoch / len(train_loader))
        val_acc_list.append(val_acc)
        val_loss_list.append(val_loss)

        # Save model
        save_model(model=model, optimizer=optimizer, epoch=epoch, path=MODEL_SAVE_PATH)

    # Plotting training and validation curves
    plt.figure(figsize=(12, 6))

    # Loss vs Epochs
    plt.subplot(1, 2, 1)
    plt.plot(range(1, num_epochs + 1), train_loss_list, label='Train Loss')
    plt.plot(range(1, num_epochs + 1), val_loss_list, label='Validation Loss')
    plt.title('Loss vs Epochs')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    # Accuracy vs Epochs
    plt.subplot(1, 2, 2)
    plt.plot(range(1, num_epochs + 1), train_acc_list, label='Train Accuracy')
    plt.plot(range(1, num_epochs + 1), val_acc_list, label='Validation Accuracy')
    plt.title('Accuracy vs Epochs')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy (%)')
    plt.legend()

    plt.tight_layout()
    plt.show()


In [None]:
train(torch_train_loader, torch_val_loader, num_epochs=5, batch_print=50)

Epoch 1/5
----------------------------------------
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([32, 65536])
torch.Size([3