In [1]:
import pandas as pd
import re
import math
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np

from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

import sentencepiece as spm

from tqdm import tqdm
import random

file = "amazon_review.csv"

data = pd.read_csv(file).dropna(ignore_index=True)
data['overall'] = data["overall"] - 1

vocab_size = 16000
seq_len = 512
pad_token = 16008

spm.SentencePieceTrainer.train(
    input='amazon_reviews.txt',
    model_prefix='amazon_reviews',
    vocab_size=vocab_size,
    model_type='unigram',
    character_coverage=1.0
)

tok = spm.SentencePieceProcessor(model_file='amazon_reviews.model')

filter_ = 1

ls  = [len(tok.encode(i, out_type=int)) for i in data["reviewText"]]

data["lengths"] = ls

data_trunc = data[data["lengths"]<=30]

v = data_trunc["overall"].value_counts()

balanced_data = (
    data_trunc.groupby("overall")
      .sample(n=min(v), random_state=42)
      .reset_index(drop=True)
)

len(balanced_data)

df_shuffled = balanced_data.sample(frac=filter_, random_state=42).reset_index(drop=True)

# Split the shuffled DataFrame
train_size = 0.8
train_df = df_shuffled.sample(frac=train_size, random_state=42).reset_index(drop=True)
test_df = df_shuffled.drop(train_df.index).reset_index(drop=True)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: amazon_reviews.txt
  input_format: 
  model_prefix: amazon_reviews
  model_type: UNIGRAM
  vocab_size: 16000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0

In [2]:
print("len train df: ", len(train_df))
print("len test df: ", len(test_df))

len train df:  9200
len test df:  2300


In [3]:
torch.cuda.empty_cache()

In [4]:
# get one input output pair with special tokens for later concatenation with other pairs for one sequence under max sequence length
def getEncodingOpen(df, i):
    reviewtext = "Review: "+ df["reviewText"].iloc[i]
    rating = "Rating: " 
    #row = [16000] + tok.encode(reviewText, out_type=int) + [16001] + tok.encode([int(df["overall"].iloc[i])], out_type = int)
    row = [16000] + tok.encode(reviewtext, out_type = int) + [16002] + tok.encode(rating, out_type = int)
    correct_output_rating = int(df["overall"].iloc[i])
    row = torch.LongTensor(row)
    correct_output_rating = torch.LongTensor([correct_output_rating])
    return row, correct_output_rating
# get one input output pair with special tokens for later concatenation with other pairs for one sequence under max sequence length
def getEncoding(df, i):
    reviewtext = "Review: "+ df["reviewText"].iloc[i]
    rating = "Rating: " #+ str(int(df["overall"].iloc[i]))
    score = int(df["overall"].iloc[i])
    row = [16000] + tok.encode(reviewtext, out_type = int) + [16002] + tok.encode(rating, out_type = int) +[score+16003]+ [16001]
    return row


In [27]:
d = {16000: "<BOS>", 16001: "<EOS>",16002: "<SEP>",16003: "<0>",16004: "<1>",16005: "<2>",16006: "<3>",16007: "<4>", 16008: "PAD"}

def decode_seq(seq):
    outp = ""
    sofar = []
    for i in seq:
        if(i<=15999):
            sofar.append(i)
        else:
            outp += tok.decode(sofar)
            outp += d[i]
            sofar = []
    outp += tok.decode(sofar)
    return outp


In [28]:
decode_seq(getEncoding(train_df, 2))

'<BOS>Review: The legs are to long. Ordered a 30" inseam. A person with a 33" inseam could wear these with no problem.<SEP>Rating:<2><EOS>'

In [29]:
train_df["overall"].iloc[2]

np.float64(2.0)

In [30]:
print(seq_len)

512


In [31]:
def getShiftSeq(df_t, max_seq=1024):
    seqs_x = []
    seqs_y = []
    seqs = []
    c = []
    for i in range(len(df_t)):
        row = getEncoding(df_t, i)
        if len(c) + len(row) > max_seq +1:
            seqs_x.append(c[:-1])
            seqs_y.append(c[1:])
            seqs.append(c)
            c = []
        c.extend(row)
    return seqs_x, seqs_y, seqs

train_seqs_x, train_seqs_y, train_seqs = getShiftSeq(train_df, max_seq=seq_len)
test_seqs_x, test_seqs_y, test_seqs = getShiftSeq(test_df, max_seq=seq_len)

In [32]:
len(train_seqs_x[0])

508

In [33]:
len(train_seqs[0])

509

In [34]:
decode_seq(train_seqs[0][-15:])

'good products, but they have become complete crap.<SEP>Rating:<0><EOS>'

In [35]:
train_seqs_x[0][-15:]

[238, 47, 686, 6, 23, 22, 27, 1097, 2032, 2912, 3, 16002, 13254, 292, 16003]

In [36]:
train_seqs[0][-15:]

[47, 686, 6, 23, 22, 27, 1097, 2032, 2912, 3, 16002, 13254, 292, 16003, 16001]

In [37]:
class TokenDatasetB(Dataset):
    def __init__(self, seqs_x, seqs_y):
        self.seqs_x = seqs_x
        self.seqs_y = seqs_y
    
    def __len__(self):
        return len(self.seqs_x)
    
    def __getitem__(self, idx):
        return torch.LongTensor(self.seqs_x[idx]), torch.LongTensor(self.seqs_y[idx])

# Padding collate function for variable length sequences
def collate_fnB(batch):
    seqs_x, seqs_y = zip(*batch)
    lens = [len(s) for s in seqs_x]
    max_len = max(lens)
    
    # Pad sequences
    padded_x = torch.zeros(len(seqs_x), max_len, dtype=torch.long) + pad_token
    padded_y = torch.zeros(len(seqs_y), max_len, dtype=torch.long) + pad_token
    
    for i, (x, y) in enumerate(zip(seqs_x, seqs_y)):
        padded_x[i, :len(x)] = x
        padded_y[i, :len(y)] = y
    
    return padded_x, padded_y, torch.LongTensor(lens)

dataset = TokenDatasetB(train_seqs_x, train_seqs_y)
train_loader = DataLoader(dataset, batch_size=5, shuffle=True, collate_fn=collate_fnB)

In [38]:
for batch_idx, (x, y, lengths) in enumerate(train_loader):
    print("bi")
    print(batch_idx)
    print("x")
    print(x.shape)
    xl = x[0].tolist()
    print(xl)
    print()
    
    
    print(decode_seq(x[0].tolist()))
    print()
    print(decode_seq(x[1].tolist()))
    print()
    print(decode_seq(x[2].tolist()))
    print()
    print(decode_seq(x[3].tolist()))
    print()
    print(decode_seq(x[4].tolist()))
    print()
    print("y")
    print(y)
    print(y.shape)
    print("lengths")
    print(lengths.shape)
    break

bi
0
x
torch.Size([5, 510])
[16000, 7421, 292, 39, 971, 521, 5344, 52, 4871, 6, 4, 27, 127, 123, 24, 11, 177, 1508, 101, 7, 24, 19, 5, 157, 25, 19, 36, 5344, 52, 4871, 3, 16002, 13254, 292, 16004, 16001, 16000, 7421, 292, 1480, 36, 334, 57, 9, 432, 41, 7, 1111, 71, 60, 6, 628, 733, 186, 818, 233, 22, 63, 7069, 3, 16002, 13254, 292, 16004, 16001, 16000, 7421, 292, 815, 303, 215, 3, 39, 26, 352, 9, 46, 60, 177, 5, 2621, 3, 1225, 371, 151, 184, 3, 16002, 13254, 292, 16005, 16001, 16000, 7421, 292, 39, 44, 58, 46, 452, 3, 109, 13, 36, 18, 4496, 3, 1110, 327, 6, 244, 7, 634, 3, 16002, 13254, 292, 16004, 16001, 16000, 7421, 292, 96, 50, 63, 314, 92, 214, 63, 163, 11, 651, 16002, 13254, 292, 16007, 16001, 16000, 7421, 292, 4, 111, 10, 29, 223, 415, 8969, 432, 75, 276, 124, 908, 10, 131, 241, 2875, 3, 16002, 13254, 292, 16005, 16001, 16000, 7421, 292, 139, 82, 148, 3, 4, 43, 14, 34, 16002, 13254, 292, 16006, 16001, 16000, 7421, 292, 109, 8, 8, 2070, 415, 7, 645, 43, 5, 523, 1395, 3, 431, 46, 7

In [39]:
if torch.cuda.is_available():
    print("CUDA is available!")
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"Current CUDA device name: {torch.cuda.get_device_name(0)}")
    device = 'cuda'
else:
    print("CUDA is not available. PyTorch will use the CPU.")
    device = 'cpu'

CUDA is available!
CUDA device count: 1
Current CUDA device name: NVIDIA GeForce RTX 4080 SUPER


In [40]:

def save_checkpoint(model, optimizer, step, path):
    ckpt = {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "step": step,
    }
    torch.save(ckpt, path)


In [41]:
class GPTBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, mlp_dim, dropout=0.1):
        super().__init__()

        self.ln1 = nn.LayerNorm(embed_dim)
        self.attn = nn.MultiheadAttention(
            embed_dim, num_heads, dropout=dropout, batch_first=True
        )

        self.ln2 = nn.LayerNorm(embed_dim)

        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Linear(mlp_dim, embed_dim),
            nn.Dropout(dropout),
        )

    def forward(self, x, causal_mask, padding_mask):
        # Self-attention (GPT-style)
        h = self.ln1(x)
        attn_out, _ = self.attn(
            h, h, h,
            attn_mask=causal_mask,
            key_padding_mask=padding_mask,
            need_weights=False
        )
        x = x + attn_out

        # Feedforward
        h = self.ln2(x)
        ff_out = self.mlp(h)
        x = x + ff_out

        return x
class DecoderOnlyTransformer(nn.Module):
    def __init__(self, vocab_size, max_len=512,
                 embed_dim=768, num_heads=12,
                 num_layers=4, mlp_dim=3072, dropout=0.25):
        super().__init__()

        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.pos = nn.Embedding(max_len, embed_dim)

        self.layers = nn.ModuleList([
            GPTBlock(embed_dim, num_heads, mlp_dim, dropout)
            for _ in range(num_layers)
        ])

        self.ln_final = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size, bias=False)
        #self.head.weight = self.embed.weight  # weight tying

    #def causal_mask(self, T, device):
    #    mask = torch.triu(torch.ones(T, T, device=device), 1)
    #    return mask * float("-inf")

    def causal_mask(self, T, device): 
        mask = torch.triu(torch.ones(T, T, device=device), 1) 
        return mask.masked_fill(mask == 1, float('-inf'))



    
    def forward(self, x, lengths=None):
        B, T = x.shape
        device = x.device

        tok = self.embed(x)
        pos = self.pos(torch.arange(T, device=device)[None, :])
        h = tok + pos

        causal = self.causal_mask(T, device)     # (T, T)
        pad_mask = (x == 0)                      # (B, T)

        for layer in self.layers:
            h = layer(h, causal, pad_mask)

        h = self.ln_final(h)
        return self.head(h)                      # (B, T, V)


In [42]:
len(train_loader)

80

In [43]:
# Training function
def train_model(model, train_loader, epochs=10, lr=1e-4, device='cuda'):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    track_loss = []
    global_step = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        loader = tqdm(train_loader)
        
        for x, y, lengths in loader:
            x, y = x.to(device), y.to(device)
            lengths = lengths.to(device)
            
            optimizer.zero_grad()
            
            # Forward pass
            logits = model(x, lengths)
            
            # Reshape for loss calculation
            logits = logits.view(-1, logits.size(-1))
            y = y.view(-1)
            
            loss = criterion(logits, y)
            
            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            
            total_loss += loss.item()
            track_loss.append(loss.item())
            avg_loss = sum(track_loss[-10:]) / 10
            loader.set_postfix(loss=avg_loss)
            del logits
            torch.cuda.empty_cache()
            global_step+=1

            ckpt_path = f"ckpt_{epoch}.pt"
            if global_step % 2500 == 0:
                save_checkpoint(model, optimizer, global_step, ckpt_path)
                print(f"Saved checkpoint at step {global_step}")
        
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}')
    
    return model




In [44]:
dataset = TokenDatasetB(train_seqs_x, train_seqs_y)
train_loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fnB)


# Initialize model (set vocab_size to your tokenizer's vocab size + special tokens)
vocab_size = 16000+9  # Adjust based on your tokenizer
model = DecoderOnlyTransformer(vocab_size=vocab_size,  num_layers=12)

In [45]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params}")

Total parameters: 110039040


In [46]:
# Train
model = train_model(model, train_loader, epochs=75, lr=1e-4)


100%|██████████████████████████████| 100/100 [00:10<00:00,  9.12it/s, loss=3.68]


Epoch 1/75, Average Loss: 4.4267


100%|██████████████████████████████| 100/100 [00:11<00:00,  9.06it/s, loss=3.45]


Epoch 2/75, Average Loss: 3.4482


100%|██████████████████████████████| 100/100 [00:11<00:00,  9.06it/s, loss=3.22]


Epoch 3/75, Average Loss: 3.2060


100%|██████████████████████████████| 100/100 [00:10<00:00,  9.10it/s, loss=3.01]


Epoch 4/75, Average Loss: 3.0220


100%|██████████████████████████████| 100/100 [00:11<00:00,  9.06it/s, loss=2.83]


Epoch 5/75, Average Loss: 2.8571


100%|██████████████████████████████| 100/100 [00:11<00:00,  9.06it/s, loss=2.72]


Epoch 6/75, Average Loss: 2.7025


100%|██████████████████████████████| 100/100 [00:11<00:00,  9.05it/s, loss=2.57]


Epoch 7/75, Average Loss: 2.5425


100%|██████████████████████████████| 100/100 [00:11<00:00,  9.08it/s, loss=2.43]


Epoch 8/75, Average Loss: 2.3742


100%|███████████████████████████████| 100/100 [00:11<00:00,  9.03it/s, loss=2.2]


Epoch 9/75, Average Loss: 2.1897


100%|██████████████████████████████| 100/100 [00:11<00:00,  9.06it/s, loss=2.05]


Epoch 10/75, Average Loss: 2.0066


100%|██████████████████████████████| 100/100 [00:11<00:00,  9.06it/s, loss=1.86]


Epoch 11/75, Average Loss: 1.8225


100%|███████████████████████████████| 100/100 [00:11<00:00,  9.02it/s, loss=1.7]


Epoch 12/75, Average Loss: 1.6510


100%|██████████████████████████████| 100/100 [00:11<00:00,  9.00it/s, loss=1.51]


Epoch 13/75, Average Loss: 1.4820


100%|██████████████████████████████| 100/100 [00:11<00:00,  9.07it/s, loss=1.38]


Epoch 14/75, Average Loss: 1.3258


100%|██████████████████████████████| 100/100 [00:11<00:00,  9.02it/s, loss=1.25]


Epoch 15/75, Average Loss: 1.1761


100%|███████████████████████████████| 100/100 [00:11<00:00,  9.01it/s, loss=1.1]


Epoch 16/75, Average Loss: 1.0333


100%|█████████████████████████████| 100/100 [00:11<00:00,  8.99it/s, loss=0.957]


Epoch 17/75, Average Loss: 0.9009


100%|█████████████████████████████| 100/100 [00:11<00:00,  9.04it/s, loss=0.822]


Epoch 18/75, Average Loss: 0.7818


100%|██████████████████████████████| 100/100 [00:11<00:00,  9.03it/s, loss=0.71]


Epoch 19/75, Average Loss: 0.6698


100%|██████████████████████████████| 100/100 [00:11<00:00,  9.01it/s, loss=0.61]


Epoch 20/75, Average Loss: 0.5694


100%|█████████████████████████████| 100/100 [00:11<00:00,  9.01it/s, loss=0.511]


Epoch 21/75, Average Loss: 0.4798


100%|██████████████████████████████| 100/100 [00:11<00:00,  9.00it/s, loss=0.43]


Epoch 22/75, Average Loss: 0.4049


100%|█████████████████████████████| 100/100 [00:11<00:00,  9.06it/s, loss=0.366]


Epoch 23/75, Average Loss: 0.3374


100%|█████████████████████████████| 100/100 [00:11<00:00,  9.04it/s, loss=0.301]


Epoch 24/75, Average Loss: 0.2819


100%|█████████████████████████████| 100/100 [00:11<00:00,  8.35it/s, loss=0.254]


Saved checkpoint at step 2500
Epoch 25/75, Average Loss: 0.2367


100%|█████████████████████████████| 100/100 [00:10<00:00,  9.18it/s, loss=0.217]


Epoch 26/75, Average Loss: 0.2002


100%|█████████████████████████████| 100/100 [00:11<00:00,  9.05it/s, loss=0.183]


Epoch 27/75, Average Loss: 0.1704


100%|█████████████████████████████| 100/100 [00:10<00:00,  9.10it/s, loss=0.154]


Epoch 28/75, Average Loss: 0.1409


100%|██████████████████████████████| 100/100 [00:10<00:00,  9.18it/s, loss=0.13]


Epoch 29/75, Average Loss: 0.1234


100%|█████████████████████████████| 100/100 [00:10<00:00,  9.13it/s, loss=0.113]


Epoch 30/75, Average Loss: 0.1032


100%|████████████████████████████| 100/100 [00:11<00:00,  9.03it/s, loss=0.0921]


Epoch 31/75, Average Loss: 0.0874


100%|████████████████████████████| 100/100 [00:11<00:00,  8.99it/s, loss=0.0849]


Epoch 32/75, Average Loss: 0.0786


100%|█████████████████████████████| 100/100 [00:11<00:00,  9.01it/s, loss=0.074]


Epoch 33/75, Average Loss: 0.0682


100%|████████████████████████████| 100/100 [00:11<00:00,  9.00it/s, loss=0.0713]


Epoch 34/75, Average Loss: 0.0639


100%|████████████████████████████| 100/100 [00:11<00:00,  9.03it/s, loss=0.0639]


Epoch 35/75, Average Loss: 0.0598


100%|█████████████████████████████| 100/100 [00:11<00:00,  8.92it/s, loss=0.062]


Epoch 36/75, Average Loss: 0.0564


100%|████████████████████████████| 100/100 [00:11<00:00,  8.99it/s, loss=0.0549]


Epoch 37/75, Average Loss: 0.0537


100%|████████████████████████████| 100/100 [00:11<00:00,  9.03it/s, loss=0.0541]


Epoch 38/75, Average Loss: 0.0498


100%|████████████████████████████| 100/100 [00:11<00:00,  8.99it/s, loss=0.0522]


Epoch 39/75, Average Loss: 0.0466


100%|████████████████████████████| 100/100 [00:11<00:00,  8.98it/s, loss=0.0484]


Epoch 40/75, Average Loss: 0.0462


100%|████████████████████████████| 100/100 [00:11<00:00,  9.02it/s, loss=0.0526]


Epoch 41/75, Average Loss: 0.0464


100%|█████████████████████████████| 100/100 [00:11<00:00,  9.01it/s, loss=0.059]


Epoch 42/75, Average Loss: 0.0481


100%|████████████████████████████| 100/100 [00:11<00:00,  9.04it/s, loss=0.0532]


Epoch 43/75, Average Loss: 0.0494


100%|████████████████████████████| 100/100 [00:11<00:00,  9.03it/s, loss=0.0516]


Epoch 44/75, Average Loss: 0.0472


100%|████████████████████████████| 100/100 [00:11<00:00,  9.00it/s, loss=0.0492]


Epoch 45/75, Average Loss: 0.0462


100%|████████████████████████████| 100/100 [00:11<00:00,  9.01it/s, loss=0.0458]


Epoch 46/75, Average Loss: 0.0428


100%|████████████████████████████| 100/100 [00:11<00:00,  9.05it/s, loss=0.0455]


Epoch 47/75, Average Loss: 0.0404


100%|████████████████████████████| 100/100 [00:11<00:00,  9.00it/s, loss=0.0456]


Epoch 48/75, Average Loss: 0.0404


100%|████████████████████████████| 100/100 [00:11<00:00,  9.00it/s, loss=0.0447]


Epoch 49/75, Average Loss: 0.0414


100%|████████████████████████████| 100/100 [00:11<00:00,  8.35it/s, loss=0.0445]


Saved checkpoint at step 5000
Epoch 50/75, Average Loss: 0.0393


100%|████████████████████████████| 100/100 [00:11<00:00,  9.02it/s, loss=0.0416]


Epoch 51/75, Average Loss: 0.0398


100%|████████████████████████████| 100/100 [00:11<00:00,  8.99it/s, loss=0.0436]


Epoch 52/75, Average Loss: 0.0392


100%|████████████████████████████| 100/100 [00:11<00:00,  9.00it/s, loss=0.0453]


Epoch 53/75, Average Loss: 0.0396


100%|████████████████████████████| 100/100 [00:11<00:00,  8.97it/s, loss=0.0395]


Epoch 54/75, Average Loss: 0.0386


100%|████████████████████████████| 100/100 [00:11<00:00,  9.00it/s, loss=0.0424]


Epoch 55/75, Average Loss: 0.0368


100%|█████████████████████████████| 100/100 [00:11<00:00,  8.99it/s, loss=0.039]


Epoch 56/75, Average Loss: 0.0377


100%|████████████████████████████| 100/100 [00:11<00:00,  8.99it/s, loss=0.0426]


Epoch 57/75, Average Loss: 0.0369


100%|████████████████████████████| 100/100 [00:11<00:00,  8.94it/s, loss=0.0413]


Epoch 58/75, Average Loss: 0.0380


100%|████████████████████████████| 100/100 [00:11<00:00,  8.98it/s, loss=0.0407]


Epoch 59/75, Average Loss: 0.0365


100%|████████████████████████████| 100/100 [00:10<00:00,  9.14it/s, loss=0.0392]


Epoch 60/75, Average Loss: 0.0356


100%|████████████████████████████| 100/100 [00:10<00:00,  9.19it/s, loss=0.0364]


Epoch 61/75, Average Loss: 0.0351


100%|████████████████████████████| 100/100 [00:10<00:00,  9.11it/s, loss=0.0389]


Epoch 62/75, Average Loss: 0.0348


100%|████████████████████████████| 100/100 [00:10<00:00,  9.10it/s, loss=0.0372]


Epoch 63/75, Average Loss: 0.0351


100%|████████████████████████████| 100/100 [00:11<00:00,  9.09it/s, loss=0.0367]


Epoch 64/75, Average Loss: 0.0363


100%|█████████████████████████████| 100/100 [00:10<00:00,  9.11it/s, loss=0.037]


Epoch 65/75, Average Loss: 0.0337


100%|████████████████████████████| 100/100 [00:10<00:00,  9.15it/s, loss=0.0332]


Epoch 66/75, Average Loss: 0.0327


100%|█████████████████████████████| 100/100 [00:10<00:00,  9.12it/s, loss=0.033]


Epoch 67/75, Average Loss: 0.0313


100%|████████████████████████████| 100/100 [00:10<00:00,  9.13it/s, loss=0.0324]


Epoch 68/75, Average Loss: 0.0308


100%|████████████████████████████| 100/100 [00:10<00:00,  9.12it/s, loss=0.0352]


Epoch 69/75, Average Loss: 0.0316


100%|█████████████████████████████| 100/100 [00:11<00:00,  9.09it/s, loss=0.036]


Epoch 70/75, Average Loss: 0.0325


100%|████████████████████████████| 100/100 [00:10<00:00,  9.10it/s, loss=0.0369]


Epoch 71/75, Average Loss: 0.0333


100%|████████████████████████████| 100/100 [00:10<00:00,  9.16it/s, loss=0.0342]


Epoch 72/75, Average Loss: 0.0325


100%|█████████████████████████████| 100/100 [00:10<00:00,  9.11it/s, loss=0.036]


Epoch 73/75, Average Loss: 0.0308


100%|██████████████████████████████| 100/100 [00:11<00:00,  9.08it/s, loss=0.03]


Epoch 74/75, Average Loss: 0.0297


100%|████████████████████████████| 100/100 [00:11<00:00,  8.47it/s, loss=0.0365]

Saved checkpoint at step 7500
Epoch 75/75, Average Loss: 0.0307





In [304]:
model.eval()

DecoderOnlyTransformer(
  (embed): Embedding(16009, 768, padding_idx=0)
  (pos): Embedding(512, 768)
  (layers): ModuleList(
    (0-11): 12 x GPTBlock(
      (ln1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (ln2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in_features=768, out_features=3072, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=3072, out_features=768, bias=True)
        (3): Dropout(p=0.25, inplace=False)
      )
    )
  )
  (ln_final): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (head): Linear(in_features=768, out_features=16009, bias=False)
)

In [305]:
k = [345, 23, 44, 99]
kt = torch.tensor(k)

In [306]:
seq_len

512

In [307]:
anno, score = getEncodingOpen(test_df, random.randint(0,len(test_df)))

In [308]:
annot = torch.tensor(anno)

  annot = torch.tensor(anno)


In [309]:
decode_seq(anno.tolist())

'<BOS>Review: Perfect.<SEP>Rating:'

In [310]:
annot

tensor([16000,  7421,   292,   269,     3, 16002, 13254,   292])

In [311]:
score

tensor([4])

In [312]:
lens = len(anno)
anno_len = torch.tensor([lens])

# Pad sequences
padded_x = torch.zeros(1, 512, dtype=torch.long) + pad_token
padded_x[0,:lens] = torch.tensor(anno)

padded_x = padded_x.to(device)
anno_len = anno_len.to(device)

  padded_x[0,:lens] = torch.tensor(anno)


In [313]:
padded_x[0].shape

torch.Size([512])

In [314]:
decode_seq(padded_x[0].tolist())

'<BOS>Review: Perfect.<SEP>Rating:PADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPAD

In [315]:
out = model(padded_x, anno_len)

In [316]:
out.shape

torch.Size([1, 512, 16009])

In [317]:
tokenout = torch.argmax(out,dim=2)[0]

In [318]:
score

tensor([4])

In [319]:
tokenout.shape

torch.Size([512])

In [320]:
decode_seq(tokenout.tolist())

'Review: I for<SEP>Rating:<4>PADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPA

In [321]:
def predict_label_ids(model, x, pad_token):
    model.eval()
    with torch.no_grad():
        logits = model(x)                    # (B, T, V)
        nonpad_len = (x != pad_token).sum(dim=1)   # (B,)
        last_pos = nonpad_len - 1
        last_logits = logits[torch.arange(x.size(0)), last_pos]   # (B, V)
        probs = last_logits.softmax(dim=-1)
        top5 = probs.topk(5, dim=-1)
        top1 = probs.topk(1, dim = -1)
        return top5, probs[:, 16003:16008], top1   # top5 and probs specifically for label tokens

# Example:
#x = torch.LongTensor([padded_x]).to(device)
top5, label_probs, top1 = predict_label_ids(model, padded_x, pad_token)
print("label_probs:", label_probs.cpu().numpy())
print("top5 ids:", top5.indices.cpu().numpy())
print("top1 ids:", top1.indices.cpu().numpy())

label_probs: [[2.4788746e-05 3.9458470e-07 1.9498410e-05 7.6452428e-03 9.9216765e-01]]
top5 ids: [[16007 16006 16003 16005   193]]
top1 ids: [[16007]]


In [322]:
decode_seq(top1[1].tolist()[0]), score.item()

('<4>', 4)

In [323]:
class ICLDataset(Dataset):
    def __init__(self, df, shots, seq_len):
        self.df = df
        self.shots = shots
        self.seq_len = seq_len
    
    def __len__(self):
        return len(self.df)

    def getpre(self, idx, l):
        for _ in range(10):
            ixs = []
            xs = []
            for j in range(self.shots):
                sel = idx
                while(sel==idx):
                    sel = random.randint(0, self.__len__() -1)
                xs.extend(getEncoding(self.df, sel))
            if(len(xs) + l <= seq_len):
                return torch.LongTensor(xs)
        raise ValueError(f"can't fit {self.shots} examples in context")
                
            
    
    def __getitem__(self, idx):
        x, y = getEncodingOpen(self.df, idx)
        l = len(x)
        pre = self.getpre(idx, l)
        icl_x = torch.cat((pre,x))
        return icl_x, y

# Padding collate function for variable length sequences
def collate_fn_icl(batch):
    seqs_x, y = zip(*batch)
    lens = [len(s) for s in seqs_x]
    max_len = max(lens)
    
    # Pad sequences
    padded_x = torch.zeros(len(seqs_x), max_len, dtype=torch.long) + pad_token
    
    for i, x in enumerate(seqs_x):
        padded_x[i, :len(x)] = x
    
    return padded_x, torch.LongTensor(y), torch.LongTensor(lens)



In [324]:
icl_dataset = ICLDataset(train_df, 0, seq_len)
icl_loader = DataLoader(icl_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn_icl)

In [325]:
for batch_idx, (x, y, lengths) in enumerate(icl_loader):
    print("bi")
    print(batch_idx)
    print("x")
    print(x.shape)
    xl = x[0].tolist()
    print(xl)
    print()
    
    
    print(decode_seq(xl))
    print("y")
    print(y)
    print(y.shape)
    print("lengths")
    print(lengths.shape)
    break

bi
0
x
torch.Size([8, 27])
[16000, 7421, 292, 39, 278, 99, 1600, 547, 94, 5, 748, 1000, 152, 115, 594, 14, 11, 4424, 128, 774, 3, 16002, 13254, 292, 16008, 16008, 16008]

<BOS>Review: The leather has pulled away from the chain purse after only using it for (1) month.<SEP>Rating:PADPADPAD
y
tensor([0, 3, 4, 1, 2, 0, 1, 1])
torch.Size([8])
lengths
torch.Size([8])


In [326]:
def israting(s):
    bnk = ["0","1","2","3","4"]
    return s in bnk
def isnum(s):
    bnk = [str(n) for n in range(0,20)]
    return s in bnk

def check_token_list(token_list):
    isratings = 0
    isnums = 0
    for token in token_list:
        isratings += 1 if israting(token) else 0
        isnums += 1 if isnum(token) else 0
    return isratings, isnums

In [327]:
d

{16000: '<BOS>',
 16001: '<EOS>',
 16002: '<SEP>',
 16003: '<0>',
 16004: '<1>',
 16005: '<2>',
 16006: '<3>',
 16007: '<4>',
 16008: 'PAD'}

In [328]:
def get_score(x):
    x = x.tolist()
    for i in range(len(x)-1,0, -1):
        if 16003<= x[i] <=16007:
            return x[i]
    return 20000


In [329]:
test_ex = torch.tensor([ 7421,   292,     4,     6,   945,    35,   161,    19,    31,   161,
             6,    18,   161,    19,    10,    29,    26,   143,   204,     3,
            28,   321,    14,   143,    75,   460, 13254,   292, 16000, 16008,
         16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008,
         16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008,
         16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008,
         16008])
vv = get_score(test_ex)

In [330]:
vv

20000

In [331]:
def getTokenScore(t):
    if(t==20000):
        return '-'
    return d[t][1]

In [332]:
icl_dataset = ICLDataset(train_df, 0, seq_len)
icl_loader = DataLoader(icl_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn_icl)

In [333]:
tot = 0
num = 0
ratings = 0
correct = 0
for batch_idx, (x, y, lengths) in enumerate(icl_loader):
    x = x.to(device)
    y = y.to(device)
    lengths = lengths.to(device)
    pred_logits = model(x,lengths)
    print(x.shape)
    print(x[0])
    for sequence in x:
        print(decode_seq(sequence.tolist()))
        print()
    print("========================")
    pred_tokens = torch.argmax(pred_logits,dim=2)
    pred_tokens = pred_tokens.to('cpu')
    #pred_tokens = trim_tail(pred_tokens, pad_token)
    #pred_last_token = pred_tokens[:,-1].tolist()
    pred_last_token = [get_score(i) for i in pred_tokens]
    for sequence in pred_tokens:
        print(decode_seq(sequence.tolist()))
        print()
    pred_scores = []
    print(pred_last_token)
    for token in pred_last_token:
        if(token<=15999):
            pred_scores.append(tok.decode([token]))
        else:
            pred_scores.append(getTokenScore(token))

    print('---')
    print(pred_scores)
    
    ys = [str(ans) for ans in y.tolist()]
    print(ys)
    tot += len(y)
    for t in range(len(y)):
        correct += 1 if pred_scores[t] == ys[t] else 0
    isratings, isnums = check_token_list(pred_scores)
    num += isnums
    ratings += isratings
    torch.cuda.empty_cache()
    break

torch.Size([8, 29])
tensor([16000,  7421,   292,   266,    35,    49,    35,    18,    98,  4521,
           15,    23,   140,    47, 16002, 13254,   292, 16008, 16008, 16008,
        16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008],
       device='cuda:0')
<BOS>Review: Not as comfortable as my other sauconys but still good<SEP>Rating:PADPADPADPADPADPADPADPADPADPADPADPAD

<BOS>Review: Boots came in PERFECT condition and fit just as expected<SEP>Rating:PADPADPADPADPADPADPADPADPADPADPADPADPAD

<BOS>Review: I bought dickies before & were knee length with 13" inseam these are halfway down my calf,will return them!<SEP>Rating:

<BOS>Review: It's ok.<SEP>Rating:PADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPAD

<BOS>Review: Love them!<SEP>Rating:PADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPAD

<BOS>Review: It was not a bad fit and worked out well. I will order other trousers in the near future.<SEP>Rating:PADPAD

<BOS>Review: Poor quality, it leaked the first 

In [334]:
print("tot: ", tot)
print("num: ", num)
print("ratings: ", ratings)
print("correct: ", correct)

tot:  8
num:  8
ratings:  8
correct:  0


In [382]:
icl_dataset = ICLDataset(test_df, 5, seq_len)
icl_loader = DataLoader(icl_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn_icl)

In [383]:
gt_ans = []
pred_ans = []

In [385]:
tot = 0
num = 0
ratings = 0
correct = 0
for batch_idx, (x, y, lengths) in enumerate(icl_loader):
    x = x.to(device)
    y = y.to(device)
    lengths = lengths.to(device)
    pred_logits = model(x,lengths)
    #print(x.shape)
    #print(x[0])
    #for sequence in x:
    #    print(decode_seq(sequence.tolist()))
    #    print()
    #print("========================")
    pred_tokens = torch.argmax(pred_logits,dim=2)
    pred_tokens = pred_tokens.to('cpu')
    #pred_tokens = trim_tail(pred_tokens, pad_token)
    #pred_last_token = pred_tokens[:,-1].tolist()
    pred_last_token = [get_score(i) for i in pred_tokens]
    #for sequence in pred_tokens:
    #    print(decode_seq(sequence.tolist()))
    #    print()
    pred_scores = []
    #print(pred_last_token)
    for token in pred_last_token:
        if(token<=15999):
            pred_scores.append(tok.decode([token]))
        else:
            pred_scores.append(getTokenScore(token))

    #print('---')
    #print(pred_scores)
    ys = [str(ans) for ans in y.tolist()]

    pred_ans.extend(pred_scores)
    gt_ans.extend(ys)
    #print(ys)
    tot += len(y)
    for t in range(len(y)):
        correct += 1 if pred_scores[t] == ys[t] else 0
    isratings, isnums = check_token_list(pred_scores)
    num += isnums
    ratings += isratings
    del pred_logits
    torch.cuda.empty_cache()

OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 15.55 GiB of which 8.31 MiB is free. Including non-PyTorch memory, this process has 4.64 GiB memory in use. Process 56626 has 10.46 GiB memory in use. Of the allocated memory 3.87 GiB is allocated by PyTorch, and 474.04 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
print("tot: ", tot)
print("num: ", num)
print("ratings: ", ratings)
print("correct: ", correct)
print("acc: ", correct/num)

In [377]:
diffs = [(int(gt_ans[i]) - int(pred_ans[i]))**2 for i in range(len(gt_ans))]

In [378]:
print("MSE")
print(sum(diffs)/len(diffs))

MSE
2.9982608695652173


In [None]:
5358/15387 #0

In [None]:
 4778 / 15396