In [1]:
# Install the SentencePiece tokenizer package into the environment used by this kernel.
%pip install sentencepiece

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import re
import math
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np

from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

import sentencepiece as spm

from tqdm import tqdm
import random

# Data preparation: load the reviews, train the tokenizer, balance the classes
# and create the train / test split.
file = "amazon_review.csv"

# Drop incomplete rows and shift the rating down by one so the smallest rating
# becomes 0 (presumably 1-5 stars -> 0-4; confirm against the CSV).
data = pd.read_csv(file).dropna(ignore_index=True)
data['overall'] = data["overall"] - 1

vocab_size = 16000
seq_len = 1024

# NOTE(review): the tokenizer is trained on 'amazon_reviews.txt', which this
# notebook never writes; it has to exist already for a fresh run to work.
spm.SentencePieceTrainer.train(
    input='amazon_reviews.txt',
    model_prefix='amazon_reviews',
    vocab_size=vocab_size,
    model_type='unigram',
    character_coverage=1.0
)

tok = spm.SentencePieceProcessor(model_file='amazon_reviews.model')

filter_ = 1

# Token count of every review, used below to discard long reviews.
ls = [len(tok.encode(i, out_type=int)) for i in data["reviewText"]]

data["lengths"] = ls

data_trunc = data[data["lengths"]<=80]

# Down-sample every rating class to the size of the rarest one.
v = data_trunc["overall"].value_counts()

balanced_data = (
    data_trunc.groupby("overall")
      .sample(n=min(v), random_state=42)
      .reset_index(drop=True)
)

df_shuffled = balanced_data.sample(frac=filter_, random_state=42).reset_index(drop=True)

# Split the shuffled DataFrame.
# The sampled rows must keep their original index until the complement has been
# taken: resetting first would make `drop` remove rows 0..N-1 instead of the
# sampled ones, leaking training rows into the test set.
train_size = 0.8
train_df = df_shuffled.sample(frac=train_size, random_state=42)
test_df = df_shuffled.drop(train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: amazon_reviews.txt
  input_format: 
  model_prefix: amazon_reviews
  model_type: UNIGRAM
  vocab_size: 16000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0

In [3]:
# Report how many rows ended up in each split.
for split_name, split_df in (("train", train_df), ("test", test_df)):
    print(f"len {split_name} df: ", len(split_df))

len train df:  15396
len test df:  3849


In [4]:
# Encode one review as an open prompt: everything up to, but not including, the rating token.
def getEncodingOpen(df, i):
    """Build the unlabeled prompt for row ``i`` of ``df``.

    The prompt is ``16000 (<BOS>) + "Review: <text>" + 16002 (<SEP>) + "Rating: "``.

    Returns:
        (row, correct_output_rating): two LongTensors, the prompt token ids and
        a one-element tensor holding the row's ``overall`` value cast to int.
    """
    prompt_text = "Review: " + df["reviewText"].iloc[i]
    review_ids = tok.encode(prompt_text, out_type=int)
    rating_ids = tok.encode("Rating: ", out_type=int)
    label = int(df["overall"].iloc[i])
    prompt = torch.LongTensor([16000, *review_ids, 16002, *rating_ids])
    return prompt, torch.LongTensor([label])

In [5]:
# Encode one fully labeled example; getShiftSeq concatenates these into training sequences.
def getEncoding(df, i):
    """Return the token ids of the labeled example in row ``i`` of ``df``.

    Layout: ``16000 (<BOS>) + "Review: <text>" + 16002 (<SEP>) + "Rating: "
    + (16003 + overall) + 16001 (<EOS>)``, returned as a plain Python list.
    """
    text = "Review: " + df["reviewText"].iloc[i]
    rating_token = 16003 + int(df["overall"].iloc[i])
    tokens = [16000]
    tokens += tok.encode(text, out_type=int)
    tokens.append(16002)
    tokens += tok.encode("Rating: ", out_type=int)
    tokens += [rating_token, 16001]
    return tokens

def getShiftSeq(df_t, max_seq=1024):
    """Pack encoded examples into next-token-prediction training sequences.

    Every row of ``df_t`` is encoded with ``getEncoding`` and the encodings are
    concatenated, in order, into chunks of at most ``max_seq + 1`` tokens. An
    example is never split across two chunks. Each chunk yields an input
    (all tokens but the last) and a target (all tokens but the first), so both
    have at most ``max_seq`` tokens.

    Args:
        df_t: table of examples; only ``len(df_t)`` and ``getEncoding(df_t, i)``
            are used.
        max_seq: maximum number of tokens in one input / target sequence.

    Returns:
        ``(seqs_x, seqs_y, seqs)``: three parallel lists of token-id lists with
        ``seqs_x[k] == seqs[k][:-1]`` and ``seqs_y[k] == seqs[k][1:]``.
    """
    seqs_x, seqs_y, seqs = [], [], []

    def flush(chunk):
        # A chunk needs at least two tokens to give a non-empty (input, target) pair.
        if len(chunk) >= 2:
            seqs_x.append(chunk[:-1])
            seqs_y.append(chunk[1:])
            seqs.append(chunk)

    c = []
    for i in range(len(df_t)):
        # A single example longer than the budget is cut so that no sequence
        # can exceed max_seq positions.
        row = getEncoding(df_t, i)[:max_seq + 1]
        if len(c) + len(row) > max_seq + 1:
            flush(c)
            c = []
        c.extend(row)

    # Emit the last, partially filled chunk as well; without this the trailing
    # examples would be dropped.
    flush(c)
    return seqs_x, seqs_y, seqs

# Pack both splits into sequences (inputs, shifted targets, full chunks).
train_seqs_x, train_seqs_y, train_seqs = getShiftSeq(train_df, seq_len)
test_seqs_x, test_seqs_y, test_seqs = getShiftSeq(test_df, seq_len)

In [6]:
class TokenDatasetB(Dataset):
    """Dataset over two parallel lists of token-id sequences (inputs and targets)."""

    def __init__(self, seqs_x, seqs_y):
        self.seqs_x, self.seqs_y = seqs_x, seqs_y

    def __len__(self):
        # One item per input sequence.
        return len(self.seqs_x)

    def __getitem__(self, idx):
        # Convert lazily so the stored data stays as plain Python lists.
        inputs = torch.LongTensor(self.seqs_x[idx])
        targets = torch.LongTensor(self.seqs_y[idx])
        return inputs, targets

# Collate function: right-pad variable-length sequences with zeros.
def collate_fnB(batch):
    """Stack ``(input, target)`` tensor pairs into zero-padded batch tensors.

    Returns:
        ``(padded_x, padded_y, lengths)``: both padded tensors have shape
        ``(batch, longest input)``; ``lengths`` holds each input's length.
    """
    inputs, targets = zip(*batch)
    lens = list(map(len, inputs))
    width = max(lens)

    # Zero doubles as the padding id.
    padded_x = torch.zeros(len(inputs), width, dtype=torch.long)
    padded_y = torch.zeros(len(targets), width, dtype=torch.long)
    for row, (x, y) in enumerate(zip(inputs, targets)):
        padded_x[row, :len(x)] = x
        padded_y[row, :len(y)] = y

    return padded_x, padded_y, torch.LongTensor(lens)

# Loader with one sequence per batch.
dataset = TokenDatasetB(seqs_x=train_seqs_x, seqs_y=train_seqs_y)
train_loader = DataLoader(dataset=dataset, batch_size=1, shuffle=True, collate_fn=collate_fnB)

In [7]:
# Look at the first batch only: print its index, tensors and shapes.
for batch_idx, (x, y, lengths) in enumerate(train_loader):
    print("bi", batch_idx, sep="\n")
    print("x", x, x.shape, sep="\n")
    print("y", y, y.shape, sep="\n")
    print("lengths", lengths.shape, sep="\n")
    break

lengths[:10]

bi
0
x
tensor([[16000,  7421,   292,     4,    81,   198,   215,     6,    62,   250,
             7,    62,   370,     3,    70,   474,    43,     4,    10,    76,
           123,     8,  6753,     3,   834,     4,   160,     8,   445,  1749,
            79,     3, 16002, 13254,   292, 16004, 16001, 16000,  7421,   292,
            96,   519,    19,  2611,     3,     4,    10,    76,     8,   241,
           500,   800,  6058,    10,  2517,    84,   128,     7,     4,    10,
            76,  1873,   355,     3,     4,    81,     5,   517,   959,  1017,
           249,    18,   355,    19,    37,   103,    23,    22,    19,  2106,
           103,     3,    39,   517,   959,  1088,    69,    27,   172,   127,
            53,   103,     3,   315,    46,  3569,     9,  2322,   136,    50,
             3, 16002, 13254,   292, 16006, 16001, 16000,  7421,   292,    45,
           809,   532,     9,    78,     3,     4,    88,   543,    15,     7,
            22,    26,    43,   466,    15,  

tensor([961])

In [8]:
# Pick the compute device once and report what was found.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cpu':
    print("CUDA is not available. PyTorch will use the CPU.")
else:
    print("CUDA is available!")
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"Current CUDA device name: {torch.cuda.get_device_name(0)}")

CUDA is available!
CUDA device count: 1
Current CUDA device name: NVIDIA A100-SXM4-40GB


In [9]:
class GPTBlock(nn.Module):
    """Pre-LayerNorm transformer block: self-attention, then an MLP, each with a residual add."""

    def __init__(self, embed_dim, num_heads, mlp_dim, dropout=0.1):
        super().__init__()

        self.ln1 = nn.LayerNorm(embed_dim)
        self.attn = nn.MultiheadAttention(
            embed_dim, num_heads, dropout=dropout, batch_first=True
        )

        self.ln2 = nn.LayerNorm(embed_dim)

        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Linear(mlp_dim, embed_dim),
            nn.Dropout(dropout),
        )

    def forward(self, x, causal_mask, padding_mask):
        """Apply the block to ``x`` of shape (batch, seq, embed_dim).

        ``causal_mask`` is passed to the attention layer as ``attn_mask`` and
        ``padding_mask`` as ``key_padding_mask``; both should use the same dtype.
        """
        # Self-attention (GPT-style)
        h = self.ln1(x)
        attn_out, _ = self.attn(
            h, h, h,
            attn_mask=causal_mask,
            key_padding_mask=padding_mask,
            need_weights=False
        )
        x = x + attn_out

        # Feedforward
        h = self.ln2(x)
        ff_out = self.mlp(h)
        x = x + ff_out

        return x


class DecoderOnlyTransformer(nn.Module):
    """Decoder-only language model with learned positions and tied input/output embeddings.

    Token id 0 is the padding id: it is the embedding's ``padding_idx`` and,
    when ``lengths`` is not supplied to ``forward``, every position holding id 0
    is masked out as an attention key.
    """

    def __init__(self, vocab_size, max_len=1024,
                 embed_dim=512, num_heads=4,
                 num_layers=4, mlp_dim=1024, dropout=0.2):
        super().__init__()

        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.pos = nn.Embedding(max_len, embed_dim)

        self.layers = nn.ModuleList([
            GPTBlock(embed_dim, num_heads, mlp_dim, dropout)
            for _ in range(num_layers)
        ])

        self.ln_final = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size, bias=False)
        self.head.weight = self.embed.weight  # weight tying

    def causal_mask(self, T, device):
        """Return a (T, T) float mask: 0 on and below the diagonal, -inf above it."""
        mask = torch.triu(torch.ones(T, T, device=device), 1)
        return mask.masked_fill(mask == 1, float('-inf'))

    def forward(self, x, lengths=None):
        """Compute next-token logits.

        Args:
            x: (B, T) tensor of token ids, right-padded with 0.
            lengths: optional (B,) number of real tokens per row. When given,
                the padding mask is derived from it, so an id-0 token inside
                the real part of a row is not mistaken for padding. When
                omitted, every position equal to 0 is treated as padding.

        Returns:
            (B, T, vocab_size) tensor of logits.

        Raises:
            ValueError: if T exceeds the size of the positional embedding table.
        """
        B, T = x.shape
        device = x.device

        # Fail with a readable message instead of an out-of-range embedding lookup.
        max_len = self.pos.num_embeddings
        if T > max_len:
            raise ValueError(
                f"sequence length {T} exceeds the model's maximum length {max_len}"
            )

        tok = self.embed(x)
        pos = self.pos(torch.arange(T, device=device)[None, :])
        h = tok + pos

        causal = self.causal_mask(T, device)     # (T, T), additive float mask

        if lengths is None:
            is_pad = (x == 0)                    # (B, T)
        else:
            valid = torch.as_tensor(lengths, device=device).view(B, 1)
            is_pad = torch.arange(T, device=device)[None, :] >= valid

        # Use the same additive float form as the causal mask: mixing a bool
        # padding mask with a float attention mask is deprecated in PyTorch.
        pad_mask = torch.zeros(B, T, dtype=causal.dtype, device=device)
        pad_mask = pad_mask.masked_fill(is_pad, float('-inf'))

        for layer in self.layers:
            h = layer(h, causal, pad_mask)

        h = self.ln_final(h)
        return self.head(h)                      # (B, T, V)


In [10]:
# Training function
def train_model(model, train_loader, epochs=10, lr=1e-4, device='cuda'):
    """Train ``model`` for next-token prediction and return it.

    Args:
        model: module called as ``model(x, lengths)`` and returning logits of
            shape (batch, seq, vocab).
        train_loader: sized iterable of ``(x, y, lengths)`` batches.
        epochs: number of passes over ``train_loader``.
        lr: Adam learning rate.
        device: device the model and every batch are moved to.

    Returns:
        The trained model (on ``device``).
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    track_loss = []
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        loader = tqdm(train_loader)

        for x, y, lengths in loader:
            x, y = x.to(device), y.to(device)
            lengths = lengths.to(device)

            optimizer.zero_grad()

            # Forward pass; flatten (batch, seq) so every position is one sample.
            logits = model(x, lengths)
            loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))

            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            step_loss = loss.item()
            total_loss += step_loss
            track_loss.append(step_loss)

            # Mean over the steps actually recorded (at most the last 10), so
            # the first few readings are not under-reported.
            recent = track_loss[-10:]
            loader.set_postfix(loss=sum(recent) / len(recent))

        # Release cached blocks once per epoch rather than after every step.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        num_batches = len(train_loader)
        avg_loss = total_loss / num_batches if num_batches else float('nan')
        print(f'Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}')

    return model




In [11]:
# Rebuild the loader with four sequences per batch.
dataset = TokenDatasetB(seqs_x=train_seqs_x, seqs_y=train_seqs_y)
train_loader = DataLoader(dataset=dataset, batch_size=4, shuffle=True, collate_fn=collate_fnB)


# Model vocabulary: 16000 tokenizer ids plus 8 extra ids.
vocab_size = 16000 + 8
model = DecoderOnlyTransformer(vocab_size, num_layers=12)

In [12]:
# Sum the element counts of all parameter tensors.
total_params = sum(map(torch.numel, model.parameters()))
print(f"Total parameters: {total_params}")

Total parameters: 33692672


In [None]:
# Train
# Pass the device detected earlier; train_model otherwise defaults to 'cuda'
# and fails on machines without a GPU.
model = train_model(model, train_loader, epochs=20, lr=1e-4, device=device)




/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [66,0,0], thread: [96,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [66,0,0], thread: [97,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [66,0,0], thread: [98,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [66,0,0], thread: [99,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [66,0,0], thread: [100,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [66,0,0], thread: [101,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:155

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Scratch: a small integer tensor built from a Python list.
k = [345, 23, 44, 99]
kt = torch.tensor(k, dtype=torch.int64)

In [None]:
# Display the current value of seq_len.
seq_len

In [None]:
# random.randint includes its upper bound, so len(test_df) itself could be drawn
# and index one past the last row; randrange excludes it.
anno, score = getEncodingOpen(test_df, random.randrange(len(test_df)))

In [None]:
# anno is already a LongTensor; clone().detach() is the recommended way to copy
# a tensor (torch.tensor() on a tensor emits a warning).
annot = anno.clone().detach()

In [None]:
lens = len(anno)
anno_len = torch.tensor([lens])

# Pad sequences: place the prompt at the start of a zero-filled row of width 512.
padded_x = torch.zeros(1, 512, dtype=torch.long)
# anno is already a LongTensor, so assign it directly instead of re-wrapping it
# with torch.tensor(), which warns when given a tensor.
padded_x[0,:lens] = anno

padded_x = padded_x.to(device)
anno_len = anno_len.to(device)

In [None]:
# Number of tokens in the sampled prompt.
print(lens)

In [None]:
# Inference only: switch off dropout and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    out = model(padded_x, anno_len)

In [None]:
# Shape of the logits returned by the model.
out.shape

In [None]:
# Greedy choice at every position of the first (only) row.
tokenout = out.argmax(dim=2)[0]

In [None]:
# Rating tensor returned by getEncodingOpen for the sampled row.
score

In [None]:
# One predicted token id per position.
tokenout.shape

In [None]:

# Cap every id at 15999 before decoding the predicted tokens.
tok.decode([to if to < 15999 else 15999 for to in tokenout.tolist()])

In [None]:
class ICLDataset(Dataset):
    """Dataset of in-context-learning prompts.

    Item ``idx`` is ``shots`` randomly chosen labeled examples (never row
    ``idx`` itself) followed by the unlabeled prompt of row ``idx``, together
    with that row's rating tensor.
    """

    def __init__(self, df, shots, seq_len):
        self.df = df
        self.shots = shots
        self.seq_len = seq_len

    def __len__(self):
        return len(self.df)

    def getpre(self, idx, l):
        """Draw ``self.shots`` demonstrations that fit next to a prompt of ``l`` tokens.

        Up to ten random draws are attempted.

        Raises:
            ValueError: if no draw fits within ``self.seq_len`` tokens, or if the
                dataset has fewer than two rows while demonstrations are requested.
        """
        # With a single row there is no example other than the query itself.
        if self.shots > 0 and len(self) < 2:
            raise ValueError("need at least two rows to draw demonstrations that differ from the query")

        for _ in range(10):
            xs = []
            for _shot in range(self.shots):
                # Redraw until the demonstration is a different row than the query.
                sel = idx
                while sel == idx:
                    sel = random.randint(0, len(self) - 1)
                xs.extend(getEncoding(self.df, sel))
            # Use this dataset's own limit rather than a notebook-level global.
            if len(xs) + l <= self.seq_len:
                return torch.LongTensor(xs)
        raise ValueError(f"can't fit {self.shots} examples in context")

    def __getitem__(self, idx):
        x, y = getEncodingOpen(self.df, idx)
        pre = self.getpre(idx, len(x))
        icl_x = torch.cat((pre, x))
        return icl_x, y

# Collate function for ICL prompts: right-pad the prompts with zeros.
def collate_fn_icl(batch):
    """Stack ``(prompt, label)`` pairs into a padded prompt tensor plus labels and lengths."""
    prompts, labels = zip(*batch)
    lens = list(map(len, prompts))

    # Zero-filled canvas as wide as the longest prompt.
    padded_x = torch.zeros(len(prompts), max(lens), dtype=torch.long)
    for row, prompt in enumerate(prompts):
        padded_x[row, :len(prompt)] = prompt

    return padded_x, torch.LongTensor(labels), torch.LongTensor(lens)



In [None]:
# 3-shot prompts drawn from the training split, eight prompts per batch.
icl_dataset = ICLDataset(df=train_df, shots=3, seq_len=seq_len)
icl_loader = DataLoader(dataset=icl_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn_icl)

# Look at the first batch only.
for batch_idx, (x, y, lengths) in enumerate(icl_loader):
    print("bi", batch_idx, sep="\n")
    print("x", x, x.shape, sep="\n")
    print("y", y, y.shape, sep="\n")
    print("lengths", lengths.shape, sep="\n")
    break

In [None]:
def israting(s):
    """Return True when ``s`` is one of the strings "0" through "4"."""
    return s in tuple("01234")
def isnum(s):
    """Return True when ``s`` is the decimal string of an integer from 0 to 19."""
    return s in tuple(map(str, range(20)))

def check_token_list(token_list):
    """Count how many tokens satisfy ``israting`` and how many satisfy ``isnum``.

    Returns:
        ``(isratings, isnums)`` as two integers.
    """
    # Classify each token once, then total each column.
    flags = [(israting(token), isnum(token)) for token in token_list]
    isratings = sum(is_rating for is_rating, _ in flags)
    isnums = sum(is_num for _, is_num in flags)
    return isratings, isnums

In [None]:
# Scratch: decode token id 15999 on its own.
tok.decode([15999])

In [None]:
# Scratch: check what decoding an empty id list returns.
tok.decode([])

In [None]:
# Printable names for the ids 16000-16007 used by the encoders.
d = {16000: "<BOS>", 16001: "<EOS>", 16002: "<SEP>"}
d.update({16003 + rating: f"<{rating}>" for rating in range(5)})

In [None]:
def decode_seq(seq):
    """Render a sequence of token ids as text.

    Runs of ids up to 15999 are decoded with ``tok``; larger ids are replaced
    by their name from ``d`` (or ``<id>`` when ``d`` has no entry).

    Args:
        seq: iterable of integer token ids.

    Returns:
        The decoded string.
    """
    outp = ""
    sofar = []
    for i in seq:
        if i <= 15999:
            sofar.append(i)
            continue
        if sofar:
            outp += tok.decode(sofar)
            # Start a fresh run; otherwise the same text would be emitted again
            # at every later special token.
            sofar = []
        outp += d.get(i, f"<{i}>")
    # Emit ordinary tokens that follow the last special token.
    if sofar:
        outp += tok.decode(sofar)
    return outp

In [None]:
# Counters: prompts seen, predictions that look like a number, predictions that
# look like a rating, and predictions equal to the true rating.
tot = 0
num = 0
ratings = 0
correct = 0
model.eval()  # evaluation must run without dropout
for batch_idx, (x, y, lengths) in enumerate(icl_loader):
    x = x.to(device)
    y = y.to(device)
    lengths = lengths.to(device)
    with torch.no_grad():
        pred_logits = model(x,lengths)
    print(x.shape)
    print(x[0])
    for sequence in x:
        print(decode_seq(sequence.tolist()))
        print()
    print("========================")
    pred_tokens = torch.argmax(pred_logits,dim=2)
    pred_tokens = pred_tokens.to('cpu')
    # The prediction that matters is the one made at the last *real* token of
    # each prompt; the last column is padding for every shorter row.
    last_pos = (lengths - 1).to('cpu')
    pred_last_token = pred_tokens[torch.arange(pred_tokens.size(0)), last_pos].tolist()
    print(pred_tokens.shape)
    for sequence in pred_tokens:
        print(decode_seq(sequence.tolist()))
        print()
    # Rating tokens 16003-16007 map to "0".."4"; ordinary ids go through the
    # tokenizer, and the remaining ids are shown by name.
    pred_scores = [
        str(token - 16003) if 16003 <= token <= 16007
        else tok.decode([token]) if token < 16000
        else d.get(token, str(token))
        for token in pred_last_token
    ]
    targets = y.view(-1).tolist()
    print(pred_scores)
    print(targets)
    tot += len(targets)
    # Compare ids rather than strings: only a rating token can be correct.
    correct += sum(
        1 for token, target in zip(pred_last_token, targets)
        if token - 16003 == target
    )
    isratings, isnums = check_token_list(pred_scores)
    num += isnums
    ratings += isratings
    torch.cuda.empty_cache()
    break

In [None]:
# Print the evaluation counters.
for label, value in (("tot", tot), ("num", num), ("ratings", ratings), ("correct", correct)):
    print(f"{label}: ", value)

In [None]:
def predict_rating(model, review_text, tokenizer, device='cuda'):
    """Greedily predict the token that follows ``"Rating: "`` for one review.

    The prompt is ``[16000] + "Review: <text>" + [16002] + "Rating: "``.
    Non-string ``review_text`` is converted with ``str``. The model is put in
    eval mode and called without gradient tracking.

    Returns:
        The id (int) with the highest logit at the last prompt position.
    """
    model.eval()

    text = review_text if isinstance(review_text, str) else str(review_text)
    review_ids = tokenizer.encode("Review: " + text, out_type=int)
    rating_ids = tokenizer.encode("Rating: ", out_type=int)
    prompt = [16000, *review_ids, 16002, *rating_ids]

    # A batch holding the single prompt.
    x = torch.LongTensor([prompt]).to(device)

    with torch.no_grad():
        # Logits at the final position, then the greedy choice.
        pred_id = model(x)[0, -1, :].argmax().item()

    return pred_id

print("\n--- Generating Predictions on Test Set ---")
if 'model' in locals() and 'test_df' in locals():
    model.to(device)
    import random

    # Show five random test reviews with the model's prediction.
    for _ in range(5):
        idx = random.randint(0, len(test_df)-1)
        sample = test_df.iloc[idx]
        review = sample['reviewText']
        actual_val = int(sample['overall'])
        
        pred_token = predict_rating(model, review, tok, device)
        
        # Decode prediction
        # Rating 0 -> 16003, Rating 1 -> 16004, etc.
        predicted_rating = pred_token - 16003
        
        print(f"Review: {str(review)}")
        print(f"Actual Rating: {actual_val}")
        # Ratings are on a 0-4 scale (tokens 16003-16007), so 5 is not a valid rating.
        if 0 <= predicted_rating <= 4:
            print(f"Predicted Rating: {predicted_rating}")
        else:
            print(f"Predicted Token: {pred_token} (Raw ID) - Model might not have converged yet or is predicting a word instead of a rating.")
        print("-"*30)
else:
    print("Model or test_df not found. Please run the training cells first.")


In [None]:
import warnings

# Silence PyTorch's deprecation notice about attention masks of different types.
warnings.filterwarnings(
    action="ignore",
    message="Support for mismatched key_padding_mask and attn_mask is deprecated",
)
print("Warnings suppressed.")

In [None]:
def run_icl_experiment(model, test_df, tokenizer, shot_counts=(0, 1, 3, 5, 10), num_samples=500, device='cuda', max_len=1024):
    """Measure rating accuracy as a function of the number of in-context examples.

    For every ``k`` in ``shot_counts``, ``num_samples`` prompts are built: ``k``
    labeled examples sampled from ``test_df`` (never the target row) followed
    by the unlabeled target. The prediction is the arg-max id at the last
    prompt position.

    Prompt format:
        shot:   [16000] "Review: <text>" [16002] "Rating: " [16003 + label] [16001]
        target: [16000] "Review: <text>" [16002] "Rating: "

    Args:
        model: callable returning logits of shape (1, T, vocab) for a (1, T) id tensor.
        test_df: DataFrame with ``reviewText`` and ``overall`` columns.
        tokenizer: object with ``encode(text, out_type=int)``.
        shot_counts: numbers of in-context examples to evaluate.
        num_samples: prompts evaluated per shot count.
        device: device the prompt tensor is moved to.
        max_len: longest prompt the model accepts, in tokens.

    Returns:
        dict mapping each shot count to its accuracy (NaN when ``num_samples`` is 0).
    """
    model.eval()
    results = {}

    # The "Rating: " cue never changes, so encode it once.
    rating_cue = tokenizer.encode("Rating: ", out_type=int)

    def encode_open(review):
        # Unlabeled example; the "Review: " prefix matches getEncoding and predict_rating.
        return [16000] + tokenizer.encode("Review: " + review, out_type=int) + [16002] + rating_cue

    print(f"Running ICL Experiment on {num_samples} samples per shot count...")

    for k in shot_counts:
        print(f"\nEvaluating {k}-shot performance...")
        correct = 0
        total = 0

        for _ in range(num_samples):
            # 1. Pick a target
            target_idx = random.randint(0, len(test_df)-1)
            target_row = test_df.iloc[target_idx]
            target_ids = encode_open(str(target_row['reviewText']))
            target_label = int(target_row['overall'])

            # 2. Pick k shots (excluding target). The target is dropped by its
            #    index label so this also works when the index is not 0..n-1.
            if k > 0:
                shots = test_df.drop(test_df.index[target_idx]).sample(n=k)
            else:
                shots = test_df.iloc[0:0]

            shot_ids = [
                encode_open(str(shot_row['reviewText'])) + [int(shot_row['overall']) + 16003, 16001]
                for _, shot_row in shots.iterrows()
            ]

            # 3. Fit the prompt into max_len by discarding whole shots, earliest
            #    first, so the model never sees half an example.
            budget = max_len - len(target_ids)
            while shot_ids and sum(map(len, shot_ids)) > budget:
                shot_ids.pop(0)

            full_prompt_ids = [token for ids in shot_ids for token in ids] + target_ids
            # Last resort when the target alone is longer than max_len.
            full_prompt_ids = full_prompt_ids[-max_len:]

            # 4. Predict
            x = torch.LongTensor([full_prompt_ids]).to(device)

            with torch.no_grad():
                logits = model(x)
                last_logits = logits[0, -1, :]
                pred_id = torch.argmax(last_logits).item()

            pred_rating = pred_id - 16003

            if pred_rating == target_label:
                correct += 1
            total += 1

        acc = correct / total if total else float('nan')
        results[k] = acc
        print(f"{k}-shot Accuracy: {acc:.4f}")

    return results

# Run the experiment (only when a model exists in this session).
if 'model' not in locals():
    print("Model not loaded.")
else:
    icl_results = run_icl_experiment(
        model, test_df, tok,
        shot_counts=[0, 1, 3, 5, 8, 10],
        num_samples=200,
    )
    print("\nFinal Results:", icl_results)