In [1]:
import pandas as pd
import re
import math
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np

from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

import sentencepiece as spm

from tqdm import tqdm
import random

file = "amazon_review.csv"

data = pd.read_csv(file).dropna(ignore_index=True)
data['overall'] = data["overall"] - 1

vocab_size = 16000
seq_len = 512
pad_token = 16008

spm.SentencePieceTrainer.train(
    input='amazon_reviews.txt',
    model_prefix='amazon_reviews',
    vocab_size=vocab_size,
    model_type='unigram',
    character_coverage=1.0
)

tok = spm.SentencePieceProcessor(model_file='amazon_reviews.model')

filter_ = 1

ls  = [len(tok.encode(i, out_type=int)) for i in data["reviewText"]]

data["lengths"] = ls

data_trunc = data[data["lengths"]<=40]

v = data_trunc["overall"].value_counts()

balanced_data = (
    data_trunc.groupby("overall")
      .sample(n=min(v), random_state=42)
      .reset_index(drop=True)
)

len(balanced_data)

df_shuffled = balanced_data.sample(frac=filter_, random_state=42).reset_index(drop=True)

# Split the shuffled DataFrame
train_size = 0.8
train_df = df_shuffled.sample(frac=train_size, random_state=42).reset_index(drop=True)
test_df = df_shuffled.drop(train_df.index).reset_index(drop=True)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: amazon_reviews.txt
  input_format: 
  model_prefix: amazon_reviews
  model_type: UNIGRAM
  vocab_size: 16000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0

In [2]:
print("len train df: ", len(train_df))
print("len test df: ", len(test_df))

len train df:  11368
len test df:  2842


In [3]:
torch.cuda.empty_cache()

In [4]:
# get one input output pair with special tokens for later concatenation with other pairs for one sequence under max sequence length
def getEncodingOpen(df, i):
    reviewtext = "Review: "+ df["reviewText"].iloc[i]
    rating = "Rating: " 
    #row = [16000] + tok.encode(reviewText, out_type=int) + [16001] + tok.encode([int(df["overall"].iloc[i])], out_type = int)
    row = [16000] + tok.encode(reviewtext, out_type = int) + [16002] + tok.encode(rating, out_type = int)
    correct_output_rating = int(df["overall"].iloc[i])
    row = torch.LongTensor(row)
    correct_output_rating = torch.LongTensor([correct_output_rating])
    return row, correct_output_rating
# get one input output pair with special tokens for later concatenation with other pairs for one sequence under max sequence length
def getEncoding(df, i):
    reviewtext = "Review: "+ df["reviewText"].iloc[i]
    rating = "Rating: " #+ str(int(df["overall"].iloc[i]))
    score = int(df["overall"].iloc[i])
    row = [16000] + tok.encode(reviewtext, out_type = int) + [16002] + tok.encode(rating, out_type = int) +[score+16003]+ [16001]
    return row


In [5]:
d = {16000: "<BOS>", 16001: "<EOS>",16002: "<SEP>",16003: "<0>",16004: "<1>",16005: "<2>",16006: "<3>",16007: "<4>", 16008: "PAD"}

def decode_seq(seq):
    outp = ""
    sofar = []
    for i in seq:
        if(i<=15999):
            sofar.append(i)
        else:
            outp += tok.decode(sofar)
            outp += d[i]
            sofar = []
    outp += tok.decode(sofar)
    return outp


In [6]:
decode_seq(getEncoding(train_df, 2))

'<BOS>Review: deception is written that a lot of cotton and it is very little one synthetics. Instead of the declared 65 percent of 35 percent. unpleasant for such deception<SEP>Rating:<0><EOS>'

In [7]:
train_df["overall"].iloc[2]

np.float64(0.0)

In [8]:
print(seq_len)

512


In [9]:
def getShiftSeq(df_t, max_seq=1024):
    seqs_x = []
    seqs_y = []
    seqs = []
    c = []
    for i in range(len(df_t)):
        row = getEncoding(df_t, i)
        if len(c) + len(row) > max_seq +1:
            seqs_x.append(c[:-1])
            seqs_y.append(c[1:])
            seqs.append(c)
            c = []
        c.extend(row)
    return seqs_x, seqs_y, seqs

train_seqs_x, train_seqs_y, train_seqs = getShiftSeq(train_df, max_seq=seq_len)
test_seqs_x, test_seqs_y, test_seqs = getShiftSeq(test_df, max_seq=seq_len)

In [10]:
len(train_seqs_x[0])

509

In [11]:
len(train_seqs[0])

510

In [12]:
decode_seq(train_seqs[0][-15:])

'love the shoes but I ordered the wrong size!<SEP>Rating:<3><EOS>'

In [13]:
train_seqs_x[0][-15:]

[4, 68, 5, 44, 23, 4, 88, 5, 346, 30, 34, 16002, 13254, 292, 16006]

In [14]:
train_seqs[0][-15:]

[68, 5, 44, 23, 4, 88, 5, 346, 30, 34, 16002, 13254, 292, 16006, 16001]

In [15]:
class TokenDatasetB(Dataset):
    def __init__(self, seqs_x, seqs_y):
        self.seqs_x = seqs_x
        self.seqs_y = seqs_y
    
    def __len__(self):
        return len(self.seqs_x)
    
    def __getitem__(self, idx):
        return torch.LongTensor(self.seqs_x[idx]), torch.LongTensor(self.seqs_y[idx])

# Padding collate function for variable length sequences
def collate_fnB(batch):
    seqs_x, seqs_y = zip(*batch)
    lens = [len(s) for s in seqs_x]
    max_len = max(lens)
    
    # Pad sequences
    padded_x = torch.zeros(len(seqs_x), max_len, dtype=torch.long) + pad_token
    padded_y = torch.zeros(len(seqs_y), max_len, dtype=torch.long) + pad_token
    
    for i, (x, y) in enumerate(zip(seqs_x, seqs_y)):
        padded_x[i, :len(x)] = x
        padded_y[i, :len(y)] = y
    
    return padded_x, padded_y, torch.LongTensor(lens)

dataset = TokenDatasetB(train_seqs_x, train_seqs_y)
train_loader = DataLoader(dataset, batch_size=5, shuffle=True, collate_fn=collate_fnB)

In [16]:
for batch_idx, (x, y, lengths) in enumerate(train_loader):
    print("bi")
    print(batch_idx)
    print("x")
    print(x.shape)
    xl = x[0].tolist()
    print(xl)
    print()
    
    
    print(decode_seq(x[0].tolist()))
    print()
    print(decode_seq(x[1].tolist()))
    print()
    print(decode_seq(x[2].tolist()))
    print()
    print(decode_seq(x[3].tolist()))
    print()
    print(decode_seq(x[4].tolist()))
    print()
    print("y")
    print(y)
    print(y.shape)
    print("lengths")
    print(lengths.shape)
    break

bi
0
x
torch.Size([5, 504])
[16000, 7421, 292, 4, 10, 76, 8, 30, 1184, 304, 6, 23, 4, 227, 94, 899, 25, 8, 30, 493, 146, 65, 187, 3, 96, 44, 19, 38, 11, 5, 368, 3, 16002, 13254, 292, 16006, 16001, 16000, 7421, 292, 224, 4403, 352, 174, 32, 24, 44, 3, 45, 19, 8, 589, 1278, 23, 22, 111, 10, 29, 86, 5, 167, 4, 105, 3, 4, 58, 1419, 152, 589, 3, 16002, 13254, 292, 16004, 16001, 16000, 7421, 292, 564, 18, 50, 19, 276, 418, 125, 7, 22, 26, 51, 3, 96, 19, 167, 53, 78, 43, 8, 381, 4, 1408, 1238, 77, 538, 5, 30, 5009, 15, 37, 121, 3, 16002, 13254, 292, 16003, 16001, 16000, 7421, 292, 588, 62, 126, 4, 137, 8, 166, 7, 64, 9, 80, 8, 194, 54, 6, 762, 161, 307, 3, 3015, 10, 29, 46, 288, 24, 165, 3, 16002, 13254, 292, 16004, 16001, 16000, 7421, 292, 70, 8, 120, 443, 7, 312, 3, 16002, 13254, 292, 16003, 16001, 16000, 7421, 292, 567, 7, 3686, 43, 8, 856, 5083, 3, 16002, 13254, 292, 16003, 16001, 16000, 7421, 292, 139, 49, 7, 411, 3, 4, 485, 28, 8, 251, 17, 1682, 252, 3994, 2127, 7, 256, 27, 192, 619, 3,

In [17]:
if torch.cuda.is_available():
    print("CUDA is available!")
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"Current CUDA device name: {torch.cuda.get_device_name(0)}")
    device = 'cuda'
else:
    print("CUDA is not available. PyTorch will use the CPU.")
    device = 'cpu'

CUDA is available!
CUDA device count: 1
Current CUDA device name: NVIDIA GeForce RTX 4080 SUPER


In [18]:

def save_checkpoint(model, optimizer, step, path):
    ckpt = {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "step": step,
    }
    torch.save(ckpt, path)


In [19]:
# LSTM Model
class LSTMNextToken(nn.Module):
    def __init__(self, vocab_size, embed_dim=512, hidden_dim=2048, num_layers=2, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, 
                           batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, x, lengths):
        embedded = self.embedding(x)
        
        # Pack padded sequence
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        
        packed_out, _ = self.lstm(packed)
        
        # Unpack
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        
        output = self.dropout(output)
        logits = self.fc(output)
        
        return logits


In [20]:
len(train_loader)

115

In [21]:
# Training function
def train_model(model, train_loader, epochs=10, lr=1e-4, device='cuda'):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    track_loss = []
    global_step = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        loader = tqdm(train_loader)
        
        for x, y, lengths in loader:
            x, y = x.to(device), y.to(device)
            lengths = lengths.to(device)
            
            optimizer.zero_grad()
            
            # Forward pass
            logits = model(x, lengths)
            
            # Reshape for loss calculation
            logits = logits.view(-1, logits.size(-1))
            y = y.view(-1)
            
            loss = criterion(logits, y)
            
            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            
            total_loss += loss.item()
            track_loss.append(loss.item())
            avg_loss = sum(track_loss[-10:]) / 10
            loader.set_postfix(loss=avg_loss)
            del logits
            torch.cuda.empty_cache()
            global_step+=1

            ckpt_path = f"ckpt_{epoch}.pt"
            if global_step % 2500 == 0:
                save_checkpoint(model, optimizer, global_step, ckpt_path)
                print(f"Saved checkpoint at step {global_step}")
        
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}')
    
    return model




In [22]:
dataset = TokenDatasetB(train_seqs_x, train_seqs_y)
train_loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fnB)


# Initialize model (set vocab_size to your tokenizer's vocab size + special tokens)
vocab_size = 16000+9  # Adjust based on your tokenizer
model = LSTMNextToken(vocab_size=vocab_size,  num_layers=12)

In [23]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params}")

Total parameters: 431265929


In [None]:
# Train
model = train_model(model, train_loader, epochs=20, lr=1e-4)


100%|██████████████████████████████| 144/144 [03:18<00:00,  1.38s/it, loss=5.73]


Epoch 1/20, Average Loss: 6.0085


100%|██████████████████████████████| 144/144 [03:17<00:00,  1.37s/it, loss=5.62]


Epoch 2/20, Average Loss: 5.6530


100%|██████████████████████████████| 144/144 [03:17<00:00,  1.37s/it, loss=5.64]


Epoch 3/20, Average Loss: 5.6418


100%|██████████████████████████████| 144/144 [03:17<00:00,  1.37s/it, loss=5.64]


Epoch 4/20, Average Loss: 5.6302


100%|██████████████████████████████| 144/144 [03:17<00:00,  1.37s/it, loss=5.63]


Epoch 5/20, Average Loss: 5.6266


100%|██████████████████████████████| 144/144 [03:17<00:00,  1.37s/it, loss=5.62]


Epoch 6/20, Average Loss: 5.6183


100%|██████████████████████████████| 144/144 [03:17<00:00,  1.37s/it, loss=5.59]


Epoch 7/20, Average Loss: 5.6189


 35%|██████████▉                    | 51/144 [01:10<02:07,  1.37s/it, loss=5.57]

In [None]:
model.eval()

In [None]:
k = [345, 23, 44, 99]
kt = torch.tensor(k)

In [None]:
seq_len

In [None]:
anno, score = getEncodingOpen(test_df, random.randint(0,len(test_df)))

In [None]:
annot = torch.tensor(anno)

In [None]:
decode_seq(anno.tolist())

In [None]:
annot

In [None]:
score

In [None]:
lens = len(anno)
anno_len = torch.tensor([lens])

# Pad sequences
padded_x = torch.zeros(1, 512, dtype=torch.long) + pad_token
padded_x[0,:lens] = torch.tensor(anno)

padded_x = padded_x.to(device)
anno_len = anno_len.to(device)

In [None]:
padded_x[0].shape

In [None]:
decode_seq(padded_x[0].tolist())

In [None]:
out = model(padded_x, anno_len)

In [None]:
out.shape

In [None]:
tokenout = torch.argmax(out,dim=2)[0]

In [None]:
score

In [None]:
tokenout.shape

In [None]:
decode_seq(tokenout.tolist())

In [None]:
def predict_label_ids(model, x, pad_token):
    model.eval()
    with torch.no_grad():
        logits = model(x)                    # (B, T, V)
        nonpad_len = (x != pad_token).sum(dim=1)   # (B,)
        last_pos = nonpad_len - 1
        last_logits = logits[torch.arange(x.size(0)), last_pos]   # (B, V)
        probs = last_logits.softmax(dim=-1)
        top5 = probs.topk(5, dim=-1)
        top1 = probs.topk(1, dim = -1)
        return top5, probs[:, 16003:16008], top1   # top5 and probs specifically for label tokens

# Example:
#x = torch.LongTensor([padded_x]).to(device)
top5, label_probs, top1 = predict_label_ids(model, padded_x, pad_token)
print("label_probs:", label_probs.cpu().numpy())
print("top5 ids:", top5.indices.cpu().numpy())
print("top1 ids:", top1.indices.cpu().numpy())

In [None]:
decode_seq(top1[1].tolist()[0]), score.item()

In [None]:
class ICLDataset(Dataset):
    def __init__(self, df, shots, seq_len):
        self.df = df
        self.shots = shots
        self.seq_len = seq_len
    
    def __len__(self):
        return len(self.df)

    def getpre(self, idx, l):
        for _ in range(10):
            ixs = []
            xs = []
            for j in range(self.shots):
                sel = idx
                while(sel==idx):
                    sel = random.randint(0, self.__len__() -1)
                xs.extend(getEncoding(self.df, sel))
            if(len(xs) + l <= seq_len):
                return torch.LongTensor(xs)
        raise ValueError(f"can't fit {self.shots} examples in context")
                
            
    
    def __getitem__(self, idx):
        x, y = getEncodingOpen(self.df, idx)
        l = len(x)
        pre = self.getpre(idx, l)
        icl_x = torch.cat((pre,x))
        return icl_x, y

# Padding collate function for variable length sequences
def collate_fn_icl(batch):
    seqs_x, y = zip(*batch)
    lens = [len(s) for s in seqs_x]
    max_len = max(lens)
    
    # Pad sequences
    padded_x = torch.zeros(len(seqs_x), max_len, dtype=torch.long) + pad_token
    
    for i, x in enumerate(seqs_x):
        padded_x[i, :len(x)] = x
    
    return padded_x, torch.LongTensor(y), torch.LongTensor(lens)



In [None]:
icl_dataset = ICLDataset(train_df, 0, seq_len)
icl_loader = DataLoader(icl_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn_icl)

In [None]:
for batch_idx, (x, y, lengths) in enumerate(icl_loader):
    print("bi")
    print(batch_idx)
    print("x")
    print(x.shape)
    xl = x[0].tolist()
    print(xl)
    print()
    
    
    print(decode_seq(xl))
    print("y")
    print(y)
    print(y.shape)
    print("lengths")
    print(lengths.shape)
    break

In [None]:
def israting(s):
    bnk = ["0","1","2","3","4"]
    return s in bnk
def isnum(s):
    bnk = [str(n) for n in range(0,20)]
    return s in bnk

def check_token_list(token_list):
    isratings = 0
    isnums = 0
    for token in token_list:
        isratings += 1 if israting(token) else 0
        isnums += 1 if isnum(token) else 0
    return isratings, isnums

In [None]:
d

In [None]:
def get_score(x):
    x = x.tolist()
    for i in range(len(x)-1,0, -1):
        if 16003<= x[i] <=16007:
            return x[i]
    return 20000


In [None]:
test_ex = torch.tensor([ 7421,   292,     4,     6,   945,    35,   161,    19,    31,   161,
             6,    18,   161,    19,    10,    29,    26,   143,   204,     3,
            28,   321,    14,   143,    75,   460, 13254,   292, 16000, 16008,
         16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008,
         16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008,
         16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008,
         16008])
vv = get_score(test_ex)

In [None]:
vv

In [None]:
def getTokenScore(t):
    if(t==20000):
        return '-'
    return d[t][1]

In [None]:
icl_dataset = ICLDataset(train_df, 0, seq_len)
icl_loader = DataLoader(icl_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn_icl)

In [None]:
tot = 0
num = 0
ratings = 0
correct = 0
for batch_idx, (x, y, lengths) in enumerate(icl_loader):
    x = x.to(device)
    y = y.to(device)
    lengths = lengths.to(device)
    pred_logits = model(x,lengths)
    print(x.shape)
    print(x[0])
    for sequence in x:
        print(decode_seq(sequence.tolist()))
        print()
    print("========================")
    pred_tokens = torch.argmax(pred_logits,dim=2)
    pred_tokens = pred_tokens.to('cpu')
    #pred_tokens = trim_tail(pred_tokens, pad_token)
    #pred_last_token = pred_tokens[:,-1].tolist()
    pred_last_token = [get_score(i) for i in pred_tokens]
    for sequence in pred_tokens:
        print(decode_seq(sequence.tolist()))
        print()
    pred_scores = []
    print(pred_last_token)
    for token in pred_last_token:
        if(token<=15999):
            pred_scores.append(tok.decode([token]))
        else:
            pred_scores.append(getTokenScore(token))

    print('---')
    print(pred_scores)
    
    ys = [str(ans) for ans in y.tolist()]
    print(ys)
    tot += len(y)
    for t in range(len(y)):
        correct += 1 if pred_scores[t] == ys[t] else 0
    isratings, isnums = check_token_list(pred_scores)
    num += isnums
    ratings += isratings
    torch.cuda.empty_cache()
    break

In [None]:
print("tot: ", tot)
print("num: ", num)
print("ratings: ", ratings)
print("correct: ", correct)

In [None]:
icl_dataset = ICLDataset(test_df, 2, seq_len)
icl_loader = DataLoader(icl_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn_icl)

In [None]:
tot = 0
num = 0
ratings = 0
correct = 0
for batch_idx, (x, y, lengths) in enumerate(icl_loader):
    x = x.to(device)
    y = y.to(device)
    lengths = lengths.to(device)
    pred_logits = model(x,lengths)
    #print(x.shape)
    #print(x[0])
    #for sequence in x:
    #    print(decode_seq(sequence.tolist()))
    #    print()
    #print("========================")
    pred_tokens = torch.argmax(pred_logits,dim=2)
    pred_tokens = pred_tokens.to('cpu')
    #pred_tokens = trim_tail(pred_tokens, pad_token)
    #pred_last_token = pred_tokens[:,-1].tolist()
    pred_last_token = [get_score(i) for i in pred_tokens]
    #for sequence in pred_tokens:
    #    print(decode_seq(sequence.tolist()))
    #    print()
    pred_scores = []
    #print(pred_last_token)
    for token in pred_last_token:
        if(token<=15999):
            pred_scores.append(tok.decode([token]))
        else:
            pred_scores.append(getTokenScore(token))

    #print('---')
    #print(pred_scores)
    
    ys = [str(ans) for ans in y.tolist()]
    #print(ys)
    tot += len(y)
    for t in range(len(y)):
        correct += 1 if pred_scores[t] == ys[t] else 0
    isratings, isnums = check_token_list(pred_scores)
    num += isnums
    ratings += isratings
    torch.cuda.empty_cache()

In [None]:
print("tot: ", tot)
print("num: ", num)
print("ratings: ", ratings)
print("correct: ", correct)
print("acc: ", correct/num)

In [None]:
5358/15387 #0

In [None]:
 4778 / 15396