In [68]:
import pandas as pd
import re
import math
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np

from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

import sentencepiece as spm

from tqdm import tqdm
import random

file = "amazon_review.csv"

data = pd.read_csv(file).dropna(ignore_index=True)
data['overall'] = data["overall"] - 1

vocab_size = 16000
seq_len = 512
pad_token = 16008

spm.SentencePieceTrainer.train(
    input='amazon_reviews.txt',
    model_prefix='amazon_reviews',
    vocab_size=vocab_size,
    model_type='unigram',
    character_coverage=1.0
)

tok = spm.SentencePieceProcessor(model_file='amazon_reviews.model')

filter_ = 1

ls  = [len(tok.encode(i, out_type=int)) for i in data["reviewText"]]

data["lengths"] = ls

data_trunc = data[data["lengths"]<=30]

v = data_trunc["overall"].value_counts()

balanced_data = (
    data_trunc.groupby("overall")
      .sample(n=min(v), random_state=42)
      .reset_index(drop=True)
)

len(balanced_data)

df_shuffled = balanced_data.sample(frac=filter_, random_state=42).reset_index(drop=True)

# Split the shuffled DataFrame
train_size = 0.8
train_df = df_shuffled.sample(frac=train_size, random_state=42).reset_index(drop=True)
test_df = df_shuffled.drop(train_df.index).reset_index(drop=True)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: amazon_reviews.txt
  input_format: 
  model_prefix: amazon_reviews
  model_type: UNIGRAM
  vocab_size: 16000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ‚Åá 
  enable_differential_privacy:

In [69]:
print("len train df: ", len(train_df))
print("len test df: ", len(test_df))

len train df:  9200
len test df:  2300


In [70]:
# get one input output pair with special tokens for later concatenation with other pairs for one sequence under max sequence length
def getEncodingOpen(df, i):
    reviewtext = "Review: "+ df["reviewText"].iloc[i]
    rating = "Rating: " 
    #row = [16000] + tok.encode(reviewText, out_type=int) + [16001] + tok.encode([int(df["overall"].iloc[i])], out_type = int)
    row = [16000] + tok.encode(reviewtext, out_type = int) + [16002] + tok.encode(rating, out_type = int)
    correct_output_rating = int(df["overall"].iloc[i])
    row = torch.LongTensor(row)
    correct_output_rating = torch.LongTensor([correct_output_rating])
    return row, correct_output_rating
# get one input output pair with special tokens for later concatenation with other pairs for one sequence under max sequence length
def getEncoding(df, i):
    reviewtext = "Review: "+ df["reviewText"].iloc[i]
    rating = "Rating: " #+ str(int(df["overall"].iloc[i]))
    score = int(df["overall"].iloc[i])
    row = [16000] + tok.encode(reviewtext, out_type = int) + [16002] + tok.encode(rating, out_type = int) +[score+16003]+ [16001]
    return row


In [71]:
d = {16000: "<BOS>", 16001: "<EOS>",16002: "<SEP>",16003: "<0>",16004: "<1>",16005: "<2>",16006: "<3>",16007: "<4>", 16008: "PAD"}

def decode_seq(seq):
    outp = ""
    sofar = []
    for i in seq:
        if(i<=15999):
            sofar.append(i)
        else:
            outp += tok.decode(sofar)
            outp += d[i]
            sofar = []
    outp += tok.decode(sofar)
    return outp


In [72]:
decode_seq(getEncoding(train_df, 2))

'<BOS>Review: The legs are to long. Ordered a 30" inseam. A person with a 33" inseam could wear these with no problem.<SEP>Rating:<2><EOS>'

In [73]:
train_df["overall"].iloc[2]

np.float64(2.0)

In [74]:
print(seq_len)

512


In [75]:
def getShiftSeq(df_t, max_seq=1024):
    seqs_x = []
    seqs_y = []
    seqs = []
    c = []
    for i in range(len(df_t)):
        row = getEncoding(df_t, i)
        if len(c) + len(row) > max_seq +1:
            seqs_x.append(c[:-1])
            seqs_y.append(c[1:])
            seqs.append(c)
            c = []
        c.extend(row)
    return seqs_x, seqs_y, seqs

train_seqs_x, train_seqs_y, train_seqs = getShiftSeq(train_df, max_seq=seq_len)
test_seqs_x, test_seqs_y, test_seqs = getShiftSeq(test_df, max_seq=seq_len)

In [76]:
len(train_seqs_x[0])

508

In [77]:
len(train_seqs[0])

509

In [78]:
decode_seq(train_seqs[0][-15:])

'good products, but they have become complete crap.<SEP>Rating:<0><EOS>'

In [79]:
train_seqs_x[0][-15:]

[238, 47, 686, 6, 23, 22, 27, 1097, 2032, 2912, 3, 16002, 13254, 292, 16003]

In [80]:
train_seqs[0][-15:]

[47, 686, 6, 23, 22, 27, 1097, 2032, 2912, 3, 16002, 13254, 292, 16003, 16001]

In [81]:
class TokenDatasetB(Dataset):
    def __init__(self, seqs_x, seqs_y):
        self.seqs_x = seqs_x
        self.seqs_y = seqs_y
    
    def __len__(self):
        return len(self.seqs_x)
    
    def __getitem__(self, idx):
        return torch.LongTensor(self.seqs_x[idx]), torch.LongTensor(self.seqs_y[idx])

# Padding collate function for variable length sequences
def collate_fnB(batch):
    seqs_x, seqs_y = zip(*batch)
    lens = [len(s) for s in seqs_x]
    max_len = max(lens)
    
    # Pad sequences
    padded_x = torch.zeros(len(seqs_x), max_len, dtype=torch.long) + pad_token
    padded_y = torch.zeros(len(seqs_y), max_len, dtype=torch.long) + pad_token
    
    for i, (x, y) in enumerate(zip(seqs_x, seqs_y)):
        padded_x[i, :len(x)] = x
        padded_y[i, :len(y)] = y
    
    return padded_x, padded_y, torch.LongTensor(lens)

dataset = TokenDatasetB(train_seqs_x, train_seqs_y)
train_loader = DataLoader(dataset, batch_size=5, shuffle=True, collate_fn=collate_fnB)

In [82]:
for batch_idx, (x, y, lengths) in enumerate(train_loader):
    print("bi")
    print(batch_idx)
    print("x")
    print(x.shape)
    xl = x[0].tolist()
    print(xl)
    print()
    
    
    print(decode_seq(x[0].tolist()))
    print()
    print(decode_seq(x[1].tolist()))
    print()
    print(decode_seq(x[2].tolist()))
    print()
    print(decode_seq(x[3].tolist()))
    print()
    print(decode_seq(x[4].tolist()))
    print()
    print("y")
    print(y)
    print(y.shape)
    print("lengths")
    print(lengths.shape)
    break

bi
0
x
torch.Size([5, 504])
[16000, 7421, 292, 70, 193, 967, 23, 31, 13, 316, 36, 11, 420, 73, 3, 1511, 15, 65, 567, 239, 73, 4, 41, 14, 3, 3, 16002, 13254, 292, 16004, 16001, 16000, 7421, 292, 567, 16002, 13254, 292, 16004, 16001, 16000, 7421, 292, 931, 937, 11, 89, 16002, 13254, 292, 16006, 16001, 16000, 7421, 292, 40, 49, 16002, 13254, 292, 16006, 16001, 16000, 7421, 292, 412, 681, 35, 4, 1451, 5, 981, 6, 4, 944, 8, 1610, 28, 5, 453, 25, 2550, 9, 86, 43, 530, 1431, 85, 778, 530, 1450, 884, 296, 16002, 13254, 292, 16003, 16001, 16000, 7421, 292, 336, 55, 43, 192, 98, 701, 7, 823, 57, 38, 53, 3, 16002, 13254, 292, 16007, 16001, 16000, 7421, 292, 147, 5703, 7, 24, 19, 116, 2065, 3, 45, 41, 51, 7, 294, 51, 3, 4, 58, 162, 91, 5703, 6, 22, 19, 150, 52, 883, 153, 124, 16002, 13254, 292, 16006, 16001, 16000, 7421, 292, 188, 4690, 1993, 676, 6518, 9263, 1200, 1073, 10, 423, 1155, 1456, 3312, 3933, 1338, 8014, 4, 3934, 3780, 3973, 9092, 1722, 3105, 124, 16002, 13254, 292, 16004, 16001, 16000,

In [83]:
if torch.cuda.is_available():
    print("CUDA is available!")
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"Current CUDA device name: {torch.cuda.get_device_name(0)}")
    device = 'cuda'
else:
    print("CUDA is not available. PyTorch will use the CPU.")
    device = 'cpu'

CUDA is available!
CUDA device count: 1
Current CUDA device name: NVIDIA GeForce RTX 4080 SUPER


In [84]:
class GPTBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, mlp_dim, dropout=0.1):
        super().__init__()

        self.ln1 = nn.LayerNorm(embed_dim)
        self.attn = nn.MultiheadAttention(
            embed_dim, num_heads, dropout=dropout, batch_first=True
        )

        self.ln2 = nn.LayerNorm(embed_dim)

        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Linear(mlp_dim, embed_dim),
            nn.Dropout(dropout),
        )

    def forward(self, x, causal_mask, padding_mask):
        # Self-attention (GPT-style)
        h = self.ln1(x)
        attn_out, _ = self.attn(
            h, h, h,
            attn_mask=causal_mask,
            key_padding_mask=padding_mask,
            need_weights=False
        )
        x = x + attn_out

        # Feedforward
        h = self.ln2(x)
        ff_out = self.mlp(h)
        x = x + ff_out

        return x
class DecoderOnlyTransformer(nn.Module):
    def __init__(self, vocab_size, max_len=512,
                 embed_dim=1024, num_heads=8,
                 num_layers=4, mlp_dim=2048, dropout=0.2):
        super().__init__()

        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.pos = nn.Embedding(max_len, embed_dim)

        self.layers = nn.ModuleList([
            GPTBlock(embed_dim, num_heads, mlp_dim, dropout)
            for _ in range(num_layers)
        ])

        self.ln_final = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size, bias=False)
        #self.head.weight = self.embed.weight  # weight tying

    #def causal_mask(self, T, device):
    #    mask = torch.triu(torch.ones(T, T, device=device), 1)
    #    return mask * float("-inf")

    def causal_mask(self, T, device): 
        mask = torch.triu(torch.ones(T, T, device=device), 1) 
        return mask.masked_fill(mask == 1, float('-inf'))



    
    def forward(self, x, lengths=None):
        B, T = x.shape
        device = x.device

        tok = self.embed(x)
        pos = self.pos(torch.arange(T, device=device)[None, :])
        h = tok + pos

        causal = self.causal_mask(T, device)     # (T, T)
        pad_mask = (x == 0)                      # (B, T)

        for layer in self.layers:
            h = layer(h, causal, pad_mask)

        h = self.ln_final(h)
        return self.head(h)                      # (B, T, V)


In [85]:
# Training function
def train_model(model, train_loader, epochs=10, lr=1e-4, device='cuda'):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    track_loss = []
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        loader = tqdm(train_loader)
        
        for x, y, lengths in loader:
            x, y = x.to(device), y.to(device)
            lengths = lengths.to(device)
            
            optimizer.zero_grad()
            
            # Forward pass
            logits = model(x, lengths)
            
            # Reshape for loss calculation
            logits = logits.view(-1, logits.size(-1))
            y = y.view(-1)
            
            loss = criterion(logits, y)
            
            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            
            total_loss += loss.item()
            track_loss.append(loss.item())
            avg_loss = sum(track_loss[-10:]) / 10
            loader.set_postfix(loss=avg_loss)
            del logits
            torch.cuda.empty_cache()
        
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}')
    
    return model




In [86]:
dataset = TokenDatasetB(train_seqs_x, train_seqs_y)
train_loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fnB)


# Initialize model (set vocab_size to your tokenizer's vocab size + special tokens)
vocab_size = 16000+9  # Adjust based on your tokenizer
model = DecoderOnlyTransformer(vocab_size=vocab_size,  num_layers=12)

In [87]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params}")

Total parameters: 670924800


In [88]:
# Train
model = train_model(model, train_loader, epochs=20, lr=1e-4)


  0%|                                                   | 0/100 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 15.55 GiB of which 40.31 MiB is free. Process 11637 has 4.98 GiB memory in use. Including non-PyTorch memory, this process has 10.08 GiB memory in use. Of the allocated memory 9.61 GiB is allocated by PyTorch, and 167.02 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [33]:
k = [345, 23, 44, 99]
kt = torch.tensor(k)

In [34]:
seq_len

512

In [35]:
anno, score = getEncodingOpen(test_df, random.randint(0,len(test_df)))

In [36]:
annot = torch.tensor(anno)

  annot = torch.tensor(anno)


In [37]:
decode_seq(anno.tolist())

'<BOS>Review: These briefs run a bit small. I would order one size larger for a better fit. I also felt that they were to thin.<SEP>Rating:'

In [38]:
annot

tensor([16000,  7421,   292,    96,  1617,   166,     8,   120,    78,     3,
            4,    69,   138,    62,    30,   205,    11,     8,   176,    26,
            3,     4,   172,   516,    25,    22,    63,     9,   387,     3,
        16002, 13254,   292])

In [39]:
score

tensor([1])

In [40]:
lens = len(anno)
anno_len = torch.tensor([lens])

# Pad sequences
padded_x = torch.zeros(1, 512, dtype=torch.long) + pad_token
padded_x[0,:lens] = torch.tensor(anno)

padded_x = padded_x.to(device)
anno_len = anno_len.to(device)

  padded_x[0,:lens] = torch.tensor(anno)


In [41]:
padded_x[0].shape

torch.Size([512])

In [42]:
decode_seq(padded_x[0].tolist())

'<BOS>Review: These briefs run a bit small. I would order one size larger for a better fit. I also felt that they were to thin.<SEP>Rating:PADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPAD

In [43]:
out = model(padded_x, anno_len)

In [44]:
out.shape

torch.Size([1, 512, 16009])

In [45]:
tokenout = torch.argmax(out,dim=2)[0]

In [46]:
score

tensor([1])

In [47]:
tokenout.shape

torch.Size([512])

In [48]:
decode_seq(tokenout.tolist())

'Review: I are way way 34 too<SEP>I have buy product to if than me woman fit good<SEP>love very like are just perfect send that<SEP>Rating:<1>PADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPAD

In [49]:
def predict_label_ids(model, x, pad_token):
    model.eval()
    with torch.no_grad():
        logits = model(x)                    # (B, T, V)
        nonpad_len = (x != pad_token).sum(dim=1)   # (B,)
        last_pos = nonpad_len - 1
        last_logits = logits[torch.arange(x.size(0)), last_pos]   # (B, V)
        probs = last_logits.softmax(dim=-1)
        top5 = probs.topk(5, dim=-1)
        return top5, probs[:, 16003:16008]   # top5 and probs specifically for label tokens

# Example:
#x = torch.LongTensor([padded_x]).to(device)
top5, label_probs = predict_label_ids(model, padded_x, pad_token)
print("label_probs:", label_probs.cpu().numpy())
print("top5 ids:", top5.indices.cpu().numpy())

label_probs: [[0.2941319  0.405071   0.17341492 0.11416022 0.01129261]]
top5 ids: [[16004 16003 16005 16006 16007]]


In [50]:
class ICLDataset(Dataset):
    def __init__(self, df, shots, seq_len):
        self.df = df
        self.shots = shots
        self.seq_len = seq_len
    
    def __len__(self):
        return len(self.df)

    def getpre(self, idx, l):
        for _ in range(10):
            ixs = []
            xs = []
            for j in range(self.shots):
                sel = idx
                while(sel==idx):
                    sel = random.randint(0, self.__len__() -1)
                xs.extend(getEncoding(self.df, sel))
            if(len(xs) + l <= seq_len):
                return torch.LongTensor(xs)
        raise ValueError(f"can't fit {self.shots} examples in context")
                
            
    
    def __getitem__(self, idx):
        x, y = getEncodingOpen(self.df, idx)
        l = len(x)
        pre = self.getpre(idx, l)
        icl_x = torch.cat((pre,x))
        return icl_x, y

# Padding collate function for variable length sequences
def collate_fn_icl(batch):
    seqs_x, y = zip(*batch)
    lens = [len(s) for s in seqs_x]
    max_len = max(lens)
    
    # Pad sequences
    padded_x = torch.zeros(len(seqs_x), max_len, dtype=torch.long) + pad_token
    
    for i, x in enumerate(seqs_x):
        padded_x[i, :len(x)] = x
    
    return padded_x, torch.LongTensor(y), torch.LongTensor(lens)



In [51]:
icl_dataset = ICLDataset(train_df, 0, seq_len)
icl_loader = DataLoader(icl_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn_icl)

In [52]:
for batch_idx, (x, y, lengths) in enumerate(icl_loader):
    print("bi")
    print(batch_idx)
    print("x")
    print(x.shape)
    xl = x[0].tolist()
    print(xl)
    print()
    
    
    print(decode_seq(xl))
    print("y")
    print(y)
    print(y.shape)
    print("lengths")
    print(lengths.shape)
    break

bi
0
x
torch.Size([8, 34])
[16000, 7421, 292, 95, 182, 133, 14, 3, 16002, 13254, 292, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008]

<BOS>Review: My son loves it.<SEP>Rating:PADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPAD
y
tensor([4, 2, 1, 3, 3, 3, 0, 2])
torch.Size([8])
lengths
torch.Size([8])


In [53]:
def israting(s):
    bnk = ["0","1","2","3","4"]
    return s in bnk
def isnum(s):
    bnk = [str(n) for n in range(0,20)]
    return s in bnk

def check_token_list(token_list):
    isratings = 0
    isnums = 0
    for token in token_list:
        isratings += 1 if israting(token) else 0
        isnums += 1 if isnum(token) else 0
    return isratings, isnums

In [54]:
d

{16000: '<BOS>',
 16001: '<EOS>',
 16002: '<SEP>',
 16003: '<0>',
 16004: '<1>',
 16005: '<2>',
 16006: '<3>',
 16007: '<4>',
 16008: 'PAD'}

In [55]:
def get_score(x):
    x = x.tolist()
    for i in range(len(x)-1,0, -1):
        if 16003<= x[i] <=16007:
            return x[i]
    return 20000


In [56]:
test_ex = torch.tensor([ 7421,   292,     4,     6,   945,    35,   161,    19,    31,   161,
             6,    18,   161,    19,    10,    29,    26,   143,   204,     3,
            28,   321,    14,   143,    75,   460, 13254,   292, 16000, 16008,
         16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008,
         16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008,
         16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008, 16008,
         16008])
vv = get_score(test_ex)

In [57]:
vv

20000

In [58]:
def getTokenScore(t):
    if(t==20000):
        return '-'
    return d[t][1]

In [59]:
icl_dataset = ICLDataset(train_df, 0, seq_len)
icl_loader = DataLoader(icl_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn_icl)

In [60]:
tot = 0
num = 0
ratings = 0
correct = 0
for batch_idx, (x, y, lengths) in enumerate(icl_loader):
    x = x.to(device)
    y = y.to(device)
    lengths = lengths.to(device)
    pred_logits = model(x,lengths)
    print(x.shape)
    print(x[0])
    for sequence in x:
        print(decode_seq(sequence.tolist()))
        print()
    print("========================")
    pred_tokens = torch.argmax(pred_logits,dim=2)
    pred_tokens = pred_tokens.to('cpu')
    #pred_tokens = trim_tail(pred_tokens, pad_token)
    #pred_last_token = pred_tokens[:,-1].tolist()
    pred_last_token = [get_score(i) for i in pred_tokens]
    for sequence in pred_tokens:
        print(decode_seq(sequence.tolist()))
        print()
    pred_scores = []
    print(pred_last_token)
    for token in pred_last_token:
        if(token<=15999):
            pred_scores.append(tok.decode([token]))
        else:
            pred_scores.append(getTokenScore(token))

    print('---')
    print(pred_scores)
    
    ys = [str(ans) for ans in y.tolist()]
    print(ys)
    tot += len(y)
    for t in range(len(y)):
        correct += 1 if pred_scores[t] == ys[t] else 0
    isratings, isnums = check_token_list(pred_scores)
    num += isnums
    ratings += isratings
    torch.cuda.empty_cache()
    break

torch.Size([8, 22])
tensor([16000,  7421,   292,  1087,     9,    46,   869,    35,     9,    26,
            3, 16002, 13254,   292, 16008, 16008, 16008, 16008, 16008, 16008,
        16008, 16008], device='cuda:0')
<BOS>Review: Seem to be okay as to fit.<SEP>Rating:PADPADPADPADPADPADPADPAD

<BOS>Review: Material is very bad quality. It does not look anything like the pictures online.<SEP>Rating:

<BOS>Review: I wear a 36 had to get a 38<SEP>Rating:PADPADPADPADPADPADPAD

<BOS>Review: Had to return them because they didn't fit my wifes foot. :(<SEP>Rating:

<BOS>Review: The product was not as described.<SEP>Rating:PADPADPADPADPADPADPADPADPAD

<BOS>Review: 2-3 sizes to small<SEP>Rating:PADPADPADPADPADPADPADPADPADPADPADPAD

<BOS>Review: good hose. lasts more than once like some of the cheaper ones out there.<SEP>Rating:

<BOS>Review: These were too baggy for my taste. They do look good though.<SEP>Rating:PADPAD

Review: Is run well. I put.<SEP>Rating:<0>PADPADPADPADPADPADPADPAD

Review: I

In [61]:
print("tot: ", tot)
print("num: ", num)
print("ratings: ", ratings)
print("correct: ", correct)

tot:  8
num:  8
ratings:  8
correct:  1


In [65]:
icl_dataset = ICLDataset(test_df, 0, seq_len)
icl_loader = DataLoader(icl_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn_icl)

In [66]:
tot = 0
num = 0
ratings = 0
correct = 0
for batch_idx, (x, y, lengths) in enumerate(icl_loader):
    x = x.to(device)
    y = y.to(device)
    lengths = lengths.to(device)
    pred_logits = model(x,lengths)
    #print(x.shape)
    #print(x[0])
    #for sequence in x:
    #    print(decode_seq(sequence.tolist()))
    #    print()
    #print("========================")
    pred_tokens = torch.argmax(pred_logits,dim=2)
    pred_tokens = pred_tokens.to('cpu')
    #pred_tokens = trim_tail(pred_tokens, pad_token)
    #pred_last_token = pred_tokens[:,-1].tolist()
    pred_last_token = [get_score(i) for i in pred_tokens]
    #for sequence in pred_tokens:
    #    print(decode_seq(sequence.tolist()))
    #    print()
    pred_scores = []
    #print(pred_last_token)
    for token in pred_last_token:
        if(token<=15999):
            pred_scores.append(tok.decode([token]))
        else:
            pred_scores.append(getTokenScore(token))

    #print('---')
    #print(pred_scores)
    
    ys = [str(ans) for ans in y.tolist()]
    #print(ys)
    tot += len(y)
    for t in range(len(y)):
        correct += 1 if pred_scores[t] == ys[t] else 0
    isratings, isnums = check_token_list(pred_scores)
    num += isnums
    ratings += isratings
    torch.cuda.empty_cache()

In [67]:
print("tot: ", tot)
print("num: ", num)
print("ratings: ", ratings)
print("correct: ", correct)
print("acc: ", correct/num)

tot:  2300
num:  2300
ratings:  2300
correct:  844
acc:  0.36695652173913046


In [None]:
5358/15387 #0

In [None]:
 4778 / 15396