**
Installing the Transformers library via pip allows you to access powerful tools and models for natural language processing tasks, including GPT-2. It's a standard step in setting up projects involving text data analysis and processing.**

In [57]:
!pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [58]:
import re
import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelWithLMHead
import torch.optim as optim

In [59]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [60]:
import csv
reviews_path = "/kaggle/input/final-dataset-12334/Reviews.csv"

reviews = []

# Open the CSV file in read mode with UTF-8 encoding
with open(reviews_path, "r", encoding="utf-8") as reviews_raw:
    # Create a CSV reader object
    csv_reader = csv.DictReader(reviews_raw)
    
    # Iterate over each row in the CSV file
    for row in csv_reader:
        # Extract the summary and text from the current row
        summary = row["Summary"]
        text = row["Text"]
        
        # Replace " = " with " TL;DR " in summary and text
        summary = summary.replace(" = ", " TL;DR ")
        text = text.replace(" = ", " TL;DR ")
        
        # Combine summary and text into one string
        review = f"{text.strip()} = {summary.strip()}\n"
        
        # Append the combined review to the list
        reviews.append(review)


In [61]:
reviews[:3]

['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most. = Good Quality Dog Food\n',
 'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo". = Not as Advertised\n',
 'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother an

In [62]:
len(reviews)

568454

In [63]:
reviews[5]

"I don't know if it's the cactus or the tequila or just the unique combination of ingredients, but the flavour of this hot sauce makes it one of a kind!  We picked up a bottle once on a trip we were on and brought it back home with us and were totally blown away!  When we realized that we simply couldn't find it anywhere in our city we were bummed.<br /><br />Now, because of the magic of the internet, we have a case of the sauce and are ecstatic because of it.<br /><br />If you love hot sauce..I mean really love hot sauce, but don't want a sauce that tastelessly burns your throat, grab a bottle of Tequila Picante Gourmet de Inclan.  Just realize that once you taste it, you will never want to use any other sauce.<br /><br />Thank you for the personal, incredible service! = The Best Hot Sauce in the World\n"

In [64]:
# Calculate the average length of reviews in terms of the number of words

# Calculate the total length of all reviews by summing the number of words in each review
total_length = sum([len(review.split()) for review in reviews])

# Calculate the average length by dividing the total length by the number of reviews
# len(reviews) returns the number of reviews
avg_length = total_length / len(reviews)

# Display 
avg_length


85.37717211946789

In [65]:
max_length = 100

In [66]:
from transformers import AutoTokenizer, AutoModelWithLMHead

# Import necessary classes from the Transformers library

# Initialize tokenizer with pre-trained weights from GPT-2 model
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Initialize GPT-2 model with pre-trained weights
model = AutoModelWithLMHead.from_pretrained("gpt2")

# Move the model to the specified device (e.g., GPU)
model = model.to(device)

# Initialize AdamW optimizer for updating model parameters during training
optimizer = optim.AdamW(model.parameters(), lr=3e-4)


In [69]:
tokenizer.encode(" TL;DR ")

[24811, 26, 7707, 220]

In [70]:
# Calculate the length of the tokenized representation of the string " TL;DR "
extra_length = len(tokenizer.encode(" TL;DR ")) 


In [72]:
class ReviewDataset(Dataset):
    def __init__(self, tokenizer, reviews, max_len):
        # Initialize dataset with tokenizer, reviews, and maximum sequence length
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token  # End of sequence token
        self.eos_id = self.tokenizer.eos_token_id  # End of sequence token ID
        self.reviews = reviews
        self.result = []

        # Process each review in the dataset
        for review in self.reviews:
            # Encode the review text using tokenizer.encode() and add end of sequence token
            tokenized = self.tokenizer.encode(review + self.eos)
            
            # Pad/truncate the encoded sequence to max_len
            padded = self.pad_truncate(tokenized)            

            # Convert the padded sequence to a PyTorch tensor and add to the result list
            self.result.append(torch.tensor(padded))

    def __len__(self):
        # Return the total number of samples in the dataset
        return len(self.result)

    def __getitem__(self, item):
        # Retrieve a sample from the dataset by index
        return self.result[item]

    def pad_truncate(self, name):
        # Pad or truncate the input sequence to match the specified max_len
        
        # Calculate the length of the tokenized review, excluding the length of " TL;DR "
        name_length = len(name) - extra_length
        
        if name_length < self.max_len:
            # If the length is less than max_len, pad the sequence with EOS tokens
            difference = self.max_len - name_length
            result = name + [self.eos_id] * difference
        elif name_length > self.max_len:
            # If the length exceeds max_len, truncate the sequence and add EOS token
            result = name[:self.max_len + 3] + [self.eos_id]  # Adding 3 to account for extra tokens added during padding
        else:
            # If the length matches max_len, return the original sequence
            result = name
        
        return result

# Create an instance of the ReviewDataset class
dataset = ReviewDataset(tokenizer, reviews, max_length)


# DATALOADER

In [74]:
# Create a DataLoader to batch and shuffle the dataset for training
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)

# Define a function for training the model
def train(model, optimizer, dl, epochs):
    # Iterate over the specified number of epochs
    for epoch in range(epochs):
        # Iterate over each batch in the DataLoader
        for idx, batch in enumerate(dl):
            # Enable gradient computation for the current batch
            with torch.set_grad_enabled(True):
                # Reset gradients to zero
                optimizer.zero_grad()
                
                # Move the batch to the specified device (e.g., GPU)
                batch = batch.to(device)
                
                # Forward pass: compute model predictions
                output = model(batch, labels=batch)
                
                # Compute the loss based on model predictions
                loss = output[0]
                
                # Backward pass: compute gradients and update model parameters
                loss.backward()
                optimizer.step()
                
                # Print loss every 50 batches
                if idx % 50 == 0:
                    print("loss: %f, %d"%(loss, idx))


In [76]:
# Initiate the training process for the specified model, optimizer, dataloader, and number of epochs
train(model=model, optimizer=optimizer, dl=dataloader, epochs=1)

# The train function iterates over each epoch and within each epoch, it processes batches of data.
# For each batch, it computes the model output, calculates the loss, and updates the model parameters based on the optimizer.
# The loss value is printed every 50 batches to monitor the training progress.


loss: 7.179998, 0
loss: 2.546208, 50
loss: 2.173442, 100
loss: 2.687057, 150
loss: 2.299516, 200
loss: 2.323481, 250
loss: 2.308963, 300
loss: 2.470415, 350
loss: 2.384151, 400
loss: 2.411604, 450
loss: 2.191579, 500
loss: 2.710379, 550
loss: 2.356698, 600
loss: 2.104545, 650
loss: 2.092360, 700
loss: 2.215420, 750
loss: 2.406816, 800
loss: 2.368682, 850
loss: 2.637617, 900
loss: 2.189571, 950
loss: 2.308575, 1000
loss: 2.238402, 1050
loss: 2.063868, 1100
loss: 2.344211, 1150
loss: 2.464975, 1200
loss: 2.153612, 1250
loss: 2.138024, 1300
loss: 2.524029, 1350
loss: 2.249254, 1400
loss: 2.285464, 1450
loss: 2.017530, 1500
loss: 1.820162, 1550
loss: 2.174565, 1600
loss: 2.552882, 1650
loss: 2.148380, 1700
loss: 2.165045, 1750
loss: 2.298380, 1800
loss: 2.125666, 1850
loss: 2.262254, 1900
loss: 2.293321, 1950
loss: 2.087833, 2000
loss: 2.001413, 2050
loss: 2.193791, 2100
loss: 2.232748, 2150
loss: 2.245459, 2200
loss: 2.175688, 2250
loss: 2.203779, 2300
loss: 2.333107, 2350
loss: 2.174418,

In [86]:
def topk(probs, n=9):
    # Softmax the scores to convert them into probabilities
    probs = torch.softmax(probs, dim=-1)
    
    # Use PyTorch's topk method to get the top k probabilities and their corresponding indices
    tokensProb, topIx = torch.topk(probs, k=n)
    
    # Normalize the new selection pool (9 choices)
    tokensProb = tokensProb / torch.sum(tokensProb)

    # Send to CPU for numpy handling
    tokensProb = tokensProb.cpu().detach().numpy()

    # Make a random choice from the pool based on the new probability distribution
    choice = np.random.choice(n, 1, p=tokensProb)
    tokenId = topIx[choice][0]

    return int(tokenId)

def model_infer(model, tokenizer, review, max_length=15):
    # Preprocess the initial token (task designator)
    review_encoded = tokenizer.encode(review)
    result = review_encoded
    initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)

    with torch.set_grad_enabled(False):
        # Feed the initial token to the model
        output = model(initial_input)

        # Flatten the logits at the final time step
        logits = output.logits[0, -1]

        # Make a top-k choice and append to the result
        result.append(topk(logits))

        # For max_length times:
        for _ in range(max_length):
            # Feed the current sequence to the model and make a choice
            input = torch.tensor(result).unsqueeze(0).to(device)
            output = model(input)
            logits = output.logits[0, -1]
            res_id = topk(logits)

            # If the chosen token is EOS, return the result
            if res_id == tokenizer.eos_token_id:
                return tokenizer.decode(result)
            else:  # Append to the sequence
                result.append(res_id)
    # If no EOS is generated, return after reaching the max_len
    return tokenizer.decode(result)

# The topk function selects the top k tokens based on their probabilities and returns one of them randomly.
# The model_infer function generates a sequence of tokens using the given model and tokenizer, starting with the provided review.
# It iteratively predicts the next token based on the previous sequence and stops when the EOS token is encountered or after reaching the max_length.
# This function returns the generated sequence as a decoded string.


In [89]:
# Select a random sample of 5 reviews from the list of reviews and split each review at " TL;DR " to extract the original review text
sample_reviews = [review.split(" TL;DR ")[0] for review in random.sample(reviews, 5)]

# Display the sampled review texts
sample_reviews


["I love Coconut and the flavor is really good, but it was much sweeter than expected. I like to have a healthy mid morning snack at work and this is just a little too much to serve that purpose. I have enjoyed having one before a workout to get some extra energy but I won't be buying these on a regular basis. = YUMMY, but too sweet for an Every Day snack\n",
 'There\'s no getting around the fact that this is canned meat and gravy, and so take that into account.  It\'s going to have a bit of canned flavor, smell and texture -- not like Grandma\'s homemade pot roast.<br /><br />But considering that, this is a convenient and pretty well done product for a canned food.  There is a good balance of roast beef chunks to gravy.  I would say you definitely get two servings per can and, depending on use, three or even four servings.  (I can\'t remember what the can claims as number of servings but I think it was two per can.)  A hungry person or big eater might want to consume the entire can.<b

In [91]:
# Iterate over each review in the sample_reviews list
for review in sample_reviews:
    # Initialize a set to store unique summaries generated for each review
    summaries = set()
    
    # Print the original review text
    print(review)
    
    # Generate summaries until there are at least 3 unique summaries
    while len(summaries) < 3:
        # Generate a summary for the current review using the model_infer function
        # Append " TL;DR " to the review to indicate the start of summary generation
        summary = model_infer(model, tokenizer, review + " TL;DR ").split(" TL;DR ")[1].strip()
        
        # Add the generated summary to the set of summaries if it's not already present
        if summary not in summaries:
            summaries.add(summary)
    
    # Print the generated summaries for the current review
    print("Summaries: " + str(summaries) + "\n")


I love Coconut and the flavor is really good, but it was much sweeter than expected. I like to have a healthy mid morning snack at work and this is just a little too much to serve that purpose. I have enjoyed having one before a workout to get some extra energy but I won't be buying these on a regular basis. = YUMMY, but too sweet for an Every Day snack



TypeError: 'NoneType' object is not callable

********# EARLIER ROUGE 

In [77]:
from torch.utils.data import random_split

# Split the dataset into training and testing sets with a ratio of 75% for training and 25% for testing
train_size = int(0.75 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create data loaders for the training and testing sets
# DataLoader for training set: batch size of 32, shuffling the data, and dropping the last incomplete batch if any
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True)
# DataLoader for testing set: batch size of 32, without shuffling the data
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# The training dataset is split into a training set and a testing set with a ratio of 75:25.
# DataLoaders are created for both the training and testing sets, facilitating the efficient loading of batches during training and evaluation.
# The training DataLoader shuffles the data and drops the last incomplete batch to ensure consistent batch sizes during training.
# The testing DataLoader does not shuffle the data since the order of data is not relevant during evaluation.




# Define the training function for the model
def train(model, optimizer, train_dl, epochs):
    # Iterate over the specified number of epochs
    for epoch in range(epochs):
        model.train()  # Set the model to training mode
        # Iterate over each batch in the training DataLoader
        for idx, batch in enumerate(train_dl):
            optimizer.zero_grad()  # Reset gradients to zero
            batch = batch.to(device)  # Move batch to the specified device (e.g., GPU)
            output = model(batch, labels=batch)  # Forward pass: compute model predictions
            loss = output[0]  # Compute the loss based on model predictions
            loss.backward()  # Backward pass: compute gradients
            optimizer.step()  # Update model parameters based on gradients
            # Print loss every 50 batches
            if idx % 50 == 0:
                print(f"Epoch [{epoch+1}/{epochs}], Step [{idx+1}/{len(train_dl)}], Loss: {loss.item()}")



In [79]:
# Train the model on the training set
train(model, optimizer, train_dataloader, epochs=1)


Epoch [1/1], Step [1/13323], Loss: 1.8593367338180542
Epoch [1/1], Step [51/13323], Loss: 1.9994953870773315
Epoch [1/1], Step [101/13323], Loss: 1.8977000713348389
Epoch [1/1], Step [151/13323], Loss: 2.0506272315979004
Epoch [1/1], Step [201/13323], Loss: 1.5420185327529907
Epoch [1/1], Step [251/13323], Loss: 1.4611223936080933
Epoch [1/1], Step [301/13323], Loss: 2.0511529445648193
Epoch [1/1], Step [351/13323], Loss: 1.8443139791488647
Epoch [1/1], Step [401/13323], Loss: 2.04577898979187
Epoch [1/1], Step [451/13323], Loss: 2.006317615509033
Epoch [1/1], Step [501/13323], Loss: 1.8634902238845825
Epoch [1/1], Step [551/13323], Loss: 2.206110715866089
Epoch [1/1], Step [601/13323], Loss: 1.6728296279907227
Epoch [1/1], Step [651/13323], Loss: 1.805537223815918
Epoch [1/1], Step [701/13323], Loss: 1.817780613899231
Epoch [1/1], Step [751/13323], Loss: 2.183117389678955
Epoch [1/1], Step [801/13323], Loss: 2.004399538040161
Epoch [1/1], Step [851/13323], Loss: 1.8522589206695557
Epo

In [82]:
model.load_state_dict(torch.load(model_path))
model.eval()  # Set the model to evaluation mode if needed

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [102]:
# pip install rouge


In [101]:
%%capture
!pip install evaluate
!pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [103]:
import evaluate
rouge = evaluate.load('rouge')


In [131]:
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [138]:
# Initialize lists to store Rouge scores for each batch
rouge1 = []
rouge2 = []
rougeL = []
rougeLsum = []

# Set the model to evaluation mode
model.eval()

# Iterate over each batch in the test DataLoader
for batch in test_dataloader:
    # Move the batch to the specified device (e.g., GPU)
    batch = batch.to(device)
    
    # Disable gradient computation during inference
    with torch.no_grad():
        # Forward pass: compute model predictions
        outputs = model(batch)

    # Get the logits from the model outputs
    logits = outputs.logits
    
    # Get the predicted tokens by selecting the token index with the highest probability
    predictions = torch.argmax(logits, dim=-1)
    
    # Decode the generated output sequence, removing special tokens
    generated_output = tokenizer.decode(predictions[0], skip_special_tokens=True)
    
    # Decode the reference sequence from the batch, removing special tokens
    references = tokenizer.decode(batch[0], skip_special_tokens=True)
    
    # Compute Rouge scores between the generated output and reference sequences
    results = rouge.compute(predictions=[generated_output], references=[references])
    
    # Append the Rouge scores to the respective lists
    rouge1.append(results['rouge1'])
    rouge2.append(results['rouge2'])
    rougeL.append(results['rougeL'])
    rougeLsum.append(results['rougeLsum'])


print(f'rouge1: {sum(rouge1)/len(rouge1)}\nrouge2: {sum(rouge2)/len(rouge2)}\nrougeL: {sum(rougeL)/len(rougeL)}\nrougeLsum: {sum(rougeLsum)/len(rougeLsum)}')

rouge1: 0.4086952614760332
rouge2: 0.0852688926207635
rougeL: 0.2926001694066451
rougeLsum: 0.297603059631863


## Inference

In [159]:
inp = input("Enter Review text: ")
model.eval()

# Tokenize the input review text using the tokenizer
# Set 'truncation=True' to truncate the input sequence if it exceeds the maximum length
# 'return_tensors="pt"' returns PyTorch tensors
# Move the input tensors to the specified device (e.g., GPU)
inp = tokenizer(inp, truncation=True, return_tensors="pt").to(device)

# Generate a summary for the input review using the model
# 'generate' method generates sequences based on the input tensor
# 'max_length=60' specifies the maximum length of the generated sequence
output = model.generate(inp['input_ids'], max_length=60)
generated_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(f'Generate Summary: \n{generated_output}')



# Import necessary libraries
from rouge import Rouge
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Define functions to calculate ROUGE scores and generate summaries

# Function to calculate ROUGE scores between a generated summary and a reference summary
def calculate_rouge(hypothesis, reference):
    rouge = Rouge()
    scores = rouge.get_scores(hypothesis, reference)
    return scores[0]

# Function to generate a summary for a given review text using the GPT-2 model
def generate_summary(review_text):
    # Tokenize the review text and encode it into input tensors
    inputs = tokenizer.encode("summarize: " + review_text, return_tensors="pt", max_length=1024, truncation=True)
    
    # Generate summary using the GPT-2 model
    summary_ids = model.generate(inputs, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    
    # Decode the generated summary from the output tensor
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Iterate over each row in the reviews DataFrame to generate summaries and calculate ROUGE scores
for index, row in reviews_path.iterrows():
    # Extract review text and reference summary 
    review_text = row["Review Text"]
    reference_summary = row["Summary"]
    generated_summary = generate_summary(review_text)
    
    # Calculate ROUGE scores between the generated summary and reference summary
    rouge_scores = calculate_rouge(generated_summary, reference_summary)
    print("Review Text:", review_text)
    print("Reference Summary:", reference_summary)
    print("Generated Summary:", generated_summary)
    print("ROUGE Scores:", rouge_scores)
    print("="*50)


Enter Review text:  The Fender CD-60S Dreadnought Acoustic Guitar is a great instrument for beginners. It has a solid construction, produces a rich sound, and feels comfortable to play. However, some users have reported issues with the tuning stability.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generate Summary: 
The Fender CD-60S Dreadnought Acoustic Guitar is a great instrument for beginners. It has a solid construction, produces a rich sound, and feels comfortable to play. However, some users have reported issues with the tuning stability.

The Fender CD-60S Dreadn


In [97]:
def topk(probs, n=9):
    # The scores are initially softmaxed to convert to probabilities
    probs = torch.softmax(probs, dim= -1)
    
    # PyTorch has its own topk method, which we use here
    tokensProb, topIx = torch.topk(probs, k=n)
    
    # The new selection pool (9 choices) is normalized
    tokensProb = tokensProb / torch.sum(tokensProb)

    # Send to CPU for numpy handling
    tokensProb = tokensProb.cpu().detach().numpy()

    # Make a random choice from the pool based on the new prob distribution
    choice = np.random.choice(n, 1, p = tokensProb)
    tokenId = topIx[choice][0]

    return int(tokenId)

def model_infer(model, tokenizer, review, max_length=15):
    # Preprocess the init token (task designator)
    review_encoded = tokenizer.encode(review)
    result = review_encoded
    initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)

    with torch.set_grad_enabled(False):
        # Feed the init token to the model
        output = model(initial_input)

        # Flatten the logits at the final time step
        logits = output.logits[0,-1]

        # Make a top-k choice and append to the result
        result.append(topk(logits))

        # For max_length times:
        for _ in range(max_length):
            # Feed the current sequence to the model and make a choice
            input = torch.tensor(result).unsqueeze(0).to(device)
            output = model(input)
            logits = output.logits[0,-1]
            res_id = topk(logits)

            # If the chosen token is EOS, return the result
            if res_id == tokenizer.eos_token_id:
                return tokenizer.decode(result)
            else: # Append to the sequence 
                result.append(res_id)
    # IF no EOS is generated, return after the max_len
    return tokenizer.decode(result)

sample_reviews = [review.split(" TL;DR ")[0] for review in random.sample(reviews, 5)]
sample_reviews

for review in sample_reviews:
    summaries = set()
    print(review)
    while len(summaries) < 3:
        summary = model_infer(model, tokenizer, review + " TL;DR ").split(" TL;DR ")[1].strip()
        if summary not in summaries:
            summaries.add(summary)
    print("Summaries: "+ str(summaries) +"\n")