In [1]:
from unsloth import FastModel
import torch
import torch.nn as nn
from datasets import load_dataset
import re
from trl import GRPOConfig, GRPOTrainer
from transformers import (
    GPT2Model,
    GPT2Tokenizer,
    GPT2PreTrainedModel,
    GPT2Config,
    Trainer,
    TrainingArguments,
    AutoModelForCausalLM,
    TextStreamer,
 AutoTokenizer
)
from typing import Dict, List
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import os
from tqdm import tqdm
from datasets import Dataset as HFDataset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import datetime
import time
from sklearn.preprocessing import StandardScaler
import pickle
from transformers import GPT2LMHeadModel, GPT2Tokenizer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-29 23:19:40 [__init__.py:256] Automatically detected platform cuda.


In [2]:
# Table with the prompts and candidate summaries used throughout the notebook.
df = pd.read_csv('summaries.csv')

# Prefer the GPU whenever CUDA is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
#String processing
def remove_before_char(text, char):
  """Drop everything that precedes the first occurrence of `char` in `text`.

  Args:
    text: The string to trim.
    char: The substring to look for; the result starts at its first match.

  Returns:
    `text` from the first occurrence of `char` onward (the match itself is
    kept), or `text` unchanged when `char` does not occur in it.
  """
  start = text.find(char)
  return text if start == -1 else text[start:]

In [20]:
# Build pairwise-comparison prompts, one pair per row of `df`.
# Every prompt shows the forum post, the human-written summary ('Human'
# column) as "Summary A", and one model summary as "Summary B":
#   base -> Summary B comes from the 'Base' column
#   pess -> Summary B comes from the 'Pessimism' column
base = []
pess = []
ml = 0  # largest number of space-separated pieces in any prompt built here

# The closing instruction is identical for every row, so it is built once.
# Whitespace is written with explicit escapes so the significant trailing
# spaces and indentation cannot be lost to an editor that strips line ends.
post = (
    '\n'
    + '    Which summary is better? Respond with only <A> or <B>.\n'
    + '    _____________________________________________________________________________________________________________'
)

for row_label, row in df.iterrows():
    # Process the prompt: keep only the text after the 'Post' marker.
    prompt = row['Prompt']
    marker_pos = prompt.find('Post')  # kept separate from the row label
    if marker_pos != -1:  # check if substring exists
        # Skip the 4-character marker plus the single character after it
        # (presumably a ':' -- TODO confirm against summaries.csv).
        prompt = prompt[marker_pos + 5:]
        # Drop the final 8 characters (presumably a trailing cue such as a
        # "TL;DR:" line -- TODO confirm). `[:-8]` yields '' for text shorter
        # than 8 characters; a computed `len(prompt) - 8` stop would turn
        # negative there and keep a few leading characters instead.
        prompt = prompt[:-8]

    pre = (
        'Which of the following summaries does a better job of summarizing the most important points in the given forum post, without including unimportant or irrelevant details? Judge based on accuracy, coverage, and coherence.\n'
        + '    Post: ' + prompt + ' \n'
        + '    Summary A: ' + row['Human'] + ' \n'
        + '    Summary B: '
    )

    base.append(pre + row['Base'] + post)
    pess.append(pre + row['Pessimism'] + post)
    ml = max(ml, len(base[-1].split(' ')), len(pess[-1].split(' ')))

In [31]:
# Build one comparison prompt per row of `df` that pits the two model
# summaries against each other: "Summary A" is the 'Base' column and
# "Summary B" is the 'Pessimism' column. Post and summaries are wrapped in
# braces.
text = []
ml = 0  # largest number of space-separated pieces in any prompt built here

# Row-independent pieces are built once. Whitespace is written with explicit
# escapes so the significant trailing spaces and indentation cannot be lost
# to an editor that strips line ends.
prompt_header = (
    'Which of the following summaries does a better job of summarizing the most important points in the given forum post, without including unimportant or irrelevant details? Judge based on accuracy, coverage, and coherence.\n'
    + '    Post: {'
)
prompt_footer = (
    '}\n'
    + '\n'
    + '    \n'
    + '    Which summary is better? Respond with only <A> or <B>.\n'
    + '    -------------------------------------------------------\n'
    + '    '
)

for row_label, row in df.iterrows():
    # Process the prompt: keep only the text after the 'Post' marker.
    prompt = row['Prompt']
    marker_pos = prompt.find('Post')  # kept separate from the row label
    if marker_pos != -1:  # check if substring exists
        # Skip the 4-character marker plus the single character after it
        # (presumably a ':' -- TODO confirm against summaries.csv).
        prompt = prompt[marker_pos + 5:]
        # Drop the final 8 characters (presumably a trailing cue such as a
        # "TL;DR:" line -- TODO confirm). `[:-8]` yields '' for text shorter
        # than 8 characters; a computed `len(prompt) - 8` stop would turn
        # negative there and keep a few leading characters instead.
        prompt = prompt[:-8]

    pre = (
        prompt_header + prompt + '} \n'
        + '    Summary A: {' + row['Base'] + '} \n'
        + '    \n'
        + '    Summary B: {' + row['Pessimism'] + prompt_footer
    )
    text.append(pre)
    ml = max(ml, len(text[-1].split(' ')))

In [5]:
# # Load GPT-2 Large model and tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
# model = GPT2LMHeadModel.from_pretrained('gpt2-large')
# # Add padding token if missing
# tokenizer.pad_token = tokenizer.eos_token

In [6]:
# Token budget: a short generated reply on top of an 800-token allowance.
max_new_tokens = 8
max_seq_length = max_new_tokens + 800

In [7]:
# Judge model: Gemma 3 (1B, instruction-tuned) loaded through Unsloth. It is
# used to decide whether the generated summary is better than the human one.
model, tokenizer = FastModel.from_pretrained(
    model_name="unsloth/gemma-3-1b-it",
    max_seq_length=max_seq_length,  # context budget set in the token-budget cell
    load_in_4bit=False,             # no 4-bit quantization
    load_in_8bit=False,             # no 8-bit quantization
    full_finetuning=False,
    # token = "hf_...", # use one if using gated models
)

# Batched tokenization needs a pad token; reuse EOS when none is configured.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Move the weights to the selected device and switch to evaluation mode.
# Being the last expression, this also displays the module tree.
model.to(device).eval()

==((====))==  Unsloth 2025.4.7: Fast Gemma3 patching. Transformers: 4.51.3. vLLM: 0.8.1.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.999 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 1152, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear(in_features=1152, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1152, out_features=256, bias=False)
          (v_proj): Linear(in_features=1152, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1152, bias=False)
          (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
          (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear(in_features=1152, out_features=6912, bias=False)
          (up_proj): Linear(in_features=1152, out_features=6912, bias=False)
          (down_proj): Linear(in_features=6912, out_features=1152, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma3RMSNorm((11

In [8]:
# Ask the judge model for a verdict on a small sample of prompts: the first
# five `base` prompts followed by the first five `pess` prompts.
text = base[:5] + pess[:5]

# A decoder-only model continues from the last position of every row, so a
# padded batch must carry its padding on the left; with right padding the
# shorter prompts would be continued from pad tokens. The tokenizer may
# already default to this -- setting it explicitly is harmless and makes the
# cell independent of that default.
tokenizer.padding_side = "left"

# NOTE(review): truncation presumably removes tokens from the end of an
# over-long prompt (the tokenizer's default truncation side), which is where
# "Summary B" and the final question live -- confirm no prompt exceeds the
# limit.
inputs = tokenizer(
    text,                 # a list of prompts, processed as one batch
    return_tensors="pt",
    padding=True,         # pad to the longest sequence in the batch
    truncation=True,
    max_length=max_seq_length - max_new_tokens,  # leave room for generated text
).to(device)

with torch.no_grad():  # inference only, no gradients needed
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
        # `min_length` counts the prompt tokens as well, so `min_length=1`
        # never constrained anything; `min_new_tokens` is the setting that
        # guarantees at least one generated token.
        min_new_tokens=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,       # use sampling
        temperature=0.25,     # lower temperature for less randomness
        top_p=.9,             # nucleus sampling
        num_return_sequences=1,
    )

# `outputs` holds prompt + continuation for every row. All rows share the
# same padded prompt length, so slicing it off leaves only the new tokens.
input_ids_length = inputs.input_ids.shape[1]

# Despite the name, these are the judge's decoded replies (expected to be
# "<A>" or "<B>"), not summaries; the name is kept for the cells that read it.
generated_summaries = [
    reply.strip()
    for reply in tokenizer.batch_decode(
        outputs[:, input_ids_length:], skip_special_tokens=True
    )
]


In [9]:
def check_first(text):
    """Report which answer, A or B, a judge reply names first.

    The reply is expected to contain the tags ``<A>`` or ``<B>``. The tags are
    searched first so that a stray capital letter in surrounding words (for
    example the "B" in "Based on ...") cannot be mistaken for the verdict.
    When the reply contains neither tag, the bare letters "A" and "B" are
    searched instead, so replies without angle brackets are still parsed.

    Args:
        text: The decoded reply string.

    Returns:
        1 if A is named first, 2 if B is named first, and -1 if the reply
        contains neither.
    """
    def _first_of(option_a, option_b):
        # Compare the first positions of the two markers; -1 means "absent".
        pos_a = text.find(option_a)
        pos_b = text.find(option_b)
        if pos_a == -1 and pos_b == -1:
            return -1
        if pos_a == -1:
            return 2
        if pos_b == -1:
            return 1
        return 1 if pos_a < pos_b else 2

    tagged = _first_of("<A>", "<B>")
    if tagged != -1:
        return tagged
    # No tag present: fall back to the bare letters.
    return _first_of("A", "B")

In [10]:
# Print the parsed verdict (1, 2 or -1) for each judge reply, one per line.
for verdict in map(check_first, generated_summaries):
    print(verdict)

1
1
1
1
1
1
1
1
1
1


In [11]:
# Show the raw decoded replies so the parsed verdicts can be checked by eye.
print(generated_summaries)

['<A> My husband is lazy', '<A> <B>', '<A>\n<B>', '<A>\n<B>', '<A> <B>', '<A>\n<B>', '<A>\n<B>', '<A> is better.', '<A>\n    <', '<A>\n<B>']


In [21]:
# First 100 `base` prompts followed by the first 100 `pess` prompts.
# NOTE(review): `text` is assigned by several cells in this notebook, so the
# value later cells see depends on which of them ran last.
text = base[:100] + pess[:100]

In [33]:
# Dump the first 100 entries of `text` to a file, each followed by a newline.
# The encoding is pinned to UTF-8 so the write does not depend on the
# platform's default locale encoding, which may be unable to represent some
# characters in the prompts.
# NOTE(review): the output path is hardcoded and machine-specific.
with open('../whome/Downloads/summ.txt', 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % line for line in text[:100])

In [34]:
# Spot-check a single formatted prompt (the entry at index 20).
print(text[20])

Which of the following summaries does a better job of summarizing the most important points in the given forum post, without including unimportant or irrelevant details? Judge based on accuracy, coverage, and coherence.
    Post: {
This may be obvious to some, but it has helped me get a much better picture of my finances. For years I struggled with paying bills, sometimes because of low income but just as often because I was disorganized. After making simple changes to my bank account structures I don't miss bills, have lowered my spending and can understand my expenses better.

The trick is pretty simple. Open a checking account for all your reoccurring expenses. Then go about setting them all on auto-pay. Most of the time you can connect the bank numbers or use a debit card to automate payment. If you need to pay an individual or have to pay by check most banks have a bill pay feature that will send scheduled payments by check.

At first you'll need to overpay into this account becau