In [49]:
import ast
import gc
import os
import re

# unsloth stays first among the third-party imports, as in the original cell
# (presumably so its patches are applied before torch/transformers load).
from unsloth import FastLanguageModel
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import ollama
from datasets import load_dataset, Dataset

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Sequence-length budget. Per the upstream unsloth note any value may be chosen,
# because RoPE scaling is handled internally.
MAX_SEQ_LEN = 4540

# None = auto-detect. Float16 for Tesla T4 / V100, Bfloat16 for Ampere and newer.
DTYPE = None

# Passed as load_in_4bit when the models are loaded.
LOAD_IN_4BIT = True

For each model we will generate two stories from the same input data: one from the fine-tuned model and one from the base model. We then ask DeepSeek-R1 14B which story it thinks is better, and use its verdicts to verify the success of the project. DeepSeek-R1 produces a very long chain of thought (CoT) before answering, so we can also see the motivation behind the choices it makes.

### Finding model paths

In [50]:
# Absolute paths of every "saved_model*" directory found in the current
# working directory, in the order os.listdir() reports them.
base_dir = os.getcwd()
models_parths = [
    os.path.join(base_dir, entry)
    for entry in os.listdir(base_dir)
    if entry.startswith("saved_model") and os.path.isdir(os.path.join(base_dir, entry))
]
models_parths

['/home/patryk/Documents/tests/saved_model_summary',
 '/home/patryk/Documents/tests/saved_model_keywords_counter_top5',
 '/home/patryk/Documents/tests/saved_model_keywords_td_idf_only_nouns_top3',
 '/home/patryk/Documents/tests/saved_model_keywords_td_idf_top5',
 '/home/patryk/Documents/tests/saved_model_keywords_td_idf_top3',
 '/home/patryk/Documents/tests/saved_model_keywords_counter_top3',
 '/home/patryk/Documents/tests/saved_model_keywords_td_idf_only_nouns_top5']

In [51]:
# List the working directory (the same directory the model paths above were
# collected from) instead of a machine-specific absolute path.
os.listdir(os.getcwd())

['output_keywords_td_idf_top3',
 'output_keywords_counter_top5',
 'training_notebook.ipynb',
 'output_keywords_td_idf_only_nouns_top5',
 'saved_model_summary',
 'training_notebook copy.ipynb',
 'output_keywords_td_idf_top5',
 'abx.ipynb',
 'output_keywords_counter_top3',
 'output_summary',
 'output_keywords_td_idf_only_nouns_top3',
 'saved_model_keywords_counter_top5',
 'saved_model_keywords_td_idf_only_nouns_top3',
 'wandb',
 'saved_model_keywords_td_idf_top5',
 'saved_model_keywords_td_idf_top3',
 'saved_model_keywords_counter_top3',
 'saved_model_keywords_td_idf_only_nouns_top5',
 'requirements',
 'stories_summaries_keywords.csv',
 '_unsloth_temporary_saved_buffers']

### Getting test set

In [52]:
# CSV columns that can condition generation; the saved_model_* directory names
# end with one of these column names.
INPUT_COLUMNS = ['summary', 'keywords_counter_top5', 'keywords_counter_top3', 'keywords_td_idf_top5', 'keywords_td_idf_top3', 'keywords_td_idf_only_nouns_top5', 'keywords_td_idf_only_nouns_top3']

# Load the stories and expose the raw "text" column under the name "story".
data_df = pd.read_csv("stories_summaries_keywords.csv").rename(columns={"text": "story"})

# Flatten newlines so every story is a single line (literal replacement, no regex).
data_df['story'] = data_df['story'].str.replace('\n', ' ', regex=False)

# Character length of each story; .str.len() yields NaN for a missing story
# where apply(len) would raise a TypeError.
data_df['story_length'] = data_df['story'].str.len()

# Length statistics (originally used to decide between packing=True and a data
# collator). Kept as the last expression so the notebook actually displays it.
data_df['story_length'].describe()

In [53]:
# Wrap the frame in a Dataset and keep the 20% hold-out split produced with a
# fixed seed as the evaluation set.
dataset = Dataset.from_pandas(data_df)
split = dataset.train_test_split(test_size=0.2, seed=42)
test = split['test']

In [54]:
# Peek at the first held-out example to see which fields are available.
test[0]

{'story': 'One day, a boy named Peter went to the park. He ran around, playing with a ball and having fun. Suddenly, he saw a big sausage on the ground!  He went to pick it up, but instead he saw a friendly dog. It was a small, brown puppy called Lucy. She barked happily as she saw Peter and wagged her tail.  Peter was very surprised. He remembered his mum saying that he was not allowed to play with dogs. But Lucy seemed very nice, so Peter wanted to stay and play.  He held out his hand and Lucy came over. She felt very soft. Peter smiled and said hello. He even scratched Lucy behind the ears. Lucy was so pleased that she rolled over so Peter could wave her tummy.  Lucy was so smart. She even did a few tricks for Peter. He laughed and gave her the sausage he found. She gobbled it up and wagged her tail harder.   Peter and Lucy were soon best friends. They would go to the park every day to play and wave together.',
 'summary': 'A young boy named Peter befriends a friendly, intelligent, 

### ABX code

In [56]:
def load_model_and_tokenizer(model_path):
    """Load a model and its tokenizer with unsloth and prepare the model for inference.

    Args:
        model_path: Identifier handed to ``FastLanguageModel.from_pretrained`` as
            ``model_name`` (a local directory or a hub name, as used in this notebook).

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    load_kwargs = dict(
        model_name = model_path, # the model that was used for training
        # NOTE(review): the extra 200 is presumably headroom for the prompt on top
        # of MAX_SEQ_LEN generated tokens - confirm.
        max_seq_length = MAX_SEQ_LEN + 200,
        dtype = DTYPE,
        load_in_4bit = LOAD_IN_4BIT,
    )
    model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    return model, tokenizer

In [57]:
# Load the base TinyLlama checkpoint once; it serves as the baseline in every comparison.
tinyllama_model, tinyllama_tokenizer = load_model_and_tokenizer("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T")

==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.49.0.dev0.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 11.66 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [58]:
def generate_text(model, tokenizer, prompt, max_new_tokens = MAX_SEQ_LEN):
    """Generate a continuation for a single prompt and return it as text.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used both to encode the prompt and to decode the output.
        prompt: Prompt string; it is sent as a batch of one.
        max_new_tokens: Upper bound on generated tokens (defaults to MAX_SEQ_LEN).

    Returns:
        The decoded first sequence with special tokens removed.
        NOTE(review): presumably still contains the prompt text, since callers
        cut it off at the "Story:" marker - confirm.
    """
    encoded = tokenizer([prompt], return_tensors = "pt").to(device)
    generated_ids = model.generate(**encoded, max_new_tokens = max_new_tokens)
    texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return texts[0]

In [59]:
def get_story(outut):
    """Extract the story text that follows the first "Story:" marker.

    Args:
        outut: Full decoded model output.

    Returns:
        The stripped text between the first "Story:" marker and the next one
        (or the end of the text). If the marker is absent, the whole stripped
        text is returned instead of raising an IndexError.
    """
    parts = outut.split("Story:")
    if len(parts) < 2:
        # No marker at all: treat everything the model produced as the story.
        return outut.strip()
    # Cutting at a second "Story:" drops any repeated section the model appends.
    return parts[1].strip()

In [60]:
def get_column(model_path):
    """Return the first entry of INPUT_COLUMNS that ``model_path`` ends with.

    Args:
        model_path: Path of a saved model directory.

    Returns:
        The matching column name, or None when no column name is a suffix of the path.
    """
    matches = (name for name in INPUT_COLUMNS if model_path.endswith(name))
    return next(matches, None)

In [55]:
# Number of test-set examples compared per fine-tuned model.
TEST_NUM = 10

In [82]:
import gc

In [83]:
def _condition_text(example, col_name):
    """Return the conditioning text (summary, or comma-joined keywords) for one example."""
    value = example[col_name]
    if col_name == "summary":
        return value
    # Keyword columns are read from the CSV as the string form of a Python list;
    # literal_eval parses that without executing arbitrary code like eval() would.
    return ", ".join(ast.literal_eval(value))


def _parse_choice(response):
    """Return the judge's verdict (1 or 2), or None when it is missing or ambiguous."""
    # Only the text after the reasoning block counts. rpartition yields the whole
    # response when no "</think>" tag is present instead of raising.
    answer = response.rpartition("</think>")[2]
    # Standalone digits only: a substring test would treat an answer mentioning
    # both stories as a vote for story 1.
    picks = set(re.findall(r"\b[12]\b", answer))
    return int(picks.pop()) if len(picks) == 1 else None


scores = {model: [0, 0] for model in INPUT_COLUMNS} # [saved_model, tinyllama]
for model_path in models_parths:

    # Resolve the conditioning column first so an unexpected directory fails
    # fast, before the model is loaded.
    col_name = get_column(model_path)
    if col_name is None:
        raise ValueError(f"No input column matches model directory {model_path!r}")

    saved_model, saved_tokenizer = load_model_and_tokenizer(model_path)
    print(f"Model {model_path} loaded.")

    # Loop-invariant prompt pieces. They are decided from the matched column, not
    # from a substring of the full path, and kept out of the f-string expressions
    # (backslashes there are a SyntaxError before Python 3.12).
    is_summary = col_name == "summary"
    INSTRUCTION = "Summary:\n" if is_summary else "Keywords:\n"
    tinyllama_task = "this summary:\n" if is_summary else "these keywords:\n"

    for i in range(TEST_NUM):
        condition = _condition_text(test[i], col_name)

        # NOTE(review): unlike the TinyLlama prompt there is no newline before
        # "Story:" here - confirm this matches the fine-tuning prompt format.
        saved_model_story_promt = f"{INSTRUCTION}{condition}Story:\n"
        tinyllama_prompt = f"Write a short story based on {tinyllama_task}{condition}\nStory:\n"

        saved_model_story = get_story(generate_text(saved_model, saved_tokenizer, saved_model_story_promt))
        # The base model tends to loop, hence the tighter token cap.
        tinyllama_story = get_story(generate_text(tinyllama_model, tinyllama_tokenizer, tinyllama_prompt, max_new_tokens=600))

        # Original author's note: swapping the numbering was considered and
        # rejected because the base model is far weaker. The fine-tuned story is
        # therefore always shown as story 1.
        deepseekr1_prompt = f"You will get two stories 1 and 2. Your task is to choose the story that you think is better. Write the number of your chosen story as your answer. Your asnwer must be 1 or 2 and nothing more. Write just a single number.\n\n1. {saved_model_story}\n\n2. {tinyllama_story}\n\nYour answer:\n"
        res = ollama.generate(keep_alive=5, model='deepseek-r1:14b', prompt=deepseekr1_prompt)['response']

        choice = _parse_choice(res)
        if choice is not None:
            # choice 1 -> index 0 (fine-tuned model), choice 2 -> index 1 (TinyLlama)
            scores[col_name][choice - 1] += 1

    # Drop the references and release GPU memory before the next model is loaded.
    saved_model = None
    saved_tokenizer = None
    gc.collect()
    torch.cuda.empty_cache()


==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.49.0.dev0.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 11.66 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model /home/patryk/Documents/tests/saved_model_summary loaded.
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.49.0.dev0.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 11.66 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red co

In [84]:
# Per-column tallies: [fine-tuned model chosen, base TinyLlama chosen].
scores

{'summary': [10, 0],
 'keywords_counter_top5': [9, 0],
 'keywords_counter_top3': [9, 0],
 'keywords_td_idf_top5': [10, 0],
 'keywords_td_idf_top3': [9, 0],
 'keywords_td_idf_only_nouns_top5': [10, 0],
 'keywords_td_idf_only_nouns_top3': [10, 0]}

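We see that DeepSeek-R1 never chose the baseline: for every model, each comparison with a recorded verdict went to the fine-tuned model. A tally of 9 out of 10 means that one comparison produced no recorded 1/2 verdict.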

Here is an example of DeepSeek-R1's reasoning for the last pair of stories.

In [85]:
# `res` still holds the judge's raw response for the last pair evaluated in the loop.
res

'<think>\nOkay, so I\'m trying to figure out which story is better between the two given. Let me read them carefully and analyze each one step by step.\n\nStarting with Story 1: It\'s about a little girl named Lily who loves playing with her dolls and has a favorite red car. One day, her mom gives her a new finger with a shiny star, and Lily plays with it all day. Then her friend comes over and wants to play with the finger too, which Lily allows. However, when her mom sees this, she scolds Lily, saying the finger isn\'t for playing and that it\'s meant for her dolls. This makes Lily sad, and she argues with her mom about wanting to keep playing with the finger instead of her dolls. In the end, the mom explains that while Lily loves the finger, it\'s not appropriate for playtime, so Lily learns an important lesson about respecting others\' belongings and being kind.\n\nNow looking at Story 2: The title mentions a girl who is a friend of a boy, but when I read through it, all the senten