In [1]:
# ### Install the needed libraries
# Avoiding versions mismatch
!pip install -q accelerate==0.29.3
!pip install -q bitsandbytes==0.43.1
!pip install -q trl==0.8.6
!pip install -q peft==0.10.0
!pip install -q transformers==4.40.0

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [3]:
# Hugging Face Hub identifier of the checkpoint used by the rest of the notebook
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"


# ------------------------------------------------------------------------------
# bitsandbytes settings (consumed when the BitsAndBytesConfig is built)
# ------------------------------------------------------------------------------

# Load the base model in 4-bit precision
use_4bit = True

# 4-bit quantization type
bnb_4bit_quant_type = "nf4"

# Name of the torch dtype used for computation; resolved with getattr(torch, ...)
bnb_4bit_compute_dtype = "float16"

# Nested (double) quantization switch
use_nested_quant = False

In [4]:
################################################################################
# Quantization configuration used when the model is loaded
################################################################################

# Turn the configured dtype name (e.g. "float16") into the torch dtype object
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Placement strategy handed to from_pretrained when the model is loaded
device_map = "auto"

# Collect the bitsandbytes settings into one config object
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,
    load_in_4bit=use_4bit,
)

In [5]:
# Read the Hugging Face access token.
# On Colab: add a secret named 'hf_token' to your Colab Secrets.
# Elsewhere: export the token as the HF_TOKEN environment variable.
try:
    from google.colab import userdata
    hf_token = userdata.get('hf_token')
except ImportError:
    # google.colab is not installed, so we are not running on Colab;
    # hf_token is None when the variable is not set.
    hf_token = os.environ.get("HF_TOKEN")

# Load the model

In [6]:
# Load the DeepSeek model with the bitsandbytes quantization config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    token=hf_token,  # `token` replaces the deprecated `use_auth_token` argument
)
# NOTE(review): these two settings are typical of fine-tuning setups; confirm they
# are still wanted if this notebook is only used for inference.
model.config.use_cache = False
# NOTE(review): pretraining_tp looks like a Llama-specific config field and is
# presumably ignored by this Qwen-based checkpoint - confirm before relying on it.
model.config.pretraining_tp = 1



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [7]:
# Load the tokenizer that belongs to the same checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    token=hf_token,  # `token` replaces the deprecated `use_auth_token` argument
)

# Generation is given tokenizer.pad_token_id, so make sure a pad token exists.
# Only fall back to EOS when the tokenizer does not define one itself.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token



tokenizer_config.json:   0%|          | 0.00/3.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
def get_outputs(model, inputs, max_new_tokens=200):
    """Run deterministic (greedy) generation and return the raw generated sequences.

    Args:
        model: Object exposing a ``generate`` method (the loaded causal LM).
        inputs: Mapping with ``"input_ids"`` and ``"attention_mask"`` entries,
            as produced by the tokenizer.
        max_new_tokens: Upper bound on the number of tokens generated.

    Returns:
        Whatever ``model.generate`` returns (the generated token sequences).

    Note:
        The EOS and pad token ids are read from the notebook-level ``tokenizer``.
    """
    # `early_stopping` is intentionally not passed: it only affects beam search,
    # and this call uses greedy decoding (do_sample=False, single beam).
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=False,
    )
    return outputs

In [9]:
def query_model(system, user_input, max_new_tokens = 200):
    """Send a system + user message pair to the model and return the decoded text.

    Args:
        system: Content of the system message.
        user_input: Content of the user message.
        max_new_tokens: Upper bound on the number of tokens generated.

    Returns:
        A list of decoded strings (one per generated sequence). Each string
        contains the prompt text followed by the model's continuation.

    Note:
        Uses the notebook-level ``tokenizer``, ``model`` and ``get_outputs``.
    """
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user_input},
    ]

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # The rendered chat template already contains the special tokens it needs,
    # so do not let the tokenizer add them a second time (avoids a duplicated BOS).
    # Tensors are moved to the device the model lives on instead of assuming 'cuda'.
    model_input = tokenizer(
        prompt, return_tensors="pt", add_special_tokens=False
    ).to(model.device)

    generated = get_outputs(model, model_input, max_new_tokens=max_new_tokens)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

# Student Grader System

In [10]:
# System prompt: passed as the system message to query_model. It asks the model
# to act as a teacher, grade each student answer from 1 to 10 with feedback,
# and then give a summary of the correct answer.
System = """
You are a teacher who is grading 10th grade students, master in computer science.
In your grading consider the class level of students.
You will receive multiple student's answer to a question, and
your task is to grade the answer on a scale from 1 to 10.
The grading should be based on the student's class level, and include feedback for improvement.
After that, please return the true answer for the question, base on your knowledge (in summary).
"""

# User prompt: passed as the user message to query_model. It holds one question
# followed by three student answers to be graded.
Query = """
Question: What is a Large Language Model (LLM)?

Student 1: A Large Language Model is something that talks to people and answers questions.

Student 2: A Large Language Model is a type of AI that uses text data to understand and generate human language.

Student 3: Large Language Models are AI systems trained on vast amounts of text data to perform a variety of natural language processing tasks, such as text generation, translation, summarization, and question answering.
"""


In [11]:
# Grade the student answers, allowing up to 1000 newly generated tokens
result = query_model(System, user_input=Query, max_new_tokens=1000)





In [12]:
# Show the container type first, then the raw value, each on its own line
print(type(result), result, sep="\n")

<class 'list'>
['\nYou are a teacher who is grading 10th grade students, master in computer science.\nIn your grading consider the class level of students.\nYou will receive multiple student\'s answer to a question, and\nyour task is to grade the answer on a scale from 1 to 10.\nThe grading should be based on the student\'s class level, and include feedback for improvement.\nAfter that, please return the true answer for the question, base on your knowledge (in summary).\n<｜User｜>\nQuestion: What is a Large Language Model (LLM)?\n\nStudent 1: A Large Language Model is something that talks to people and answers questions.\n\nStudent 2: A Large Language Model is a type of AI that uses text data to understand and generate human language.\n\nStudent 3: Large Language Models are AI systems trained on vast amounts of text data to perform a variety of natural language processing tasks, such as text generation, translation, summarization, and question answering.\n<｜Assistant｜><think>\nOkay, so 

In [13]:
# Print every decoded string, separated by newlines
print(*result, sep="\n")


You are a teacher who is grading 10th grade students, master in computer science.
In your grading consider the class level of students.
You will receive multiple student's answer to a question, and
your task is to grade the answer on a scale from 1 to 10.
The grading should be based on the student's class level, and include feedback for improvement.
After that, please return the true answer for the question, base on your knowledge (in summary).
<｜User｜>
Question: What is a Large Language Model (LLM)?

Student 1: A Large Language Model is something that talks to people and answers questions.

Student 2: A Large Language Model is a type of AI that uses text data to understand and generate human language.

Student 3: Large Language Models are AI systems trained on vast amounts of text data to perform a variety of natural language processing tasks, such as text generation, translation, summarization, and question answering.
<｜Assistant｜><think>
Okay, so I need to figure out how to grade t