<a href="https://colab.research.google.com/github/utkarshgupta04092003/notebooks/blob/main/create-neural-network/Pretraining_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Install dependencies and fix seed


In [17]:
!pip install \
  accelerate==0.26.1 \
  datasets==2.16.1 \
  fasttext==0.9.2 \
  jupyter==1.0.0 \
  pandas==2.2.0 \
  pyarrow==15.0.0 \
  sentencepiece==0.1.99 \
  torch==2.1.2 \
  torchaudio==2.1.2 \
  torchvision==0.16.2 \
  tqdm==4.66.1 \
  transformers==4.37.2


Collecting accelerate==0.26.1
  Downloading accelerate-0.26.1-py3-none-any.whl.metadata (18 kB)
Collecting datasets==2.16.1
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting fasttext==0.9.2
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jupyter==1.0.0
  Downloading jupyter-1.0.0-py2.py3-none-any.whl.metadata (995 bytes)
Collecting pandas==2.2.0
  Downloading pandas-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting pyarrow==15.0.0
  Downloading pyarrow-15.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting sentencepiece==0.1.99
  Downloading sentencepiece-0.1.99.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metad

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import torch

In [4]:
def fix_torch_seed(seed=42):
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.backends.cudnn.deterministic=True
  torch.backends.cudnn.benchmark=False

fix_torch_seed()

## 2. Load a general pretrained model

In [5]:
model_path_or_name = "upstage/TinySolar-248m-4k"

In [None]:
from transformers import AutoModelForCausalLM
tiny_general_model = AutoModelForCausalLM.from_pretrained(
    model_path_or_name,
    device_map="cpu", # change to auto if you have access to a GPU
    torch_dtype=torch.bfloat16
)

In [None]:
from transformers import AutoTokenizer
tiny_general_tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)

## 3. Generate Text Sample

In [16]:
prompt = "I am an engineer. I love"

In [17]:
inputs = tiny_general_tokenizer(prompt, return_tensors="pt")

In [18]:
from transformers import TextStreamer
streamer = TextStreamer(
    tiny_general_tokenizer,
    skip_prompt=True, # If you set to false, the model will first return the prompt and then the generated text
    skip_special_tokens=True
)

In [19]:
outputs = tiny_general_model.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=128,
    do_sample=False,
    temperature=0.0,
    repetition_penalty=1.1
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


to travel and have a great time, but I'm not sure if I can do it all again.
I've been working on my first book for the last 10 years. It's called "The Secret Life of Pets" and it is about a man named John who has just finished his second year at college. He is a very good student and he wants to be a writer. He also wants to write a novel. So, I decided to start writing this book.
I started with a story in the middle of the night and then I wrote it down. I was so excited that I had


## 4. Generate Python samples with pretrained general model

In [20]:
prompt =  "def find_max(numbers):"

In [21]:
inputs = tiny_general_tokenizer(prompt, return_tensors="pt")

In [22]:
from transformers import TextStreamer
streamer = TextStreamer(
    tiny_general_tokenizer,
    skip_prompt=True, # If you set to false, the model will first return the prompt and then the generated text
    skip_special_tokens=True
)

In [23]:
outputs = tiny_general_model.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=128,
    do_sample=False,
    temperature=0.0,
    repetition_penalty=1.1
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



       """
       Returns the number of times a user has been added to the list.
       """
       return num_users() + 1

   def get_user_id(self, id):
       """
       Returns the number of users that have been added to the list.
       """
       return len(self.get_users())

   def get_user_name(self, name):
       """
       Returns the name of the user that has been added to the list.
       """
       return self.get_user_name(name)



## 5. Generate Python samples with finetuned Python model

In [33]:
model_path_or_name = "upstage/TinySolar-248m-4k-code-instruct"

In [39]:
tiny_finetuned_model = AutoModelForCausalLM.from_pretrained(
    model_path_or_name,
    device_map="cpu",
    torch_dtype=torch.bfloat16,
)

tiny_finetuned_tokenizer = AutoTokenizer.from_pretrained(
    model_path_or_name
)

In [40]:
prompt =  "def find_max(numbers):"

In [41]:
from transformers import TextStreamer
streamer = TextStreamer(
    tiny_general_tokenizer,
    skip_prompt=True, # If you set to false, the model will first return the prompt and then the generated text
    skip_special_tokens=True
)

In [42]:
outputs = tiny_general_model.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=128,
    do_sample=False,
    temperature=0.0,
    repetition_penalty=1.1
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



       """
       Returns the number of times a user has been added to the list.
       """
       return num_users() + 1

   def get_user_id(self, id):
       """
       Returns the number of users that have been added to the list.
       """
       return len(self.get_users())

   def get_user_name(self, name):
       """
       Returns the name of the user that has been added to the list.
       """
       return self.get_user_name(name)



## 6. Generate Python samples with pretrained Python model


In [43]:
model_path_or_name = "upstage/TinySolar-248m-4k-py"

In [45]:
tiny_custom_model=AutoModelForCausalLM.from_pretrained(
    model_path_or_name,
    device_map='cpu',
    torch_dtype=torch.bfloat16,
)

tiny_custom_tokenizer=AutoTokenizer.from_pretrained(
    model_path_or_name
)

In [46]:
prompt = "def find_max(numbers):"


In [47]:
inputs = tiny_custom_tokenizer(
    prompt, return_tensors="pt"
).to(tiny_custom_model.device)

In [49]:
streamer=TextStreamer(
    tiny_custom_tokenizer,
    skip_prompt=True,
    skip_special_tokens=True
)

In [54]:
outputs=tiny_custom_model.generate(
    **inputs, streamer=streamer,
    use_cache=True,
    max_new_tokens=128,
    do_sample=False,
    repetition_penalty=1.1
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



   """Find the maximum number of numbers in a list."""
   max = 0
   for num in numbers:
       if num > max:
           max = num
   return max


def get_min_max(numbers, min_value=1):
   """Get the minimum value of a list."""
   min_value = min_value or 1
   for num in numbers:
       if num < min_value:
           min_value = num
   return min_value



## Test Generated Code

In [52]:
def get_min_max(numbers, min_value=1):
   """Get the minimum value of a list."""
   min_value = min_value or 1
   for num in numbers:
       if num < min_value:
           min_value = num
   return min_value

In [55]:
get_min_max([1,5,4,8,6,2,4,7])

1

In [7]:
import json
import nbformat
from google.colab import _message

# Get notebook JSON from Colab
nb_json = _message.blocking_request('get_ipynb')['ipynb']

# Convert to nbformat object
nb = nbformat.from_dict(nb_json)

# Remove widget metadata if present
if "widgets" in nb.metadata:
    nb.metadata.pop("widgets")

# Push the cleaned notebook back into Colab memory
cleaned = nbformat.writes(nb)
_message.blocking_request('set_ipynb', {'ipynb': json.loads(cleaned)})

print("✅ Cleaned! Now use File → Save a copy to GitHub again.")


✅ Cleaned! Now use File → Save a copy to GitHub again.
