In [1]:
import os
from google.colab import userdata

os.environ["HUGGINGFACE_TOKEN"] = userdata.get("HUGGINGFACE_TOKEN")

In [2]:
!pip install -U peft trl datasets accelerate transformers bitsandbytes

Collecting peft
  Downloading peft-0.9.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.7.11-py3-none-any.whl (155 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.3/155.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m


In [3]:
import pandas as pd
import os
import torch

# Fine Tuning Frameworks
from peft import LoraConfig
from trl import SFTTrainer

from datasets import Dataset, DatasetDict

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)

In [4]:
# This function helps format the dataset, so that it can be used for fine-tuning.
# We will use alpaca style formatting for this project
def create_instruction(row):
    instruction_template = """Explain detail this python code.
### Instruction: {}
### Response: {}""".format(
        row["output"], row["input"]
    )
    return instruction_template

In [5]:
from datasets import load_dataset

dataset = load_dataset("flytech/python-codes-25k", split="train")

# One can map the dataset in any way, for the sake of example:
df = pd.DataFrame(dataset)
df.head()

Downloading readme:   0%|          | 0.00/3.10k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.4M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Unnamed: 0,output,instruction,text,input
0,```python\ntasks = []\nwhile True:\n task =...,Help me set up my daily to-do list!,Help me set up my daily to-do list! Setting up...,Setting up your daily to-do list...
1,```python\nshopping_list = {}\nwhile True:\n ...,Create a shopping list based on my inputs!,Create a shopping list based on my inputs! Cre...,Creating a shopping list...
2,"```python\ntotal_time = 0\nfor i in range(1, 8...",Calculate how much time I spend on my phone pe...,Calculate how much time I spend on my phone pe...,Calculating weekly phone usage...
3,```python\ntotal_bill = float(input('Enter the...,Help me split the bill among my friends!,Help me split the bill among my friends! Split...,Splitting the bill...
4,```python\nmovie_list = {}\nwhile True:\n g...,Organize my movie list into genres!,Organize my movie list into genres! Organizing...,Organizing your movie list...


In [6]:
# To convert to alpaca we need to add text column
df["text"] = df.apply(create_instruction, axis=1)
df

Unnamed: 0,output,instruction,text,input
0,```python\ntasks = []\nwhile True:\n task =...,Help me set up my daily to-do list!,Explain detail this python code.\n### Instruct...,Setting up your daily to-do list...
1,```python\nshopping_list = {}\nwhile True:\n ...,Create a shopping list based on my inputs!,Explain detail this python code.\n### Instruct...,Creating a shopping list...
2,"```python\ntotal_time = 0\nfor i in range(1, 8...",Calculate how much time I spend on my phone pe...,Explain detail this python code.\n### Instruct...,Calculating weekly phone usage...
3,```python\ntotal_bill = float(input('Enter the...,Help me split the bill among my friends!,Explain detail this python code.\n### Instruct...,Splitting the bill...
4,```python\nmovie_list = {}\nwhile True:\n g...,Organize my movie list into genres!,Explain detail this python code.\n### Instruct...,Organizing your movie list...
...,...,...,...,...
49621,c.execute('''CREATE TABLE IF NOT EXISTS stocks...,Execute code: c.execute('''CREATE TABLE IF NOT...,Explain detail this python code.\n### Instruct...,Creating a 'stocks' table...
49622,"c.execute(""INSERT INTO stocks VALUES ('2023-09...","Execute code: c.execute(""INSERT INTO stocks VA...",Explain detail this python code.\n### Instruct...,Inserting a record into the 'stocks' table...
49623,c.execute('SELECT * FROM stocks WHERE symbol=?...,Execute code: c.execute('SELECT * FROM stocks ...,Explain detail this python code.\n### Instruct...,Querying the 'stocks' table for records with s...
49624,c.execute('UPDATE stocks SET qty=120 WHERE sym...,Execute code: c.execute('UPDATE stocks SET qty...,Explain detail this python code.\n### Instruct...,Updating the 'stocks' table to change quantity...


In [7]:
df["text"].iloc[[3]].values[0]

"Explain detail this python code.\n### Instruction: ```python\ntotal_bill = float(input('Enter the total bill amount: '))\nfriends_count = int(input('Enter the number of friends: '))\nper_person = total_bill / friends_count\nprint(f'Each person should pay {per_person}')\n```\n### Response: Splitting the bill..."

In [28]:
# Convert dataset to hugging face format
dataset = Dataset.from_pandas(df[["output", "input", "text"]][:1000])

In [9]:
# Model Defining
base_model = "google/gemma-2b"
new_model = "google/gemma-2b-python_code_100"

In [10]:
# Using Lora to tune only limited parameters keeping resources in mind
lora_config = LoraConfig(
    r=8,
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    task_type="CAUSAL_LM",
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model, token=os.environ["HUGGINGFACE_TOKEN"]
)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map=device,
    token=os.environ["HUGGINGFACE_TOKEN"],
)

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [14]:
def generate(model, tokenizer, text, device, token_size=1024):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=token_size)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [15]:
# Sample Conversation 1
# Complexity : Mid Level
text = "python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n"

print(generate(model, tokenizer, text, device))

python
tasks = []
while True:
    task = input('Enter a task or type 'done' to finish: ')
    if task == 'done': break
    tasks.append(task)
print(f'Your to-do list for today: {tasks}')



In [16]:
# Sample Conversation 2
# Complexity : Low Level
text = "with torch . no_grad (): a -= learning_rate * a . grad b -= learning_rate * b . grad c -= learning_rate * c . grad d -= learning_rate * d ."
print(generate(model, tokenizer, text, device))

with torch . no_grad (): a -= learning_rate * a . grad b -= learning_rate * b . grad c -= learning_rate * c . grad d -= learning_rate * d . grad

<code>def __init__(self, a, b, c, d):
    self.a = a
    self.b = b
    self.c = c
    self.d = d

    self.a_grad = torch . Tensor (a . grad)
    self.b_grad = torch . Tensor (b . grad)
    self.c_grad = torch . Tensor (c . grad)
    self.d_grad = torch . Tensor (d . grad)

    self.a_grad = torch . Tensor (a . grad)
    self.b_grad = torch . Tensor (b . grad)
    self.c_grad = torch . Tensor (c . grad)
    self.d_grad = torch . Tensor (d . grad)

    self.a_grad = torch . Tensor (a . grad)
    self.b_grad = torch . Tensor (b . grad)
    self.c_grad = torch . Tensor (c . grad)
    self.d_grad = torch . Tensor (d . grad)

    self.a_grad = torch . Tensor (a . grad)
    self.b_grad = torch . Tensor (b . grad)
    self.c_grad = torch . Tensor (c . grad)
    self.d_grad = torch . Tensor (d . grad)

    self.a_grad = torch . Tensor (a . grad)
   

In [17]:
# Sample Conversation 3
# Comlexity : Mid-high Level
text = "model = torch . nn . Sequential ( torch . nn . Linear ( 3 , 1 ), torch . nn . Flatten ( 0 , 1 ) ) loss_fn = torch . nn . MSELoss ( reduction = 'sum' ) what is Topic of this code?"
print(generate(model, tokenizer, text, device))

model = torch . nn . Sequential ( torch . nn . Linear ( 3 , 1 ), torch . nn . Flatten ( 0 , 1 ) ) loss_fn = torch . nn . MSELoss ( reduction = 'sum' ) what is Topic of this code?

Answer:

The code is a model that takes in a
3-dimensional input and outputs a 1-dimensional output.


In [29]:
# Defining all Training hyperparameters, Epoch = 2, optimizer = 8 bit to save memory
# If given more time, can be trained on 5-6 epochs for better results
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    optim="adamw_bnb_8bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)

In [30]:
# Trainer Configuration
trainer = SFTTrainer(
    model=base_model,
    train_dataset=dataset,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



In [31]:
# Used to clear cache memory for a fresh start
torch.cuda.empty_cache()

In [32]:
# Train
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
25,1.2125
50,0.9414
75,0.851
100,0.9253
125,0.7235
150,0.8704
175,0.7641
200,0.7537
225,0.6722
250,0.7866



Cannot access gated repo for url https://huggingface.co/google/gemma-2b/resolve/main/config.json.
Repo model google/gemma-2b is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in google/gemma-2b.

Cannot access gated repo for url https://huggingface.co/google/gemma-2b/resolve/main/config.json.
Repo model google/gemma-2b is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in google/gemma-2b.

Cannot access gated repo for url https://huggingface.co/google/gemma-2b/resolve/main/config.json.
Repo model google/gemma-2b is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in google/gemma-2b.

Cannot access gated repo for url https://huggingface.co/google/gemma-2b/resolve/main/config.json.
Repo model google/gemma-2b is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in google/ge

TrainOutput(global_step=1000, training_loss=0.7394724607467651, metrics={'train_runtime': 640.6144, 'train_samples_per_second': 1.561, 'train_steps_per_second': 1.561, 'total_flos': 1028847427276800.0, 'train_loss': 0.7394724607467651, 'epoch': 1.0})

In [33]:
# Saving the model
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)


Cannot access gated repo for url https://huggingface.co/google/gemma-2b/resolve/main/config.json.
Repo model google/gemma-2b is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in google/gemma-2b.


('google/gemma-2b-python_code_100/tokenizer_config.json',
 'google/gemma-2b-python_code_100/special_tokens_map.json',
 'google/gemma-2b-python_code_100/tokenizer.model',
 'google/gemma-2b-python_code_100/added_tokens.json',
 'google/gemma-2b-python_code_100/tokenizer.json')

In [43]:
import os

path = new_model
for file in os.listdir(path):
    file_size = os.path.getsize(os.path.join(path, file))
    print(f"{file}: {file_size} bytes")

    # 원하는 단위로 변환
    # KB 단위
    file_size_kb = file_size / 1024
    print(f"{file}: {file_size_kb} KB")

    # MB 단위
    file_size_mb = file_size_kb / 1024
    print(f"{file}: {file_size_mb} MB")

tokenizer.model: 4241003 bytes
tokenizer.model: 4141.6044921875 KB
tokenizer.model: 4.0445356369018555 MB
adapter_model.safetensors: 39256456 bytes
adapter_model.safetensors: 38336.3828125 KB
adapter_model.safetensors: 37.43787384033203 MB
tokenizer.json: 17477652 bytes
tokenizer.json: 17068.01953125 KB
tokenizer.json: 16.667987823486328 MB
README.md: 5091 bytes
README.md: 4.9716796875 KB
README.md: 0.004855155944824219 MB
adapter_config.json: 686 bytes
adapter_config.json: 0.669921875 KB
adapter_config.json: 0.0006542205810546875 MB
tokenizer_config.json: 1108 bytes
tokenizer_config.json: 1.08203125 KB
tokenizer_config.json: 0.001056671142578125 MB
special_tokens_map.json: 555 bytes
special_tokens_map.json: 0.5419921875 KB
special_tokens_map.json: 0.0005292892456054688 MB


In [44]:
torch.cuda.empty_cache()

In [45]:
tokenizer = AutoTokenizer.from_pretrained(new_model)
model = AutoModelForCausalLM.from_pretrained(
    new_model,
    quantization_config=bnb_config,
    device_map=device,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [46]:
# Sample Conversation 1
# Complexity : Mid Level
text = "python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n"

print(generate(model, tokenizer, text, device))

python
tasks = []
while True:
    task = input('Enter a task or type 'done' to finish: ')
    if task == 'done': break
    tasks.append(task)
print(f'Your to-do list for today: {tasks}')



In [47]:
# Sample Conversation 2
# Complexity : Low Level
text = "with torch . no_grad (): a -= learning_rate * a . grad b -= learning_rate * b . grad c -= learning_rate * c . grad d -= learning_rate * d ."
print(generate(model, tokenizer, text, device))

with torch . no_grad (): a -= learning_rate * a . grad b -= learning_rate * b . grad c -= learning_rate * c . grad d -= learning_rate * d . grad
with torch . no_grad (): a -= learning_rate * a . grad b -= learning_rate * b . grad c -= learning_rate * c . grad d -= learning_rate * d . grad
with torch . no_grad (): a -= learning_rate * a . grad b -= learning_rate * b . grad c -= learning_rate * c . grad d -= learning_rate * d . grad
with torch . no_grad (): a -= learning_rate * a . grad b -= learning_rate * b . grad c -= learning_rate * c . grad d -= learning_rate * d . grad
with torch . no_grad (): a -= learning_rate * a . grad b -= learning_rate * b . grad c -= learning_rate * c . grad d -= learning_rate * d . grad
with torch . no_grad (): a -= learning_rate * a . grad b -= learning_rate * b . grad c -= learning_rate * c . grad d -= learning_rate * d . grad
with torch . no_grad (): a -= learning_rate * a . grad b -= learning_rate * b . grad c -= learning_rate * c . grad d -= learning_r

In [None]:
# Sample Conversation 3
# Comlexity : Mid-high Level
text = "model = torch . nn . Sequential ( torch . nn . Linear ( 3 , 1 ), torch . nn . Flatten ( 0 , 1 ) ) loss_fn = torch . nn . MSELoss ( reduction = 'sum' ) what is Topic of this code?"
print(generate(model, tokenizer, text, device))