<a href="https://colab.research.google.com/github/ymoslem/Adaptive-MT-LLM/blob/main/other/davinci-finetuning-cost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cost estimation of fine-tuning the Davinci GPT-3 model

See more [MT tutorials and notebooks](https://github.com/ymoslem). 

# Download sample files

In [None]:
# Download sample files (TICO-19 English-Spanish pair, one file per language)
# -q silences wget's log; --show-progress keeps only the progress bar.

# Source: English side (.en)
!wget -q --show-progress  https://raw.githubusercontent.com/ymoslem/Adaptive-MT-LLM/main/data/tico-19/tico-19-enes-dedup.en
# Target: Spanish side (.es)
!wget -q --show-progress https://raw.githubusercontent.com/ymoslem/Adaptive-MT-LLM/main/data/tico-19/tico-19-enes-dedup.es

In [None]:
# List the working directory to confirm that both sample files were downloaded
!ls

# tiktoken installation

In [None]:
!pip3 install tiktoken -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m1.0/1.7 MB[0m [31m33.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Show the installed tiktoken version, so the token counts below can be reproduced
!pip3 freeze | grep tiktoken

tiktoken==0.4.0


# Tokenization functions

In [None]:
import tiktoken

def encode(text, model="davinci"):
  """Tokenize ``text`` with the tiktoken encoding registered for ``model``.

  Args:
    text: String to tokenize.
    model: Model name passed to ``tiktoken.encoding_for_model``. Defaults to
      "davinci", the model whose fine-tuning cost this notebook estimates.

  Returns:
    The list of integer token ids returned by the encoding's ``encode``.
  """
  enc = tiktoken.encoding_for_model(model)
  return enc.encode(text)

# Quick sanity check on a short sentence
encode("This is a test!")

[1212, 318, 257, 1332, 0]

In [None]:
# Batch encoding function

import functools
from concurrent.futures import ThreadPoolExecutor

def encode_batch(batch, num_threads=32):
  """Tokenize every string in ``batch`` concurrently using ``encode``.

  Args:
    batch: Iterable of strings to tokenize.
    num_threads: Maximum number of worker threads in the pool.

  Returns:
    A list holding one ``encode`` result per input string, in input order.
  """
  with ThreadPoolExecutor(max_workers=num_threads) as pool:
    # Executor.map yields results in the same order as the inputs
    return list(pool.map(encode, batch))

In [None]:
# Test batch encoding on three short sentences of different lengths;
# the result should contain one list of token ids per sentence, in the same order.
batch = ["Hello!", "this is a test.", "What about this somehow longer sentence?"]
encode_batch(batch)

[[15496, 0],
 [5661, 318, 257, 1332, 13],
 [2061, 546, 428, 7599, 2392, 6827, 30]]

# Tokenizing the dataset and counting tokens

When you prepare the training dataset, you might need to add more tokens for prompts, separators, etc. So, while you can get a rough estimate from the raw dataset, it is better to run this estimation after data preparation. See the official [fine-tuning guide](https://platform.openai.com/docs/guides/fine-tuning/advanced-usage).

In [None]:
# Count tokens in the source and target datasets

# Change the source and target files
source_file = "tico-19-enes-dedup.en"
target_file = "tico-19-enes-dedup.es"

# Read both sides of the corpus, one sentence per line.
# The encoding is given explicitly so that any non-ASCII characters are decoded
# the same way on every platform instead of depending on the locale default.
# Note: readlines() keeps each line's trailing newline, so it is tokenized
# together with the sentence.
with open(source_file, encoding="utf-8") as src, open(target_file, encoding="utf-8") as tgt:
  src_sents = src.readlines()
  tgt_sents = tgt.readlines()

# Both files are closed at this point; everything below works on the in-memory lists.
print(f"Number of sentences: {len(src_sents)} + {len(tgt_sents)} = {len(src_sents) + len(tgt_sents)} \n")

# Tokenize the dataset
src_sents_encoded = encode_batch(src_sents)
tgt_sents_encoded = encode_batch(tgt_sents)

# Flatten the per-sentence token lists into one list per side.
# `out_src` and `out_tgt` are read again by the cost estimation cell.
out_src = [token for sent_tokens in src_sents_encoded for token in sent_tokens]
out_tgt = [token for sent_tokens in tgt_sents_encoded for token in sent_tokens]

print(f"Source tokens: {len(out_src)}")
print(f"Target tokens: {len(out_tgt)}")
print(f"\nTotal tokens {len(out_src) + len(out_tgt)}")

Number of sentences: 3070 + 3070 = 6140 

Source tokens: 96949
Target tokens: 183786

Total tokens 280735


# Cost estimation

In [None]:
# Estimate cost of fine-tuning for 1 epoch

unit = 1000  # pricing granularity: the price below is quoted per this many tokens
# USD per `unit` tokens.
# NOTE(review): presumably the Davinci training rate at the time of writing -- verify against current pricing.
unit_price = 0.03

# Total number of tokens across the source and target sides
tokens = len(out_src) + len(out_tgt)

billable_units = tokens / unit
cost_per_epoch = round(billable_units * unit_price, 2)  # rounded to cents

print(f"Cost estimation for 1 epoch: {cost_per_epoch} USD (excluding VAT)")

Cost estimation for 1 epoch: 8.42 USD (excluding VAT)


In [None]:
# Estimate cost of fine-tuning for a number of epochs

# Change the number of training epochs
n_epochs = 4

# Round the product to cents: multiplying floats can otherwise leave binary
# floating-point artifacts in the printed total (e.g. 1.1 * 3 == 3.3000000000000003).
# `cost_per_epoch` is already rounded to cents, so this total is n_epochs times
# the per-epoch figure printed above.
total_cost = round(cost_per_epoch * n_epochs, 2)

print(f"Cost estimation for {n_epochs} epochs: {total_cost} USD (excluding VAT)")

Cost estimation for 4 epochs: 33.68 USD (excluding VAT)
