In [4]:
!pip install --quiet openai

# ChatBot

In [1]:
import openai
from openai import AzureOpenAI

import os
import requests

from dotenv import load_dotenv

In [2]:
load_dotenv('/home/sagemaker-user/user-default-efs/CLONED_REPOS/LLM-World/.env')

def my_gpt(model, ask_a_question):
    api_response = requests.get(os.getenv('AZURE_OPENAI_BASEURL'))
    payload = api_response.json()

    myClient = AzureOpenAI(
        api_key = os.getenv('AZURE_OPENAI_KEY'),
        api_version = '2024-12-01-preview',
        azure_endpoint = payload['nonprod'][model][0]['endpoint']
    )

    convo = []
    convo.append({'role':'user','content':ask_a_question})
    response = myClient.chat.completions.create(
    model = model,
    max_tokens = 30000,
    messages = convo,
    temperature = 0)

    reply = response.choices[0].message.content
    convo.append({'role':'assistant','content':reply})

    return convo

In [3]:
response = requests.get(os.getenv('AZURE_OPENAI_BASEURL'))
returned_payload = response.json()

list_of_models = [models for models in enumerate(returned_payload['nonprod'].keys())]
print(list_of_models)

[(0, 'dall-e-3'), (1, 'gpt-35-turbo'), (2, 'gpt-35-turbo-instruct'), (3, 'gpt-4-turbo'), (4, 'gpt-4.1'), (5, 'gpt-4.1-mini'), (6, 'gpt-4.1-nano'), (7, 'gpt-4.5'), (8, 'gpt-4o-audio'), (9, 'gpt-4o-global'), (10, 'gpt-4o-mini-global'), (11, 'gpt-4o-regional'), (12, 'gpt-5'), (13, 'gpt-5-chat'), (14, 'gpt-5-mini'), (15, 'gpt-5-nano'), (16, 'gpt-image-1'), (17, 'o1'), (18, 'o1-mini'), (19, 'o3'), (20, 'o3-mini'), (21, 'o4-mini'), (22, 'text-embedding-3-large'), (23, 'text-embedding-3-small'), (24, 'text-embedding-ada-002')]


In [4]:
model_index = 4

mymodel = list_of_models[model_index][1]
print(mymodel)

gpt-4.1


In [5]:
my_gpt(mymodel,'How far is saturn from the sun?')


[{'role': 'user', 'content': 'How far is saturn from the sun?'},
 {'role': 'assistant',
  'content': 'Saturn is, on average, about **1.43 billion kilometers** (or **886 million miles**) away from the Sun. This distance is also expressed as **9.5 astronomical units (AU)**, where 1 AU is the average distance from the Earth to the Sun (about 150 million kilometers or 93 million miles).\n\nKeep in mind that Saturn’s distance from the Sun varies slightly because its orbit is not a perfect circle, but this average value is commonly used.'}]

In [44]:
list_of_questions = ['Where are kangaroos mostly found?', 'What countries are best for growing apples?', 'Who let the dogs out?']

[my_gpt(mymodel, question) for question in list_of_questions]

[[{'role': 'user', 'content': 'Where are kangaroos mostly found?'},
  {'role': 'assistant',
   'content': 'Kangaroos are **mostly found in Australia**. They are native to the Australian continent and are commonly seen in a variety of habitats, including grasslands, forests, savannas, and bushland. Some species of kangaroos can also be found in **Tasmania** and **New Guinea**. However, the vast majority of kangaroos live in Australia, where they are a well-known symbol of the country.'}],
 [{'role': 'user', 'content': 'What countries are best for growing apples?'},
  {'role': 'assistant',
   'content': 'The best countries for growing apples typically have **temperate climates** with cold winters and moderate summers, as apple trees require a period of winter chill to produce fruit. The following countries are renowned for their apple production, both in terms of quantity and quality:\n\n### 1. **China**\n- **World’s largest producer** (by far).\n- Major apple-growing regions: Shandong, 

# Fine-tuning Aure OpenAI

In [8]:
import os
import json

import tiktoken
import numpy as np
from collections import defaultdict

## Training and Validation data

### Training set

### Validation set

### Multi-turn chat 

## Check data

In [26]:
# Run preliminary checks

filepath ='/mnt/custom-file-systems/efs/fs-0252e317d4af1dc34_fsap-0a708b50be80889d5/CLONED_REPOS/LLM-World/Files'

# Load the training set
with open(os.path.join(filepath,'training_set.jsonl'), 'r', encoding='utf-8') as f:
    training_dataset = [json.loads(line) for line in f]

# Training dataset stats
print("Number of examples in training set:", len(training_dataset))
print("First example in training set:")

training_dataset[0]

Number of examples in training set: 73
First example in training set:


{'messages': [{'role': 'system',
   'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
  {'role': 'user', 'content': 'What is a vector space in linear algebra?'},
  {'role': 'assistant',
   'content': 'A vector space is a collection of vectors where you can add them together and multiply them by scalars, following specific rules.'}]}

In [8]:
for message in training_dataset[0]["messages"]:
    print(message)


{'role': 'system', 'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'}
{'role': 'user', 'content': 'What is a vector space in linear algebra?'}
{'role': 'assistant', 'content': 'A vector space is a collection of vectors where you can add them together and multiply them by scalars, following specific rules.'}


In [9]:
# Load the validation set

with open(os.path.join(filepath,'validation_set.jsonl'), 'r', encoding='utf-8') as f:
    validation_dataset = [json.loads(line) for line in f]

# Validation dataset stats
print("\nNumber of examples in validation set:", len(validation_dataset))
print("First example in validation set:")

validation_dataset[0]



Number of examples in validation set: 20
First example in validation set:


{'messages': [{'role': 'system',
   'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
  {'role': 'user',
   'content': 'What is the difference between matrix multiplication and element-wise multiplication?'},
  {'role': 'assistant',
   'content': 'Matrix multiplication involves dot products between rows and columns, while element-wise multiplication multiplies corresponding entries directly.'}]}

In [10]:
for message in validation_dataset[0]["messages"]:
    print(message)

{'role': 'system', 'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'}
{'role': 'user', 'content': 'What is the difference between matrix multiplication and element-wise multiplication?'}
{'role': 'assistant', 'content': 'Matrix multiplication involves dot products between rows and columns, while element-wise multiplication multiplies corresponding entries directly.'}


### Token Count

In [11]:
# Validate token counts

encoding = tiktoken.get_encoding("o200k_base") # default encoding for gpt-4o models. This requires the latest version of tiktoken to be installed.

def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [13]:
files = ['training_set.jsonl', 'validation_set.jsonl']

for file in files:
    print(f"Processing file: {file}")
    with open(os.path.join(filepath,file), 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f]

    total_tokens = []
    assistant_tokens = []

    for ex in dataset:
        messages = ex.get("messages", {})
        total_tokens.append(num_tokens_from_messages(messages))
        assistant_tokens.append(num_assistant_tokens_from_messages(messages))

    print_distribution(total_tokens, "total tokens")
    print_distribution(assistant_tokens, "assistant tokens")
    print('*' * 50)

Processing file: training_set.jsonl

#### Distribution of total tokens:
min / max: 49, 73
mean / median: 60.24657534246575, 60.0
p5 / p95: 53.0, 66.0

#### Distribution of assistant tokens:
min / max: 12, 34
mean / median: 22.356164383561644, 22.0
p5 / p95: 17.0, 28.0
**************************************************
Processing file: validation_set.jsonl

#### Distribution of total tokens:
min / max: 57, 71
mean / median: 63.85, 64.0
p5 / p95: 60.6, 66.4

#### Distribution of assistant tokens:
min / max: 18, 34
mean / median: 25.9, 26.5
p5 / p95: 20.9, 29.200000000000003
**************************************************


## Upload 

In [34]:
# Upload fine-tuning files

client = AzureOpenAI(
  azure_endpoint = os.getenv("AZURE_OPENAI_BASEURL"),
  api_key = os.getenv("AZURE_OPENAI_KEY"),
  api_version = '2024-12-01-preview',
  # api_version = "2025-02-01-preview"  
)

training_file_name = 'training_set.jsonl'
validation_file_name = 'validation_set.jsonl'

In [36]:
#  Upload the training and validation dataset files to Azure OpenAI with the SDK.

training_response = client.files.create(
    file = open(os.path.join(filepath,training_file_name), "rb"), purpose="fine-tune"
)
training_file_id = training_response.id

validation_response = client.files.create(
    file = open(os.path.join(filepath,validation_file_name), "rb"), purpose="fine-tune"
)
validation_file_id = validation_response.id

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

NotFoundError: <html>
<head><title>404 Not Found</title></head>
<body>
<center><h1>404 Not Found</h1></center>
<hr><center>nginx</center>
</body>
</html>