In [1]:
import pandas as pd
import json
import openai
import os
import tiktoken
import numpy as np
from collections import defaultdict
import time


# Secrets must not live in source control: read the key from the environment.
# NOTE(review): the key that used to be hard-coded on this line has been
# exposed to anyone with access to this file and should be revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )

def generate_json_from_excel(model):
    """Convert ``Model <model>.xlsx`` into a chat-format JSONL training file.

    Every spreadsheet row supplies the ``system``, ``user`` and ``assistant``
    columns of one training example; each example is written as a single JSON
    object on its own line of ``Model <model>_json.jsonl``.

    Args:
        model: Model label used to build both file names (e.g. ``"1S"``).

    Returns:
        The path of the JSONL file that was written.
    """
    file_stem = "Model " + model

    # Default NaN parsing is switched off so that cells holding the literal
    # text 'N/A' stay strings instead of becoming missing values.
    frame = pd.read_excel(file_stem + ".xlsx", na_values=[], keep_default_na=False)

    output_path = file_stem + "_json.jsonl"
    roles = ("system", "user", "assistant")

    with open(output_path, 'w') as sink:
        for _, record in frame.iterrows():
            # One chat example per row: a message for each role column, in order.
            example = {
                "messages": [{"role": role, "content": record[role]} for role in roles]
            }
            sink.write(json.dumps(example) + '\n')

    print(f"Data of Model {model} has been successfully converted and saved to {output_path}")
    return output_path

def _count_format_errors(dataset):
    """Tally chat-format violations in *dataset*, keyed by error type.

    Args:
        dataset: List of parsed JSONL examples.

    Returns:
        A ``defaultdict(int)`` mapping each error name to the number of times
        it occurred; it is empty when no violation was found.
    """
    format_errors = defaultdict(int)

    for ex in dataset:
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages", None)
        if not messages:
            format_errors["missing_messages_list"] += 1
            continue

        for message in messages:
            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1

            if any(k not in ("role", "content", "name") for k in message):
                format_errors["message_unrecognized_key"] += 1

            if message.get("role", None) not in ("system", "user", "assistant"):
                format_errors["unrecognized_role"] += 1

            content = message.get("content", None)
            if not content or not isinstance(content, str):
                format_errors["missing_content"] += 1

        if not any(message.get("role", None) == "assistant" for message in messages):
            format_errors["example_missing_assistant_message"] += 1

    return format_errors


def check_json(model):
    """Validate ``Model <model>_json.jsonl`` and print fine-tuning statistics.

    Adapted from the data-preparation script provided by OpenAI. The function
    loads the JSONL file, checks every example against the chat-completions
    message structure, then prints token-count distributions and a rough
    estimate of the number of epochs and billed tokens.

    Args:
        model: Model label used to build the file name (e.g. ``"1S"``).

    Returns:
        The total number of format errors found, ``0`` when the file is clean.
        Callers should check this value before uploading the file.

    Raises:
        ValueError: If the file contains no examples.
    """
    data_path = "Model " + model + "_json.jsonl"

    # Load dataset; blank lines (e.g. a trailing newline) are not examples.
    with open(data_path) as f:
        dataset = [json.loads(line) for line in f if line.strip()]

    # Initial dataset stats
    print("Num examples:", len(dataset))
    if not dataset:
        # Nothing below is meaningful without data (and the epoch estimate
        # would divide by zero), so stop with an explicit error.
        raise ValueError(f"{data_path} contains no training examples")

    print("First example:")
    first_example = dataset[0]
    first_messages = first_example.get("messages") if isinstance(first_example, dict) else None
    for message in first_messages or []:
        print(message)

    # Format error checks against the chat-completions message structure
    format_errors = _count_format_errors(dataset)
    num_error = sum(format_errors.values())

    if format_errors:
        print("Found errors:")
        for k, v in format_errors.items():
            print(f"{k}: {v}")
    else:
        print("No errors found")

    # Beyond the structure of the messages, the length of each example must
    # not exceed the per-example token limit.
    MAX_TOKENS_PER_EXAMPLE = 4096

    # Token counting functions
    encoding = tiktoken.get_encoding("cl100k_base")

    # not exact!
    # simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
    def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
        num_tokens = 0
        for message in messages:
            num_tokens += tokens_per_message
            for key, value in message.items():
                if not isinstance(value, str):
                    print(f"Error in message: {message}")
                    print(f"Invalid value for key '{key}': {value} (type: {type(value)})")
                    continue
                num_tokens += len(encoding.encode(value))
                if key == "name":
                    num_tokens += tokens_per_name
        num_tokens += 3
        return num_tokens

    def num_assistant_tokens_from_messages(messages):
        num_tokens = 0
        for message in messages:
            content = message.get("content")
            # Non-string content was already reported as a format error;
            # skip it here rather than crash inside the tokenizer.
            if message.get("role") == "assistant" and isinstance(content, str):
                num_tokens += len(encoding.encode(content))
        return num_tokens

    def print_distribution(values, name):
        print(f"\n#### Distribution of {name}:")
        if not values:
            print("no data")
            return
        print(f"min / max: {min(values)}, {max(values)}")
        print(f"mean / median: {np.mean(values)}, {np.median(values)}")
        # The quantiles now match the label (5th and 95th percentiles).
        print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

    # Warnings and tokens counts
    n_missing_system = 0
    n_missing_user = 0
    n_messages = []
    convo_lens = []
    assistant_message_lens = []

    for ex in dataset:
        messages = ex.get("messages") if isinstance(ex, dict) else None
        if not messages:
            # Already counted as a format error above; nothing to measure.
            continue
        if not any(message.get("role") == "system" for message in messages):
            n_missing_system += 1
        if not any(message.get("role") == "user" for message in messages):
            n_missing_user += 1
        n_messages.append(len(messages))
        convo_lens.append(num_tokens_from_messages(messages))
        assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

    print("Num examples missing system message:", n_missing_system)
    print("Num examples missing user message:", n_missing_user)
    print_distribution(n_messages, "num_messages_per_example")
    print_distribution(convo_lens, "num_total_tokens_per_example")
    print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
    n_too_long = sum(length > MAX_TOKENS_PER_EXAMPLE for length in convo_lens)
    print(f"\n{n_too_long} examples may be over the {MAX_TOKENS_PER_EXAMPLE} token limit, they will be truncated during fine-tuning")

    # Pricing and default n_epochs estimate
    MIN_TARGET_EXAMPLES = 100
    MAX_TARGET_EXAMPLES = 25000
    TARGET_EPOCHS = 3
    MIN_EPOCHS = 1
    MAX_EPOCHS = 25

    n_epochs = TARGET_EPOCHS
    n_train_examples = len(dataset)
    if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
        n_epochs = min(MAX_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
    elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
        n_epochs = max(MIN_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

    # Tokens beyond the per-example limit are truncated, hence not billed.
    n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
    print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
    print(f"By default, you'll train for {n_epochs} epochs on this dataset")
    print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

    return num_error
    
def start_ft(model, base_model="gpt-3.5-turbo", max_attempts=100):
    """Upload ``Model <model>_json.jsonl`` and start a fine-tuning job on it.

    After the upload the function waits two minutes, then tries to create the
    job up to ``max_attempts`` times. It retries when the API reports that the
    file is still being processed or that a rate limit was hit.

    Args:
        model: Model label used to build the training file name (e.g. ``"1S"``).
        base_model: Name of the base model to fine-tune.
        max_attempts: Maximum number of job-creation attempts.

    Returns:
        The ID of the uploaded training file. It is returned even when no job
        could be created within ``max_attempts``.

    Raises:
        openai.error.InvalidRequestError: For request errors other than the
            file still being processed.
        openai.error.RateLimitError: For rate-limit errors whose message does
            not contain "rate-limited".
    """
    # Use a context manager so the file handle is closed after the upload.
    with open("Model " + model + "_json.jsonl", "rb") as training_file:
        new_upload = openai.File.create(
            file=training_file,
            purpose='fine-tune'
        )
    print("\n")
    print(f"Data of Model {model} has been uploaded. Please wait for 2 minutes to start the job.")
    print(new_upload)
    time.sleep(120)

    for _ in range(max_attempts):
        # new_upload.id identifies the uploaded file, not a fine-tuning job.
        print(f"Try to create fine-tuning job for Model {model}. The training file ID is {new_upload.id}")
        try:
            ft_model = openai.FineTuningJob.create(training_file=new_upload.id, model=base_model)
        except openai.error.InvalidRequestError as e:
            if "still being processed" not in str(e):
                raise
            print("File is still being processed. Retrying in 30 seconds...")
            time.sleep(30)  # Wait for 30 seconds
        except openai.error.RateLimitError as e:
            error_text = str(e)
            if "rate-limited" not in error_text:
                raise
            if "per day" in error_text:
                print("12 task per day limit reached.")
                time.sleep(10000)  # Wait for 10000 seconds (about 2.8 hours)
            else:
                # Only the short-term limit gets the 10-minute wait; the daily
                # limit above has already waited far longer.
                print("Rate limit reached. Retrying in 10 minutes...")
                time.sleep(600)  # Wait for 10 minutes
        else:
            print("Fine-tuning job created successfully! Please check the email for the model ID.")
            print(ft_model)
            print("\n")
            break
    else:
        print("Max attempts reached. Fine-tuning job could not be created.")
    return new_upload.id

In [None]:
# Convert, validate and fine-tune every "S" model in turn.
method_type = "S"

for i in range(1, 7):  # Loop from Model 1 to Model 6
    model = str(i) + method_type

    print("\n++++++++++++++++++++++++++++++++++++++++++++")
    print(f"Start to process Model {model}.\n")
    start_time = time.perf_counter()  # Start timing (monotonic clock)

    generate_json_from_excel(model)
    # Upload and fine-tune only when validation reports zero format errors,
    # so a malformed training file is never sent to the API.
    if check_json(model) == 0:
        start_ft(model)
    else:
        print(f"Model {model} has format errors. Fine-tuning was skipped.")

    end_time = time.perf_counter()  # End timing
    elapsed_time = end_time - start_time  # Calculate elapsed time

    print(f"Finish Model {model}.\n")
    print(f"Model {model} took {elapsed_time:.2f} seconds to process.")
    print("\n++++++++++++++++++++++++++++++++++++++++++++")



++++++++++++++++++++++++++++++++++++++++++++
Start to process Model 1S.

Data of Model 1S has been successfully converted and saved to Model 1S_json.jsonl
Num examples: 1990
First example:
{'role': 'system', 'content': "You are an AI assistant with expertise in organic chemistry. Your task is to make theoretical modifications to a given SMILES code of a MOF linker.  Your objective is introduce new functional groups or alter existing ones to the linker, then provide the correct molecular representation for the modified linker. You should never remove or modify the carboxylate groups, as they are essential to MOF linkers. The user can choose from 5 mutation actions:\n\n(1) Introduce or remove a methyl group from the ring.\n(2) Introduce or remove a hydroxyl group from the ring.\n(3) Introduce or remove an amino group from the ring.\n(4) Introduce or remove a nitro group from the ring.\n(5) Introduce or remove a fluoro group to the ring.\n\nThe user will first specify the desired mutatio

In [59]:
# Convert, validate and fine-tune the "R" variant of Model 3.
method_type = "R"

for i in range(3, 4):  # Loop from Model 3 ONLY
    model = str(i) + method_type

    print("\n++++++++++++++++++++++++++++++++++++++++++++")
    print(f"Start to process Model {model}.\n")
    start_time = time.perf_counter()  # Start timing (monotonic clock)

    generate_json_from_excel(model)
    # Upload and fine-tune only when validation reports zero format errors,
    # so a malformed training file is never sent to the API.
    if check_json(model) == 0:
        start_ft(model)
    else:
        print(f"Model {model} has format errors. Fine-tuning was skipped.")

    end_time = time.perf_counter()  # End timing
    elapsed_time = end_time - start_time  # Calculate elapsed time

    print(f"Finish Model {model}.\n")
    print(f"Model {model} took {elapsed_time:.2f} seconds to process.")
    print("\n++++++++++++++++++++++++++++++++++++++++++++")



++++++++++++++++++++++++++++++++++++++++++++
Start to process Model 3R.

Data of Model 3R has been successfully converted and saved to Model 3R_json.jsonl
Num examples: 700
First example:
{'role': 'system', 'content': "You are an AI assistant with expertise in organic chemistry. Your task is to make theoretical \nmodifications to a given IUPAC name of a MOF linker.  Your objective is to swap out atoms in the linker with \ndifferent heteroatoms (e.g., replace a carbon atom with a nitrogen or sulfur atom), while adhering to general \nchemical rules and bonding constraints, such as ensuring ring stability and proper valence for atoms, then provide \nthe correct IUPAC name for the modified linker. The user can choose from three mutation actions:\n\n(1) Replace a carbon atom in the ring with nitrogen, or vice versa.\n(2) Replace a carbon atom in the ring with oxygen, or vice versa.\n(3) Replace a carbon atom in the ring with sulfur, or vice versa.\n\nThe user will first specify the desired

In [57]:
# List fine-tuning jobs, requesting at most 30 (limit=30).
# The bare expression is left as the last statement so the notebook displays the result.
openai.FineTuningJob.list(limit=30)

<OpenAIObject list at 0x1a60209aef0> JSON: {
  "object": "list",
  "data": [
    {
      "object": "fine_tuning.job",
      "id": "ftjob-mx1mi1Xy1dSFNzhG6a6A6bwG",
      "model": "gpt-3.5-turbo-0613",
      "created_at": 1693988356,
      "finished_at": 1693993551,
      "fine_tuned_model": "ft:gpt-3.5-turbo-0613:uc-berkeley::7vjOau5j",
      "organization_id": "org-vVj7NX8f155Kazao4Phi3tol",
      "result_files": [
        "file-rrEKTbx2eiy9NEyNRmNSjBdA"
      ],
      "status": "succeeded",
      "validation_file": null,
      "training_file": "file-cjNCYYuUURIuVBw10TPrXSiI",
      "hyperparameters": {
        "n_epochs": 3
      },
      "trained_tokens": 1081977
    },
    {
      "object": "fine_tuning.job",
      "id": "ftjob-hSWlQC3UhhVv6LQRKRMclxE8",
      "model": "gpt-3.5-turbo-0613",
      "created_at": 1693982832,
      "finished_at": 1693988347,
      "fine_tuned_model": "ft:gpt-3.5-turbo-0613:uc-berkeley::7vi2eO10",
      "organization_id": "org-vVj7NX8f155Kazao4Phi3tol",

In [56]:
# Delete a fine-tuned model (must be an owner of the org the model was created in)
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7vYwZXQf")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7vYDMKKK")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7vPwCDst")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7vOWVVCU")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7vNWFy9Q")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7vMmBvdX")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7vM756fz")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7vL9nOL4")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7vK1NyUQ")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7vIoVeMj")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7vGzFO3v")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7vFyS3MA")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7vFRIMeG")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7vEmRcxS")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7v0rb5G8")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7uzrXrgV")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7uzHFjrG")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7uyb0VuL")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7uxOgd8G")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7uUeknd5")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7uToiFyg")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7uT4Fjh7")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7uSIqs0O")
#openai.Model.delete("ft:gpt-3.5-turbo-0613:uc-berkeley::7uQcRdX5")



In [29]:
# IUPAC names sent, one at a time, to the fine-tuned model as the user turn.
user_messages = [
    "THIOPHENE-2,5-DICARBOXYLIC ACID",
    "thiophene-2,4-dicarboxylic acid",
    "ethyl 5-formylthiophene-3-carboxylate",
    "5-(5-carboxyfuran-2-yl)furan-2-carboxylic acid",
    "benzene"
]

# System prompt for the COOH position-shift task. The text is kept verbatim
# because it is sent to the model exactly as written.
system_message = """ You are an AI assistant with expertise in organic chemistry. Your task is to make theoretical modifications to a given IUPAC name of a MOF linker. Your objective is to change the position of coordination sites, such as COOH, within aromatic or non-aromatic rings including 5-membered, 6-membered, 7-membered, and fused rings. Note that the changes can not result in invaild structures or violating any rules in chemical bonding. If no valid structures can be generated by the action or the input structure does not meet the requirement to perform the action, please answer with "N/A". The user will give you the IUPAC name of the linker to be modified.

You should first shift the position of COOH within any ring type to another position on the same ring, and then answer with the IUPAC name of the resulting compound.  """

# Query the model once per compound and print the assistant's reply.
for user_message in user_messages:
    chat_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
    completion = openai.ChatCompletion.create(
        model="ft:gpt-3.5-turbo-0613:uc-berkeley::7u4Ni1Wl",
        messages=chat_messages,
    )
    reply = completion.choices[0].message
    print(reply)


{
  "role": "assistant",
  "content": "THIOPHENE-2,4-DICARBOXYLIC ACID"
}
{
  "role": "assistant",
  "content": "thiophene-2,3-dicarboxylic acid"
}
{
  "role": "assistant",
  "content": "N/A"
}
{
  "role": "assistant",
  "content": "4-(5-carboxyfuran-2-yl)furan-2-carboxylic acid"
}
{
  "role": "assistant",
  "content": "N/A"
}


In [None]:
#test model 1R
import openai
import os



# SMILES codes sent, one at a time, to the fine-tuned model as the user turn.
# NOTE(review): the prompt below says the user supplies 'Action: ...' followed
# by 'Compound: ...', but these messages are bare SMILES codes. Confirm the
# intended input format.
user_messages = [
    "OC(=O)COCCOCCOCCOCCOCC(O)=O",
    "OC(=O)c1cccc2c(cccc12)C(O)=O",
    "OC(=O)Cc1ccc(cc1)c2ccc(CC(O)=O)cc2",
    "Cc1[nH]nc(C)c1c2cc(cc(c2)C(O)=O)C(O)=O",
    "OC(=O)c1cc(F)cc(c1)C(O)=O"
]


# The prompt must be bound to system_message: as a bare string expression it
# was discarded, and the loop below silently reused whatever system_message an
# earlier cell had defined. The text (including the trailing spaces at the line
# breaks) is kept verbatim because it is sent to the model as written.
system_message = """You are an AI assistant with expertise in organic chemistry. Your task is to make theoretical 
modifications to a given SMILES code of a MOF linker.  Your objective is to swap out atoms in the linker with 
different heteroatoms (e.g., replace a carbon atom with a nitrogen or sulfur atom), while adhering to general 
chemical rules and bonding constraints, such as ensuring ring stability and proper valence for atoms, then provide 
the correct SMILES code for the modified linker. The user can choose from three mutation actions:

(1) Replace a carbon atom in the ring with nitrogen, or vice versa.
(2) Replace a carbon atom in the ring with oxygen, or vice versa.
(3) Replace a carbon atom in the ring with sulfur, or vice versa.

The user will first specify the desired mutation action, followed by 'Action: '. In the next line, the user will 
provide the SMILES code of the MOF linker to be mutated, starting with 'Compound: '.

Your response should begin with 'New Compound: ', followed by the updated SMILES code. If the requested mutation 
isn't chemically feasible, due to bonding constraints or if the given structure isn't compatible with the mutation 
(e.g., it lacks a ring or a suitable substitution site), you should respond with 'New Compound: Invalid'."""

# NOTE(review): this model ID is identical to the one used in the IUPAC
# position-shift test cell. Confirm it is really the Model 1R fine-tune.
for user_message in user_messages:
    completion = openai.ChatCompletion.create(
        model="ft:gpt-3.5-turbo-0613:uc-berkeley::7u4Ni1Wl",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ]
    )
    print(completion.choices[0].message)