# Fine-tuning with Snowflake Cortex to improve robustness and alignment of your feedback functions

[The Snowflake Cortex Fine-Tuning function](https://docs.snowflake.com/en/user-guide/snowflake-cortex/cortex-finetuning) offers a way to customize large language models for your specific task. This notebook focuses on how to improve the evaluation capability of the TruLens groundedness feedback function by fine-tuning LLMs available on Snowflake Cortex using benchmark datasets with human annotations (i.e. SummEval).
 

In [None]:
from trulens.tests.test_cases import generate_summeval_groundedness_golden_set

# Generator for the groundedness golden set. Cases are pulled from it one at a
# time with `next()`; later cells read the "query", "response" and
# "human_score" keys of each case. The path is relative, so the notebook's
# working directory matters.
test_cases_gen = generate_summeval_groundedness_golden_set(
    "../tests/datasets/summeval/summeval_test_100.json"
)

In [None]:
# Drain the generator sequentially into three disjoint splits: the first 500
# cases go to test, the next 800 to train and the following 300 to validation.
# The generator is stateful, so the order of these statements decides which
# cases land in which split.
test_split = [next(test_cases_gen) for _ in range(500)]
train_split = [next(test_cases_gen) for _ in range(500, 1300)]
val_split = [next(test_cases_gen) for _ in range(1300, 1600)]

In [None]:
# Peek at the first three training cases.
train_split[:3]

In [None]:
import os

# Credentials and connection settings. Replace every placeholder with your own
# values before running the rest of the notebook.
os.environ["SNOWFLAKE_ACCOUNT"] = "xxx-xxx"
os.environ["SNOWFLAKE_USER"] = "..."
os.environ["SNOWFLAKE_USER_PASSWORD"] = "..."
os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["SNOWFLAKE_DATABASE"] = "..."
os.environ["SNOWFLAKE_SCHEMA"] = "..."

# The Snowpark session created further down also reads these three settings;
# without them that cell fails with a KeyError. `setdefault` keeps any value
# that is already exported in the surrounding environment.
os.environ.setdefault("SNOWFLAKE_PRIVATE_KEY_FILE", "...")
os.environ.setdefault("SNOWFLAKE_ROLE", "...")
os.environ.setdefault("SNOWFLAKE_WAREHOUSE", "...")

# Extract current prompts we use for groundedness without COT for supervised fine-tuning datasets


In [None]:
from trulens.feedback import prompts

# Groundedness system prompt shipped with TruLens; it is reused verbatim as the
# system message of every fine-tuning example built below.
system_prompt: str = prompts.LLM_GROUNDEDNESS_SYSTEM

In [None]:
# User-message template. The leading/trailing whitespace is part of the prompt
# text and is spelled out explicitly so that editors cannot strip it.
_USER_PROMPT_TEMPLATE = (
    "\n"
    "        SOURCE: {source}\n"
    "        \n"
    "        Hypothesis: {hypothesis}\n"
    "        Please answer with just the score. nothing else.\n"
    "        "
)


def _to_finetune_example(row: dict, system_prompt: str) -> dict:
    """Convert one golden-set case into a fine-tuning example.

    Args:
        row: A case providing the "query" (source text), "response"
            (hypothesis to judge) and "human_score" keys.
        system_prompt: System message placed ahead of the user message.

    Returns:
        A dict with the two keys "prompt" (the stringified list of chat
        messages) and "completion" (the target score as a string).
    """
    user_prompt = _USER_PROMPT_TEMPLATE.format(
        source=row["query"], hypothesis=row["response"]
    )
    llm_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Linearly rescale the human score from the 1-5 range to the 0-10 range.
    human_score_scale_on_5 = row["human_score"]
    human_score_scale_on_10 = 10 * ((human_score_scale_on_5 - 1) / 4)

    return {
        "prompt": str(llm_messages),
        "completion": str(human_score_scale_on_10),
    }


# Train and validation examples go through the same helper so that their
# prompt format can never drift apart.
preprocessed_train_data = [
    _to_finetune_example(row, system_prompt) for row in train_split
]
preprocessed_val_data = [
    _to_finetune_example(row, system_prompt) for row in val_split
]

assert len(preprocessed_train_data) == len(train_split)
assert len(preprocessed_val_data) == len(val_split)

# Preparing the dataset as Snowflake tables - only two column names, `prompt` and `completion`, are accepted by the Cortex fine-tuning function

In [None]:
import snowflake.connector

# SQL commands to create the two tables (train / validation).
create_table_sql_train = """
CREATE TABLE IF NOT EXISTS summeval_train_data (
    prompt STRING,
    completion STRING
);
"""
create_table_sql_val = """
CREATE TABLE IF NOT EXISTS summeval_val_data (
    prompt STRING,
    completion STRING
);
"""

# Parameterized inserts; the connector binds the `%s` placeholders.
insert_sql_train = (
    "INSERT INTO summeval_train_data (prompt, completion) VALUES (%s, %s)"
)
insert_sql_val = (
    "INSERT INTO summeval_val_data (prompt, completion) VALUES (%s, %s)"
)

# Connect to Snowflake
conn = snowflake.connector.connect(
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_USER_PASSWORD"],
    account=os.environ["SNOWFLAKE_ACCOUNT"],
)
# try/finally guarantees the cursor and connection are released even when one
# of the statements below fails.
try:
    cur = conn.cursor()
    try:
        # NOTE(review): the database name is hard-coded; the fine-tuning
        # cells use the same name, so change it in all of them together.
        cur.execute("USE DATABASE DHUANG_LLM")

        cur.execute(create_table_sql_train)
        cur.execute(create_table_sql_val)

        # NOTE: the tables are created with IF NOT EXISTS and never
        # truncated, so re-running this cell appends the rows again.
        for row in preprocessed_train_data:
            cur.execute(insert_sql_train, (row["prompt"], row["completion"]))

        for row in preprocessed_val_data:
            cur.execute(insert_sql_val, (row["prompt"], row["completion"]))

        # Commit the transaction
        conn.commit()
    finally:
        cur.close()
finally:
    conn.close()

# Kick off Cortex FineTune Job in your Snowflake account to fine-tune `Mixtral-8x7b` (as a feedback function provider)

In [None]:
import snowflake.connector

# Statement that starts the fine-tuning job: base model `mixtral-8x7b`, the
# resulting model is registered as `ft_mixtral_8x7b_06152024`, and the two
# queries select the training and validation examples.
finetune_sql = """

SELECT SNOWFLAKE.CORTEX.FINETUNE(
  'CREATE',
  'ft_mixtral_8x7b_06152024',
  'mixtral-8x7b',
  'SELECT prompt, completion FROM summeval_train_data',
  'SELECT prompt, completion FROM summeval_val_data'
);
"""

# Connect to Snowflake
conn = snowflake.connector.connect(
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_USER_PASSWORD"],
    account=os.environ["SNOWFLAKE_ACCOUNT"],
)
# try/finally guarantees the cursor and connection are released even when the
# job submission fails.
try:
    cur = conn.cursor()
    try:
        # NOTE(review): the database name is hard-coded; it has to be the
        # database that holds the summeval_* tables.
        cur.execute("USE DATABASE DHUANG_LLM")
        cur.execute(finetune_sql)

        # Commit the transaction
        conn.commit()
    finally:
        cur.close()
finally:
    conn.close()

# Evaluate the base models to establish baseline performance

In [None]:
# Peek at the first three held-out test cases.
test_split[:3]

In [None]:
from trulens.apps.basic import TruBasicApp
from trulens.core import Feedback
from trulens.core import Select
from trulens.core import TruSession
from trulens.feedback import GroundTruthAgreement
from trulens.providers.cortex import Cortex

# Create the TruLens session used for the leaderboard at the end of the
# notebook. NOTE(review): reset_database() presumably discards previously
# logged records - confirm before running against a database you care about.
session = TruSession()
session.reset_database()

In [None]:
from snowflake.snowpark.session import Session

# Snowpark connection settings: every entry is read from the environment
# variable SNOWFLAKE_<NAME>, and a missing variable raises KeyError.
connection_params = {
    setting: os.environ[f"SNOWFLAKE_{setting.upper()}"]
    for setting in (
        "account",
        "user",
        "private_key_file",
        "role",
        "database",
        "schema",
        "warehouse",
    )
}
snowpark_session = Session.builder.configs(connection_params).create()

In [None]:
# Baseline provider: the stock Mixtral-8x7b model. This section establishes
# the baseline, so it must not point at the fine-tuned model name.
cortex_provider_mixtral = Cortex(
    snowpark_session,
    model_engine="mixtral-8x7b",
)

# Groundedness feedback backed by the baseline provider.
f_groundedness_mixtral = Feedback(
    cortex_provider_mixtral.groundedness_measure_with_cot_reasons,
    name="groundedness Mixtral",
)

In [None]:
# Example article used to smoke-test the feedback function.
# NOTE: `text` must be one plain string. A trailing comma inside the
# parentheses would turn it into a one-element tuple, and a string literal
# cannot contain a raw line break, so the break is written as "\n".
text = (
    "(CNN)Donald Sterling's racist remarks cost him an NBA team last year. But now it's his former female companion who has lost big. A Los Angeles judge has ordered V. Stiviano to pay back more than $2.6 million in gifts after Sterling's wife sued her. In the lawsuit, Rochelle \"Shelly\" Sterling accused Stiviano of targeting extremely wealthy older men. She claimed Donald Sterling used the couple's money to buy Stiviano a Ferrari, two Bentleys and a Range Rover, and that he helped her get a $1.8 million duplex. Who is V. Stiviano? Stiviano countered that there was nothing wrong with Donald Sterling giving her gifts and that she never took advantage of the former Los Angeles Clippers owner, who made much of his fortune in real estate. Shelly Sterling was thrilled with the court decision Tuesday, her lawyer told CNN affiliate KABC. \"This is a victory for the Sterling family in recovering the $2,630,000 that Donald lavished on a conniving mistress,\" attorney Pierce O'Donnell said in a statement. \"It also sets a precedent that the injured spouse can recover damages from the recipient of these ill-begotten gifts.\" Stiviano's gifts from Donald Sterling didn't just include uber-expensive items like luxury cars. According to the Los Angeles Times, the list also includes a $391 Easter bunny costume, a $299 two-speed blender and a $12 lace thong. Donald Sterling's downfall came after an audio recording surfaced of the octogenarian arguing with Stiviano. In the tape, Sterling chastises Stiviano for posting pictures on social media of her posing with African-Americans, including basketball legend Magic Johnson. \"In your lousy f**ing Instagrams, you don't have to have yourself with -- walking with black people,\" Sterling said in the audio first posted by TMZ. He also tells Stiviano not to bring Johnson to Clippers games and not to post photos with the Hall of Famer so Sterling's friends can see. \n\"Admire him, bring him here, feed him, f**k him, but don't put (Magic) on an Instagram for the world to have to see so they have to call me,\" Sterling said. NBA Commissioner Adam Silver banned Sterling from the league, fined him $2.5 million and pushed through a charge to terminate all of his ownership rights in the franchise. Fact check: Donald Sterling's claims vs. reality CNN's Dottie Evans contributed to this report."
)
# Machine-generated summary of the article above.
summary = "donald sterling , nba team last year . sterling 's wife sued for $ 2.6 million in gifts . sterling says he is the former female companion who has lost the . sterling has ordered v. stiviano to pay back $ 2.6 m in gifts after his wife sued . sterling also includes a $ 391 easter bunny costume , $ 299 and a $ 299 ."

In [None]:
# Smoke test: run the feedback function once on the example article/summary.
f_groundedness_mixtral(text, summary)

In [None]:
# Baseline provider: the stock Mixtral-8x7b model. Using the fine-tuned model
# name here would make the "baseline" run identical to the fine-tuned run.
cortex_provider_mixtral = Cortex(
    snowpark_session,
    model_engine="mixtral-8x7b",
)

f_groundedness_mixtral = Feedback(
    cortex_provider_mixtral.groundedness_measure_with_cot_reasons,
    name="groundedness Mixtral",
)


def wrapped_groundedness_mixtral(input, output) -> float:
    """Score `output` against `input` with the baseline Mixtral feedback.

    Args:
        input: Source text.
        output: Hypothesis (summary) to check against the source.

    Returns:
        The first element of the feedback result, i.e. the score; the rest
        of the result is discarded.
    """
    return f_groundedness_mixtral(input, output)[0]


ground_truth = GroundTruthAgreement(
    test_split, provider=cortex_provider_mixtral
)
# Absolute-error feedback against the golden set. Its inputs are the two
# positional arguments of the first recorded call plus the app output.
f_mae = (
    Feedback(ground_truth.absolute_error, name="Mean Absolute Error")
    .on(Select.Record.calls[0].args.args[0])
    .on(Select.Record.calls[0].args.args[1])
    .on_output()
)


tru_wrapped_groundedness_mixtral = TruBasicApp(
    wrapped_groundedness_mixtral,
    app_name="groundedness",
    app_version="Mixtral-8x7b baseline",
    feedbacks=[f_mae],
)

# Record one run per test case.
for case in test_split:
    source, response = case["query"], case["response"]

    with tru_wrapped_groundedness_mixtral as recording:
        try:
            tru_wrapped_groundedness_mixtral.app(source, response)
        except Exception as e:
            # Best effort: report the failure and continue with the next case.
            print(e)

In [None]:
# Baseline provider: the stock Mistral-7b model.
cortex_provider_mistral_7b = Cortex(
    snowpark_session,
    model_engine="mistral-7b",
)

f_groundedness_mistral_7b = Feedback(
    cortex_provider_mistral_7b.groundedness_measure_with_cot_reasons,
    name="groundedness Mistral-7b",
)


def wrapped_groundedness_mistral_7b(input, output) -> float:
    """Score `output` against `input` with the baseline Mistral-7b feedback.

    Returns the first element of the feedback result, i.e. the score.
    """
    return f_groundedness_mistral_7b(input, output)[0]


ground_truth = GroundTruthAgreement(
    test_split, provider=cortex_provider_mistral_7b
)

# Absolute-error feedback against the golden set. Its inputs are the two
# positional arguments of the first recorded call plus the app output.
recorded_call_args = Select.Record.calls[0].args.args
f_mae = (
    Feedback(ground_truth.absolute_error, name="Mean Absolute Error")
    .on(recorded_call_args[0])
    .on(recorded_call_args[1])
    .on_output()
)


tru_wrapped_groundedness_mistral_7b = TruBasicApp(
    wrapped_groundedness_mistral_7b,
    app_name="groundedness",
    app_version="Mistral-7b baseline",
    feedbacks=[f_mae],
)

# Record one run per test case.
for case in test_split:
    source, response = case["query"], case["response"]

    with tru_wrapped_groundedness_mistral_7b as recording:
        try:
            tru_wrapped_groundedness_mistral_7b.app(source, response)
        except Exception as e:
            # Best effort: report the failure and continue with the next case.
            print(e)

# Finetune `Mistral-7b`, in addition to `Mixtral-8x7b`, for side-by-side comparison

In [None]:
import snowflake.connector

# Connect to Snowflake
conn = snowflake.connector.connect(
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_USER_PASSWORD"],
    account=os.environ["SNOWFLAKE_ACCOUNT"],
)

# Create a cursor object
cur = conn.cursor()

cur.execute("USE DATABASE DHUANG_LLM")

finetune_sql = """
SELECT SNOWFLAKE.CORTEX.FINETUNE(
  'CREATE',
  'ft_mistral_7b_06182024',
  'mistral-7b',
  'SELECT prompt, completion FROM summeval_train_data',
  'SELECT prompt, completion FROM summeval_val_data'
);
"""

cur.execute(finetune_sql)


# Commit the transaction
conn.commit()

# Close the cursor and connection
cur.close()
conn.close()

In [None]:
# Rebuild the ground-truth feedback that the fine-tuned runs below attach to
# their apps.
ground_truth = GroundTruthAgreement(
    test_split, provider=cortex_provider_mistral_7b
)

# Absolute-error feedback against the golden set. Its inputs are the two
# positional arguments of the first recorded call plus the app output.
recorded_call_args = Select.Record.calls[0].args.args
f_mae = (
    Feedback(ground_truth.absolute_error, name="Mean Absolute Error")
    .on(recorded_call_args[0])
    .on(recorded_call_args[1])
    .on_output()
)

# Evaluate Groundedness performance after finetuning for both `Mixtral-8x7b` and `Mistral-7b`

In [None]:
# Provider backed by the fine-tuned Mixtral model created by the fine-tuning
# job earlier in the notebook.
cortex_provider_mixtral_ft = Cortex(
    snowpark_session,
    model_engine="ft_mixtral_8x7b_06152024",
)

f_groundedness_mixtral_ft = Feedback(
    cortex_provider_mixtral_ft.groundedness_measure_with_cot_reasons,
    name="groundedness Mixtral finetuned",
)


def wrapped_groundedness_mixtral_finetuned(input, output) -> float:
    """Score `output` against `input` with the fine-tuned Mixtral feedback.

    Returns the first element of the feedback result, i.e. the score.
    """
    return f_groundedness_mixtral_ft(input, output)[0]


# `f_mae` is the absolute-error feedback defined in an earlier cell.
tru_wrapped_groundedness_mixtral_ft = TruBasicApp(
    wrapped_groundedness_mixtral_finetuned,
    app_name="groundedness",
    app_version="Mixtral-8x7b finetuned",
    feedbacks=[f_mae],
)

# Record one run per test case.
for case in test_split:
    source, response = case["query"], case["response"]

    with tru_wrapped_groundedness_mixtral_ft as recording:
        try:
            tru_wrapped_groundedness_mixtral_ft.app(source, response)
        except Exception as e:
            # Best effort: report the failure and continue with the next case.
            print(e)

In [None]:
# Provider backed by the fine-tuned Mistral-7b model created by the
# fine-tuning job earlier in the notebook.
cortex_provider_mistral_7b_ft = Cortex(
    snowpark_session,
    model_engine="ft_mistral_7b_06182024",
)

# Bound to its own `_ft` name: reusing `f_groundedness_mistral_7b` would
# rebind the global that the baseline wrapper looks up at call time, silently
# switching the baseline wrapper to the fine-tuned model.
f_groundedness_mistral_7b_ft = Feedback(
    cortex_provider_mistral_7b_ft.groundedness_measure_with_cot_reasons,
    name="groundedness Mistral-7b finetuned",
)


def wrapped_groundedness_mistral_7b_finetuned(input, output) -> float:
    """Score `output` against `input` with the fine-tuned Mistral-7b feedback.

    Args:
        input: Source text.
        output: Hypothesis (summary) to check against the source.

    Returns:
        The first element of the feedback result, i.e. the score; the rest
        of the result is discarded.
    """
    return f_groundedness_mistral_7b_ft(input, output)[0]


# `f_mae` is the absolute-error feedback defined in an earlier cell.
tru_wrapped_groundedness_mistral_7b_ft = TruBasicApp(
    wrapped_groundedness_mistral_7b_finetuned,
    app_name="groundedness",
    app_version="Mistral-7b finetuned",
    feedbacks=[f_mae],
)

# Record one run per test case.
for case in test_split:
    source, response = case["query"], case["response"]

    with tru_wrapped_groundedness_mistral_7b_ft as recording:
        try:
            tru_wrapped_groundedness_mistral_7b_ft.app(source, response)
        except Exception as e:
            # Best effort: report the failure and continue with the next case.
            print(e)

In [None]:
# Show the leaderboard for the app versions recorded above.
session.get_leaderboard()

### Preliminary analysis:


We observe a slight improvement over the baseline with the Mixtral-8x7b model as our feedback provider.

Notice in the Mistral-7b case, finetuning on a limited set of training data (800 input-output pairs) actually results in worse performance. This could be attributed to the model overfitting the train set and thus failing to generalize well on the test set. 

In another test run (not shown in this notebook), we also see potentially under-training on Llama3-8B, as the train loss recorded was about 20x of the loss values we got here for Mixtral-7b.

Both of the above scenarios are expected with the current beta (private preview) version of the Cortex Fine-Tuning API, and we believe better results can be achieved in upcoming releases.



### Sanity check: some Statistical Exploration of the SummEval dataset

In [None]:
def _sum_and_collect(split):
    """Return (running total of the human scores, list of the human scores)."""
    total, scores = 0, []
    for case in split:
        total += case["human_score"]
        scores.append(case["human_score"])
    return total, scores


# For each split: print the raw score total, then turn it into the mean.
avg_score_test, test_human_scores = _sum_and_collect(test_split)
print(avg_score_test)
avg_score_test = avg_score_test / len(test_split)

avg_score_val, val_human_scores = _sum_and_collect(val_split)
print(avg_score_val)
avg_score_val = avg_score_val / len(val_split)

avg_score_train, train_human_scores = _sum_and_collect(train_split)
print(avg_score_train)
avg_score_train = avg_score_train / len(train_split)

# Mean human score per split, in the order train, test, validation.
print(avg_score_train, avg_score_test, avg_score_val)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))

# Overlay one semi-transparent 5-bin histogram per split.
for scores, split_name, colour in (
    (train_human_scores, "Train", "b"),
    (val_human_scores, "Validation", "g"),
    (test_human_scores, "Test", "r"),
):
    plt.hist(scores, bins=5, alpha=0.5, label=split_name, color=colour)

plt.xlabel("Scores")
plt.ylabel("Frequency")
plt.title("Distribution of Human Scores")
plt.legend(loc="upper right")
plt.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# One box per split, in the order train / validation / test.
data = [train_human_scores, val_human_scores, test_human_scores]
labels = ["Train", "Validation", "Test"]

plt.figure(figsize=(12, 6))

# Notched, filled boxes with the mean marked.
box = plt.boxplot(
    data, labels=labels, patch_artist=True, showmeans=True, notch=True
)

# Give every box its own fill colour.
colors = ["skyblue", "lightgreen", "lightcoral"]
for box_patch, fill in zip(box["boxes"], colors):
    box_patch.set_facecolor(fill)

# Dashed background grid.
plt.grid(True, linestyle="--", alpha=0.7)

# Scatter the individual scores over each box, jittered horizontally around
# the box position (1, 2, 3) so that overlapping points stay visible.
for position, scores in enumerate(data, start=1):
    jittered_x = np.random.normal(position, 0.04, size=len(scores))
    plt.plot(jittered_x, scores, "r.", alpha=0.6)

# Axis labels and title
plt.xlabel("Dataset")
plt.ylabel("Scores")
plt.title("Box Plot of Human Scores (1 to 5)")

# Display the plot
plt.show()