<a href="https://colab.research.google.com/github/withpi/cookbook-withpi/blob/main/colabs/Blog_Post_Writer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Blog Post Writer
This colab showcases how Pi can help build a blog post writer in the tone and style of an existing blog. For this demonstration we are using machinelearningmastery.com blogs as inspiration. A condensed version of these blogs was scraped and loaded into Hugging Face for this colab at:

  [withpi/machinelearningmastery_com_blogs_condensed](https://huggingface.co/datasets/withpi/machinelearningmastery_com_blogs_condensed)

Here is the overall flow of the colab:

1.   We will first create a Pi scoring system for the blog post writer
2.   Then we will evaluate a prompted model against this contract
3.   We will then fine tune a model, by picking high quality blogs from the hugging face dataset above using Pi

4.   Finally we will use Pi scoring system to evaluate the fine tuned model against the prompted model to see if we observe any improvement




# Install packages and utility functions
Here we install the Pi SDK and import a few additional things to help with this use case, including a dataset utility and functions that print scores and side-by-side comparisons more legibly.

In [None]:
# @title Install necessary packages
%%capture
%pip install withpi
%pip install datasets
%pip install litellm
%pip install httpx jinja2 tqdm

In [None]:
# @title Import utility functions for Pi SDK
%%capture

import os
from google.colab import files, userdata

# Load the notebook secret into the environment so the Pi Client can access it.
os.environ["WITHPI_API_KEY"] = userdata.get('WITHPI_API_KEY')

# Import a bunch of useful libraries for later.
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
import json
from pathlib import Path
import re

import datasets
import httpx
import litellm
import jinja2
from tqdm.notebook import tqdm
from withpi import PiClient
from withpi.types import Contract
from IPython.display import display
import pandas as pd

client = PiClient()


def print_contract(contract: Contract):
    """Print every dimension label of `contract`, each followed by the
    tab-indented descriptions of its sub-dimensions."""
    for dim in contract.dimensions:
        print(dim.label)
        descriptions = (sub.description for sub in dim.sub_dimensions)
        for text in descriptions:
            print(f"\t{text}")


def generate(system: str, user: str, model: str) -> str:
    """Send one system prompt and one user prompt to `model` through LiteLLM
    and return the message content of the first returned choice."""
    system_turn = {"content": system, "role": "system"}
    user_turn = {"content": user, "role": "user"}
    completion = litellm.completion(model=model, messages=[system_turn, user_turn])
    first_choice = completion.choices[0]
    return first_choice.message.content


class printer(str):
    """printer makes strings with embedded newlines print more nicely

    It is a `str` subclass whose repr is the string itself, so anything that
    shows the repr of the object shows the raw text (real line breaks, no
    surrounding quotes) instead of an escaped literal."""

    def __repr__(self):
        # Return the text unchanged rather than the default quoted/escaped form.
        return self


def print_response(response: str):
    """Display an LLM response through `printer` so that its newlines are
    shown as real line breaks."""
    rendered = printer(response)
    display(rendered)


def print_scores(pi_scores):
    """Print a Pi score response as plain text.

    Each dimension is printed as ``name: total`` followed by its tab-indented
    sub-dimension scores and a blank gap; a separator line and the overall
    total come last."""
    for dim_label, dim in pi_scores.dimension_scores.items():
        print(f"{dim_label}: {dim.total_score}")
        for sub_label, sub_score in dim.subdimension_scores.items():
            print(f"\t{sub_label}: {sub_score}")
        print("\n")

    separator = "---------------------"
    print(separator)
    print(f"Total score: {pi_scores.total_score}")


def save_file(filename: str, model: str):
    """Write the text `model` to `filename`, then hand that file to Colab's
    `files.download`."""
    destination = Path(filename)
    destination.write_text(model)
    files.download(filename)


def load_contract(url: str) -> Contract:
    """Fetch a Contract JSON blob from `url` and validate it.

    Args:
        url: Location of the JSON document describing the contract.

    Returns:
        The validated `Contract`.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        Whatever `Contract.model_validate_json` raises when the body is not a
        valid contract.
    """
    resp = httpx.get(url)
    # Surface HTTP failures directly instead of feeding an error page to the
    # JSON validator, which would report a misleading validation error.
    resp.raise_for_status()
    return Contract.model_validate_json(resp.content)


def load_and_split_dataset(url: str) -> datasets.DatasetDict:
    """Load the Parquet file at `url` as a single "train" dataset and return
    the result of a 90/10 train/test split of it."""
    full_dataset = datasets.load_dataset("parquet", data_files=url, split="train")
    return full_dataset.train_test_split(test_size=0.1)


def do_bulk_inference(dataset, system, model):
    """Run inference on the 'input' column of `dataset` with a shared system prompt.

    Each row's ``row["input"]`` is sent as the user prompt through `generate`
    (LiteLLM) on a pool of four worker threads.

    Args:
        dataset: Sized iterable of rows, each indexable by ``"input"``.
        system: System prompt used for every request.
        model: Model identifier passed on to `generate`.

    Returns:
        The generated strings, in the same order as the rows of `dataset`.

    Raises:
        Any exception raised by a worker is re-raised when its result is read.
    """

    def do_generate(user):
        result = generate(system, user, model)
        pbar.update(1)
        return result

    # The context manager closes the progress bar on success and on failure;
    # previously the bar was left open forever.
    with tqdm(total=len(dataset)) as pbar:
        # Leaving the executor block waits for every submitted task to finish.
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(do_generate, row["input"]) for row in dataset]
        return [future.result() for future in futures]


def do_bulk_templated_inference(dataset, optimized, model):
    """Run inference on the 'input' column of `dataset` using an optimized prompt.

    `optimized` should be a Jinja2 template as returned by DSPy. It is rendered
    with ``input=<row input>`` and the rendered text is parsed as a JSON list
    of chat messages. The text between the ``[[ ## response ## ]]`` and
    ``[[ ## completed ## ]]`` markers of the model output is returned.

    Args:
        dataset: Sized iterable of rows, each indexable by ``"input"``.
        optimized: Jinja2 template source producing a JSON message list.
        model: Model identifier passed to LiteLLM.

    Returns:
        The extracted responses, in the same order as the rows of `dataset`.

    Raises:
        ValueError: If a model output does not contain the expected markers.
        Any other exception raised by a worker is re-raised when its result
        is read.
    """
    prompt_template = jinja2.Template(optimized)
    result_extractor = re.compile(
        r".*\[\[ ## response ## \]\](.*)\[\[ ## completed ## \]\]", re.DOTALL
    )

    def do_generate(prompt: str) -> str:
        messages = json.loads(prompt_template.render(input=prompt))
        result = (
            litellm.completion(model=model, messages=messages)
            .choices[0]
            .message.content
        )
        pbar.update(1)

        # The model may ignore the output format; report that explicitly
        # instead of failing with "'NoneType' object has no attribute 'group'".
        found = result_extractor.match(result or "")
        if found is None:
            raise ValueError(
                "Model output is missing the '[[ ## response ## ]]' / "
                f"'[[ ## completed ## ]]' markers: {result!r}"
            )
        return found.group(1)

    # The context manager closes the progress bar on success and on failure.
    with tqdm(total=len(dataset)) as pbar:
        # Leaving the executor block waits for every submitted task to finish.
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(do_generate, row["input"]) for row in dataset]
        return [future.result() for future in futures]


def generate_table(
    job_id: str, training_data: dict, is_done: bool, additional_columns: dict[str, str]
):
    """Build the training-progress table as a pandas DataFrame.

    One row is produced per entry of `training_data` (step -> metrics dict).
    The fixed columns are Step, Epoch, Learning_Rate, Training_Loss and
    Eval_Loss; `additional_columns` maps extra column headers to the metric
    key they read. Metrics missing from a step are shown as "X". While the
    job is not done, a trailing "..." placeholder row is appended.
    """
    fixed_fields = (
        ("Epoch", "epoch"),
        ("Learning_Rate", "learning_rate"),
        ("Training_Loss", "loss"),
        ("Eval_Loss", "eval_loss"),
    )

    columns = {"Step": []}
    columns.update({header: [] for header, _ in fixed_fields})
    columns.update({header: [] for header in additional_columns})

    # Fixed metrics first, then the caller-supplied ones, in declaration order.
    field_pairs = [*fixed_fields, *additional_columns.items()]

    for step, metrics in training_data.items():
        columns["Step"].append(step)
        for header, key in field_pairs:
            columns[header].append(metrics.get(key, "X"))

    if not is_done:
        columns["Step"].append("...")
        for header, _ in field_pairs:
            columns[header].append("")

    return pd.DataFrame(columns)


def stream_response(job_id: str, method, additional_columns: dict[str, str]):
    """stream_response streams messages from the provided method

    method should be a Pi client object with `retrieve` and `stream_messages`
    endpoints.  This is primarily for convenience.

    A progress table built by `generate_table` is displayed once and then
    updated in place as JSON status lines arrive. The function loops until
    `retrieve` reports a state other than "QUEUED" or "RUNNING" and returns
    that final response object.

    Args:
        job_id: Identifier of the job to follow.
        method: Client endpoint group exposing `retrieve` and
            `with_streaming_response.stream_messages`.
        additional_columns: Extra table columns, mapping column header to the
            key read from each status record.
    """

    print(f"Training Status for {job_id}")

    # step -> merged metrics seen so far for that step
    training_data = defaultdict(dict)
    # Becomes True once at least one streamed line has been processed.
    is_log_console = False

    # display_id=True returns a handle that lets the table be redrawn in place.
    stream_output = display(
        generate_table(
            job_id, training_data, is_done=False, additional_columns=additional_columns
        ),
        display_id=True,
    )

    while True:
        response = method.retrieve(job_id=job_id)
        # Any state other than QUEUED/RUNNING ends the loop.
        if (response.state != "QUEUED") and (response.state != "RUNNING"):
            # Nothing was streamed in this call (e.g. the job had already
            # finished): rebuild the table from the stored status lines.
            if response.state == "DONE" and not is_log_console:
                for line in response.detailed_status:
                    try:
                        data_dict = json.loads(line)
                        training_data[data_dict["step"]].update(data_dict)
                    except Exception:
                        # Best effort: lines that are not JSON records with a
                        # "step" key are skipped.
                        pass
                stream_output.update(
                    generate_table(
                        job_id,
                        training_data,
                        is_done=True,
                        additional_columns=additional_columns,
                    )
                )
            return response

        # NOTE: `response` is rebound to the streaming response inside this
        # block; it is fetched again via `retrieve` on the next iteration.
        with method.with_streaming_response.stream_messages(
            job_id=job_id, timeout=None
        ) as response:
            is_done = False
            for line in response.iter_lines():
                if line == "DONE":
                    is_done = True
                try:
                    data_dict = json.loads(line)
                    training_data[data_dict["step"]].update(data_dict)
                except Exception:
                    # Best effort: lines that are not JSON records with a
                    # "step" key (such as "DONE") are skipped.
                    pass
                # Redraw after every line so progress shows up immediately.
                stream_output.update(
                    generate_table(
                        job_id,
                        training_data,
                        is_done,
                        additional_columns=additional_columns,
                    )
                )
                is_log_console = True


In [None]:
# @title Import a utility function to pretty print Pi scores
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

# Colour stops of the score gradient as (position in [0, 1], hex colour).
_SCORE_COLOR_STOPS = [
    (0.0, "#e74c3c"),  # Red
    (0.3, "#e67e22"),  # Orange
    (0.5, "#f1c40f"),  # Yellow
    (0.7, "#2ecc71"),  # Green-ish
    (1.0, "#27ae60"),  # Bright Green
]

# Built once instead of on every call. Passing the (position, colour) pairs
# makes the gradient honour the declared stop positions; previously only the
# colours were passed, so the stops were silently spaced evenly.
_SCORE_CMAP = LinearSegmentedColormap.from_list(
    "custom_colormap", _SCORE_COLOR_STOPS, N=256
)


def score_to_color(score):
    """Map a score to a hex colour on a red -> yellow -> green gradient.

    Args:
        score: Numeric score; values outside [0, 1] are clipped into range.

    Returns:
        A ``#rrggbb`` string taken from the gradient at the clipped score.
    """
    score = np.clip(score, 0, 1)  # Ensure score is within [0, 1]

    # The colormap yields RGBA floats in [0, 1]; alpha is dropped.
    red, green, blue = (int(channel * 255) for channel in _SCORE_CMAP(score)[:3])
    return f"#{red:02x}{green:02x}{blue:02x}"

def print_scores(pi_scores):
    """Render a Pi score response as an HTML table and return the markup.

    Despite the name, nothing is printed: the caller receives a string with
    one row per dimension, one row per sub-dimension and a final total row.
    Each score is rounded to three decimals and coloured by `score_to_color`.
    """
    fragments = ["""
  <style>
  table {
    border-collapse: collapse; /* Ensures borders don't double up */
    width: 100%; /* Optional: makes the table full width */
  }

  tr {
    border-bottom: 1px solid #ccc; /* Sets a bottom border for each row */
    border-top: 1px solid #ccc; /* Sets a bottom border for each row */
  }

  th, td {
    font-weight: bold;
    padding: 4px; /* Adds some spacing */
    text-align: left; /* Aligns text to the left */
    border-right: 1px solid #ccc; /* Sets a bottom border for each row */
    border-left: 1px solid #ccc; /* Sets a bottom border for each row */
  }
  img {
    width: 30%;
  }
  </style>
  <table>"""]

    for dim_label, dim in pi_scores.dimension_scores.items():
        fragments.append(
            f"<tr><td><b>{dim_label}</b></td><td></td><td style='color: {score_to_color(dim.total_score)}'>{round(dim.total_score, 3)}</td></tr>\n"
        )
        for sub_label, sub_score in dim.subdimension_scores.items():
            fragments.append(
                f"<tr><td></td><td style='font-weight: normal;'>{sub_label}</td><td style='color: {score_to_color(sub_score)}'>{round(sub_score, 3)}</td></tr>\n"
            )
        fragments.append("\n\n")

    fragments.append("<tr></tr>\n")
    fragments.append(
        f"<tr><td>Total score</td><td></td><td style='color: {score_to_color(pi_scores.total_score)}'><b>{round(pi_scores.total_score, 3)}</b></td></tr>\n"
    )
    fragments.append("</table>")
    return "".join(fragments)

In [None]:
# @title Import a utility function to pretty print side by sides with Pi scores
from IPython.core.display import display, HTML
import markdown


def pretty_print_responses(response1, response2=None, header=None, left_label="Base", right_label="Test", scores_left=None, scores_right=None, debug_left=None, debug_right=None):
    """Display two responses side by side as HTML.

    Both responses (and the optional header) are converted from Markdown.
    Below the two response panels, an optional row of score tables (built by
    `print_scores`) and an optional row of debug text are added when at least
    one side provides them.
    """
    left_body = markdown.markdown(response1)
    right_body = markdown.markdown(response2 if response2 else "")

    # Score objects are turned into HTML tables; falsy values pass through.
    left_scores = print_scores(scores_left) if scores_left else scores_left
    right_scores = print_scores(scores_right) if scores_right else scores_right

    sections = []

    if header:
        header_html = markdown.markdown(header)
        sections.append(f"""
      <div style="display: flex; gap: 40px;">
          <div style="width: 80%; padding: 30px; border: 1px solid #ddd; background-color: #fff9f5;">
              <h4>{header_html}</h4>
          </div>
      </div>""")

    sections.append(f"""
    <div style="display: flex; gap: 20px;">
        <div style="width: 40%; padding: 10px; border: 1px solid #ddd; background-color: #f0f0f0; text-align:center;">
            <h4>{left_label}</h4>
        </div>
        <div style="width: 40%; padding: 10px; border: 1px solid #ddd; background-color: #f0f0f0; text-align:center;">
            <h4>{right_label}</h4>
        </div>
    </div>
    <div style="display: flex; gap: 20px;">
        <div style="width: 40%; padding: 10px; border: 1px solid #ddd;">
            {left_body}
        </div>
        <div style="width: 40%; padding: 10px; border: 1px solid #ddd;">
            {right_body}
        </div>
    </div>
    """)

    if left_scores or right_scores:
        sections.append(f"""
        <div style="display: flex; gap: 20px;">
            <div style="width: 40%; padding: 10px; border: 1px solid #ddd;  background-color: #f2f1fe;">
                {left_scores or ""}
            </div>
            <div style="width: 40%; padding: 10px; border: 1px solid #ddd;  background-color: #f2f1fe;">
                {right_scores or ""}
            </div>
        </div>""")

    if debug_left or debug_right:
        sections.append(f"""
        <div style="display: flex; gap: 20px;">
            <div style="width: 40%; padding: 10px; border: 1px solid #ddd; background-color: #f0f0f0;">
                {debug_left or ""}
            </div>
            <div style="width: 40%; padding: 10px; border: 1px solid #ddd; background-color: #f0f0f0;">
                {debug_right or ""}
            </div>
        </div>""")

    display(HTML("".join(sections)))

# Define your scoring system
This is where we define the set of criteria that we want to use to assess and guide the quality of our blog post generation. We'll focus on a couple categories of quality. We will use the "Contract" functions in the Pi SDK for this.


*   **Content structure:** is the post easy to digest and engaging, and does it guide the user to additional resources?
*   **Technical communication:** does the post use effective code examples and communicate any potential implementation pitfalls or mistakes?



In [None]:
# @title Initialize the Pi scoring system from a JSON description
from withpi.types import Contract

blog_writer_scoring_system_json = """
{
  "description": "A streamlined rubric for evaluating technical blog post quality",
  "name": "Technical Blog Post Quality Assessment",
  "dimensions": [
    {
      "description": "Evaluates the content structure of the blog post",
      "label": "Content Structure",
      "sub_dimensions": [
        {
          "description": "Are there visual breaks (images, code snippets, bullet points) to break up the text?",
          "label": "Visual breaks",
          "scoring_type": "PI_SCORER",
          "action_dimension": null,
          "custom_model_id": null,
          "huggingface_url": null,
          "parameters": null,
          "python_code": null,
          "weight": 1
        },
        {
          "description": "Does the blog post address the reader in second person (you, your etc.)?",
          "label": "Second person",
          "scoring_type": "PI_SCORER",
          "action_dimension": null,
          "custom_model_id": null,
          "huggingface_url": null,
          "parameters": null,
          "python_code": null,
          "weight": 1
        },
        {
          "description": "Does the post include links to additional resources or references?",
          "label": "Additional resources",
          "scoring_type": "PI_SCORER",
          "action_dimension": null,
          "custom_model_id": null,
          "huggingface_url": null,
          "parameters": null,
          "python_code": null,
          "weight": 1
        },
        {
          "description": "Are there consistent section headings throughout the post?",
          "label": "Section headings",
          "scoring_type": "PI_SCORER",
          "action_dimension": null,
          "custom_model_id": null,
          "huggingface_url": null,
          "parameters": null,
          "python_code": null,
          "weight": 1
        }
      ],
      "action_dimension": null,
      "weight": 1
    },
    {
      "description": "Evaluates the technical communication of the blog post",
      "label": "Technical Communication",
      "sub_dimensions": [
        {
          "description": "Are code examples included where relevant?",
          "label": "Code inclusion",
          "scoring_type": "PI_SCORER",
          "action_dimension": null,
          "custom_model_id": null,
          "huggingface_url": null,
          "parameters": null,
          "python_code": null,
          "weight": 1
        },
        {
          "description": "Does the post explain the code snippets when they are included?",
          "label": "Code explanation",
          "scoring_type": "PI_SCORER",
          "action_dimension": null,
          "custom_model_id": null,
          "huggingface_url": null,
          "parameters": null,
          "python_code": null,
          "weight": 1
        },
        {
          "description": "Does the post call out potential pitfalls or common mistakes?",
          "label": "Pitfalls",
          "scoring_type": "PI_SCORER",
          "action_dimension": null,
          "custom_model_id": null,
          "huggingface_url": null,
          "parameters": null,
          "python_code": null,
          "weight": 1
        }
      ],
      "action_dimension": null,
      "weight": 1
    }
  ]
}
"""
# Parse and validate the JSON description above into a Contract object; the
# scoring and fine-tuning cells below pass this object to the Pi client.
blog_writer_scoring_system = Contract.model_validate_json(blog_writer_scoring_system_json)

# Try Generating Blog Posts
Once we have a scoring system, let's assess how well prompting a model works for generating blog posts. We will:


1. Define a system prompt
2. Prompt a Llama model to generate responses for a set of user prompts
3. Use our scoring system to compare the generated outputs against a set of actual [blog posts from MachineLearningMastery.com that we'd previously scraped and stored in HuggingFace](https://huggingface.co/datasets/withpi/mlmastery_com_blogs_condensed_merged)
4. Manually inspect some of the differences in the above



In [None]:
# @title Define a system prompt for a blog post generator
system_prompt_for_blog_writer = """
You are a specialized blog post writer. Given a topic, write a technical blog post. Here are specific instructions:
- Make sure that the blog is approximately under 500 words
- The blog should be technical in nature with clear instructions
"""

In [None]:
# @title Define a blog post generator
import litellm
import asyncio

async def generate_blogs(topics, system_prompt, model_id, api_base, api_key, concurrency_limit=5):
    """Generate blogs for all topics with TaskGroup and rate limiting"""
    # Create a semaphore to limit concurrent API calls
    semaphore = asyncio.Semaphore(concurrency_limit)

    async def generate_single_blog(topic, index):
        """Process a single blog generation with rate limiting"""
        async with semaphore:
            try:
                response = await litellm.acompletion(
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": topic},
                    ],
                    model=model_id,
                    api_base=api_base,
                    api_key=api_key,
                    temperature=0.2,
                )
                generated_blog = response.choices[0].message.content
                print(f"Generated a blog for topic# {index}: {topic}")
                return generated_blog
            except Exception as e:
                print(f"Error generating blog for topic# {index}: {e}")
                return f"Error: {str(e)}"

    generated_blogs = []

    # Using TaskGroup for cleaner task management
    async with asyncio.TaskGroup() as tg:
        tasks = [
            tg.create_task(generate_single_blog(topic, i + 1))
            for i, topic in enumerate(topics)
        ]

    # Collect results in the same order as topics
    for task in tasks:
        generated_blogs.append(task.result())

    print("Done generating blogs!!")
    return generated_blogs

In [None]:
# @title Generate blogs using an untrained model for evaluation
from datasets import load_dataset
from google.colab import userdata

ds = load_dataset("withpi/mlmastery_com_blogs_condensed_merged", split="test")
topics = ds["topic"]
actual_blogs = ds["blog"]

# Generate the blogs using an untrained llama 8B
loop = asyncio.get_running_loop()
generated_blogs = await loop.create_task(
    generate_blogs(
        topics,
        model_id="fireworks_ai/llama-v3p1-8b-instruct",
        api_key=userdata.get("FIREWORKS_API_KEY"),
        api_base = None,
        system_prompt=system_prompt_for_blog_writer
    )
)

README.md:   0%|          | 0.00/403 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/322k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/305 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/78 [00:00<?, ? examples/s]

Generated a blog for topic# 5: 7-Day Mini-Course on Practical Data Science: From Linear Regression to Random Forests with Real-World Country Data Analysis
Generated a blog for topic# 4: Evaluating RAG Systems: An Overview of RAGAs and Other Frameworks for Measuring Retrieval Augmented Generation Performance
Generated a blog for topic# 2: Topic: 7 Beginner-Friendly Machine Learning Projects for Hands-On Experience - From Titanic Survival Prediction to Face Detection
Generated a blog for topic# 7: Topic: Implementing Inpainting and Outpainting Techniques with Stable Diffusion Web UI - A Comprehensive Guide to Image Restoration and Expansion
Generated a blog for topic# 1: InterviewAce: 365 Data Science's Free AI-Powered Tool for Data Science Interview Preparation
Generated a blog for topic# 10: Comparing different imputation techniques for handling missing data in machine learning pipelines, including SimpleImputer, IterativeImputer, and KNNImputer, with practical implementation examples 

In [None]:
# @title Compare the generated blogs against actual blogs using the Pi scoring system
from tqdm import tqdm
import pandas as pd

# Score every reference post and its generated counterpart with the same
# scoring system so that the totals are directly comparable.
scores = []
actual_scores = []
generated_scores = []

# strict=True fails loudly if the three lists differ in length instead of
# silently dropping the unmatched tail; total= gives tqdm a real progress bar.
paired_rows = zip(topics, actual_blogs, generated_blogs, strict=True)
for topic, actual_blog, generated_blog in tqdm(paired_rows, total=len(topics)):
    actual_score = client.contracts.score(
        llm_input=topic,
        llm_output=actual_blog,
        contract=blog_writer_scoring_system)
    generated_score = client.contracts.score(
        llm_input=topic,
        llm_output=generated_blog,
        contract=blog_writer_scoring_system)
    # Full score objects are kept for the side-by-side inspection cell.
    actual_scores.append(actual_score)
    generated_scores.append(generated_score)
    scores.append({
        'topic': topic,
        'actual': actual_score.total_score,
        'generated': generated_score.total_score,
    })

df = pd.DataFrame(scores)
df

78it [00:41,  1.86it/s]


Unnamed: 0,topic,actual,generated
0,InterviewAce: 365 Data Science's Free AI-Power...,0.452231,0.245919
1,Topic: 7 Beginner-Friendly Machine Learning Pr...,0.468032,0.169446
2,A comprehensive guide to Hugging Face's Model ...,0.601400,0.351490
3,Evaluating RAG Systems: An Overview of RAGAs a...,0.514176,0.171895
4,7-Day Mini-Course on Practical Data Science: F...,0.769043,0.446867
...,...,...,...
73,Packaging and Deploying Python Projects: From ...,0.785645,0.404875
74,Creating and Customizing Dataset Classes in Py...,0.612981,0.371870
75,Implementing Dropout Regularization in PyTorch...,0.804199,0.461594
76,Building and Training a Single Layer Neural Ne...,0.726074,0.471703


In [None]:
# @title Manually inspect actual and generated blogs with scores
def pretty_print_blog(i):
  """Show the actual and the generated blog for test topic `i` side by side,
  together with their Pi scores."""

  def unquote(text):
    # Drop surrounding double quotes and turn literal "\n" sequences into
    # real newlines.
    return text.strip("\"").replace("\\n", "\n")

  pretty_print_responses(
      response1=unquote(actual_blogs[i]),
      response2=unquote(generated_blogs[i]),
      header="##### Topic: \n" + unquote(topics[i]),
      left_label="Base (actual)",
      right_label="Test (generated)",
      scores_left=actual_scores[i],
      scores_right=generated_scores[i])

# Inspect the first ten test topics.
for i in range(10):
  pretty_print_blog(i)
  print("\n\n")
  print("\n\n")

0,1,2
Content Structure,,0.851
,Visual breaks,1.0
,Second person,0.402
,Additional resources,1.0
,Section headings,1.0
Technical Communication,,0.054
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.161
,,

0,1,2
Content Structure,,0.431
,Visual breaks,0.0
,Second person,0.535
,Additional resources,0.188
,Section headings,1.0
Technical Communication,,0.061
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.183
,,







0,1,2
Content Structure,,0.936
,Visual breaks,0.742
,Second person,1.0
,Additional resources,1.0
,Section headings,1.0
Technical Communication,,0.001
,Code inclusion,0.001
,Code explanation,0.0
,Pitfalls,0.0
,,

0,1,2
Content Structure,,0.339
,Visual breaks,0.0
,Second person,0.355
,Additional resources,0.0
,Section headings,1.0
Technical Communication,,0.0
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.0
,,







0,1,2
Content Structure,,0.922
,Visual breaks,0.973
,Second person,0.773
,Additional resources,1.0
,Section headings,0.941
Technical Communication,,0.281
,Code inclusion,0.738
,Code explanation,0.097
,Pitfalls,0.008
,,

0,1,2
Content Structure,,0.678
,Visual breaks,0.0
,Second person,0.996
,Additional resources,0.715
,Section headings,1.0
Technical Communication,,0.025
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.076
,,







0,1,2
Content Structure,,0.991
,Visual breaks,1.0
,Second person,0.965
,Additional resources,1.0
,Section headings,1.0
Technical Communication,,0.037
,Code inclusion,0.003
,Code explanation,0.0
,Pitfalls,0.108
,,

0,1,2
Content Structure,,0.25
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,1.0
Technical Communication,,0.094
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.281
,,







0,1,2
Content Structure,,0.812
,Visual breaks,0.754
,Second person,1.0
,Additional resources,0.57
,Section headings,0.922
Technical Communication,,0.727
,Code inclusion,0.965
,Code explanation,0.668
,Pitfalls,0.547
,,

0,1,2
Content Structure,,0.309
,Visual breaks,0.0
,Second person,0.236
,Additional resources,0.0
,Section headings,1.0
Technical Communication,,0.585
,Code inclusion,1.0
,Code explanation,0.754
,Pitfalls,0.0
,,







0,1,2
Content Structure,,0.869
,Visual breaks,0.992
,Second person,0.482
,Additional resources,1.0
,Section headings,1.0
Technical Communication,,0.199
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.598
,,

0,1,2
Content Structure,,0.5
,Visual breaks,0.0
,Second person,0.0
,Additional resources,1.0
,Section headings,1.0
Technical Communication,,0.156
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.469
,,







0,1,2
Content Structure,,1.0
,Visual breaks,1.0
,Second person,1.0
,Additional resources,1.0
,Section headings,1.0
Technical Communication,,0.158
,Code inclusion,0.395
,Code explanation,0.003
,Pitfalls,0.077
,,

0,1,2
Content Structure,,0.307
,Visual breaks,0.222
,Second person,0.007
,Additional resources,0.0
,Section headings,1.0
Technical Communication,,0.0
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.0
,,







0,1,2
Content Structure,,0.879
,Visual breaks,0.762
,Second person,0.754
,Additional resources,1.0
,Section headings,1.0
Technical Communication,,0.776
,Code inclusion,1.0
,Code explanation,0.957
,Pitfalls,0.371
,,

0,1,2
Content Structure,,0.2
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,0.801
Technical Communication,,0.544
,Code inclusion,0.723
,Code explanation,0.566
,Pitfalls,0.342
,,







0,1,2
Content Structure,,0.758
,Visual breaks,1.0
,Second person,0.031
,Additional resources,1.0
,Section headings,1.0
Technical Communication,,0.003
,Code inclusion,0.007
,Code explanation,0.0
,Pitfalls,0.0
,,

0,1,2
Content Structure,,0.25
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,1.0
Technical Communication,,0.0
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.0
,,







0,1,2
Content Structure,,0.996
,Visual breaks,1.0
,Second person,1.0
,Additional resources,1.0
,Section headings,0.984
Technical Communication,,0.802
,Code inclusion,1.0
,Code explanation,0.648
,Pitfalls,0.758
,,

0,1,2
Content Structure,,0.25
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,1.0
Technical Communication,,0.599
,Code inclusion,0.875
,Code explanation,0.516
,Pitfalls,0.406
,,







# Fine Tune a Better Blog Post Generator
Now that we've seen that the MachineLearningMastery posts are still significantly better than our prompt generated posts, let's see if we can capture some of that goodness by fine tuning our own model with examples from the original blog. To do so we will:

1. Download a [dataset of previously scraped posts from HuggingFace](https://huggingface.co/datasets/withpi/mlmastery_com_blogs_condensed_merged)

2. Filter the dataset to **just the posts that perform really well per our scoring system** (this is the special sauce)

3. Plug that data into our fine tuning SDK endpoint, which will show us a running log as Fine Tuning improves the model's performance on our scoring system

In [None]:
# @title Prepare training data for Fine-tuning by filtering low scoring blogs (<0.7)
from datasets import load_dataset
import pandas

def score(topic:str, blog:str):
  """Return the total Pi score of `blog` for `topic` under the blog writer
  scoring system."""
  result = client.contracts.score(
      llm_input=topic,
      llm_output=blog,
      contract=blog_writer_scoring_system)
  return result.total_score

# Score every training post, then keep only those scoring above 0.7.
ds = (
    load_dataset("withpi/mlmastery_com_blogs_condensed_merged", split="train")
    .map(lambda x: {"score": score(x["topic"], x["blog"])})
    .filter(lambda x: x["score"] > 0.7)
)
df = ds.to_pandas()
df



Map:   0%|          | 0/305 [00:00<?, ? examples/s]

Filter:   0%|          | 0/305 [00:00<?, ? examples/s]

Unnamed: 0,topic,blog,score
0,"The topic of this blog post is: ""Understanding...",# The Da Vinci Code of Data: Mastering The Dat...,0.736165
1,Cross-validation techniques for comprehensive ...,# From Train-Test to Cross-Validation: Advanci...,0.960286
2,Automated Feature Engineering in PyCaret: Stre...,# Automated Feature Engineering in PyCaret\n\n...,0.806478
3,Strategies and techniques for handling imbalan...,# Tips for Handling Imbalanced Data in Machine...,0.836202
4,Finding the optimal feature subset for linear ...,# The Search for the Sweet Spot in a Linear Re...,0.910970
...,...,...,...
131,Visualizing PyTorch Model Architectures Using ...,# Visualizing a PyTorch Model\n\nBy [Adrian Ta...,0.849609
132,Understanding and Working with One-Dimensional...,# One-Dimensional Tensors in Pytorch\n\nBy [Mu...,0.723145
133,Making Predictions with Keras: A Guide to Clas...,# How to Make Predictions with Keras\n\nBy [Ja...,0.814941
134,Visualizing and Interpreting Model Training Me...,# Understand Model Behavior During Training by...,0.926432


In [None]:
# @title [SLOW - will run for 80+ minutes] Fine tune the model based on the above training data
# Launch supervised fine-tuning on the filtered posts: each example pairs a
# topic (input) with its high-scoring blog (output).
status = client.model.sft.start_job(
    scoring_system=blog_writer_scoring_system,
    examples=[
        {"llm_input": topic, "llm_output": blog}
        for topic, blog in zip(ds["topic"], ds["blog"])
    ],
    base_sft_model="LLAMA_3.1_8B",
    lora_config={"lora_rank": "R_16"},
    system_prompt=system_prompt_for_blog_writer,
    num_train_epochs=10,
)
print(status)

SftStatus(detailed_status=['LAUNCHING'], job_id='sft_jobs:babf2fa45c086088a9e43d648f8ef22e58d7584ce21f17fda4509f2421d84c4c:bf8154b7-77b1-485e-88b5-54d2444b7037', state='QUEUED', trained_models=[])


In [None]:
# @title Monitor the fine-tuning job for completion (watch the Eval_Pi_Score increase!)
# Job to monitor. NOTE(review): this is the id of a previously launched job;
# if you started your own job in the cell above, use `status.job_id` instead.
SFT_JOB_ID = "sft_jobs:babf2fa45c086088a9e43d648f8ef22e58d7584ce21f17fda4509f2421d84c4c:bf8154b7-77b1-485e-88b5-54d2444b7037"

# Blocks until the job leaves the QUEUED/RUNNING states, updating the
# progress table (with the Pi score column) as status lines arrive.
response = stream_response(
    SFT_JOB_ID,
    client.model.sft,
    additional_columns={"Eval_Pi_Score": "contract_score"},
)
if response.state == "ERROR":
    # Show only the tail of the status log, where the failure is reported.
    failure_tail = "\n".join(response.detailed_status[-5:])
    print(f"The job failed due to:\n{failure_tail}")
elif response.trained_models:
    print(f"SFT model = {response.trained_models[0].model_dump_json(indent=2)}")
else:
    # A job can end without a model (any terminal state other than ERROR with
    # an empty list); indexing [0] here used to raise IndexError.
    print(f"The job ended in state {response.state} without producing a trained model.")

# Test Out & Evaluate Your Fine Tuned Generator

Now our new model is ready to be tested out!

1. First, we'll generate blog posts for the same topics we were looking at before

2. Then we'll score all of these blog posts so we can compare them to the generations by our prompted model

3. Then we'll look at some of the individual examples and their scores Side by Side so we can see how much fine tuning improved our blog posts

In [None]:
# @title Generate blogs using the fine tuned model for evaluation
from datasets import load_dataset
from google.colab import userdata

ds = load_dataset("withpi/mlmastery_com_blogs_condensed_merged", split = "test")
topics = ds["topic"]

# Generate the blogs using fine tuned llama 8B
client.model.sft.load(SFT_JOB_ID)

# Wait for the model to be loaded
while not (client.model.sft.retrieve(GRPO_JOB_ID).trained_models[0].serving_state == "SERVING"):
    time.sleep(3)

loop = asyncio.get_running_loop()
new_generated_blogs = await loop.create_task(
    generate_blogs(
        topics,
        model_id="fireworks_ai/0",
        api_base=f"https://api.withpi.ai/v1/model/sft/{SFT_JOB_ID}",
        api_key=os.environ["WITHPI_API_KEY"],
        system_prompt=system_prompt_for_blog_writer
    )
)

InternalServerError: Error code: 500 - {'exception': ['ValueError: Command failed with exit code 1\n']}

In [None]:
# @title Compare the newly generated blogs against previous ones using the Pi scoring system
import pandas as pd
from tqdm import tqdm

# Per-topic summary rows for the comparison table, plus the full score objects
# for each model (kept so they can be inspected in detail afterwards).
scores = []
generated_scores = []
new_generated_scores = []

blog_pairs = zip(topics, generated_blogs, new_generated_blogs)
for topic, baseline_blog, finetuned_blog in tqdm(blog_pairs):
  # Score the prompted-model blog first, then the fine-tuned one, against the
  # same scoring system so the totals are directly comparable.
  baseline_result = client.contracts.score(
      llm_input=topic,
      llm_output=baseline_blog,
      contract=blog_writer_scoring_system,
  )
  finetuned_result = client.contracts.score(
      llm_input=topic,
      llm_output=finetuned_blog,
      contract=blog_writer_scoring_system,
  )
  generated_scores.append(baseline_result)
  new_generated_scores.append(finetuned_result)
  scores.append({
      'topic': topic,
      'generated': baseline_result.total_score,
      'new generated': finetuned_result.total_score,
  })

# Leave the frame as the cell's last expression so the notebook renders it.
df = pd.DataFrame(scores)
df

78it [00:16,  4.59it/s]


Unnamed: 0,topic,generated,new generated
0,InterviewAce: 365 Data Science's Free AI-Power...,0.125428,0.000020
1,Topic: 7 Beginner-Friendly Machine Learning Pr...,0.188105,0.000781
2,A comprehensive guide to Hugging Face's Model ...,0.296890,0.006804
3,Evaluating RAG Systems: An Overview of RAGAs a...,0.125052,0.001931
4,7-Day Mini-Course on Practical Data Science: F...,0.414086,0.000090
...,...,...,...
73,Packaging and Deploying Python Projects: From ...,0.304791,0.048944
74,Creating and Customizing Dataset Classes in Py...,0.321662,0.045525
75,Implementing Dropout Regularization in PyTorch...,0.389350,0.035131
76,Building and Training a Single Layer Neural Ne...,0.440276,0.037730


In [None]:
# @title Manually inspect new generated blogs against previous ones with scores
def pretty_print_blog(i):
  """Show the prompted and fine-tuned blogs for topic ``i`` side by side.

  The prompted-model blog and its scores go on the left, the fine-tuned
  model's blog and scores on the right, under a header with the topic.
  """

  def _unescape(text):
    # Trim leading/trailing double quotes and expand literal "\n" sequences
    # into real newlines so the text renders as written.
    return text.strip("\"").replace("\\n", "\n")

  base_text = _unescape(generated_blogs[i])
  test_text = _unescape(new_generated_blogs[i])
  topic_header = "##### Topic: \n" + _unescape(topics[i])

  pretty_print_responses(
      response1=base_text,
      response2=test_text,
      header=topic_header,
      left_label="Base (generated)",
      right_label="Test (new generated)",
      scores_left=generated_scores[i],
      scores_right=new_generated_scores[i],
  )

# Display the first ten topics, separated by blank lines.
for blog_index in range(10):
  pretty_print_blog(blog_index)
  print("\n\n")

0,1,2
Content Structure,,0.251
,Visual breaks,0.0
,Second person,0.003
,Additional resources,0.0
,Section headings,1.0
Technical Communication,,0.0
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.0
,,

0,1,2
Content Structure,,0.0
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,0.0
Technical Communication,,0.0
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.0
,,







0,1,2
Content Structure,,0.376
,Visual breaks,0.0
,Second person,0.504
,Additional resources,0.0
,Section headings,1.0
Technical Communication,,0.0
,Code inclusion,0.001
,Code explanation,0.0
,Pitfalls,0.0
,,

0,1,2
Content Structure,,0.0
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,0.0
Technical Communication,,0.002
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.004
,,







0,1,2
Content Structure,,0.5
,Visual breaks,0.0
,Second person,1.0
,Additional resources,0.0
,Section headings,1.0
Technical Communication,,0.094
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.281
,,

0,1,2
Content Structure,,0.0
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,0.0
Technical Communication,,0.014
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.041
,,







0,1,2
Content Structure,,0.25
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,1.0
Technical Communication,,0.0
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.0
,,

0,1,2
Content Structure,,0.0
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,0.0
Technical Communication,,0.004
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.011
,,







0,1,2
Content Structure,,0.399
,Visual breaks,0.0
,Second person,0.598
,Additional resources,0.0
,Section headings,1.0
Technical Communication,,0.429
,Code inclusion,0.742
,Code explanation,0.543
,Pitfalls,0.001
,,

0,1,2
Content Structure,,0.0
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,0.0
Technical Communication,,0.0
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.0
,,







0,1,2
Content Structure,,0.25
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,1.0
Technical Communication,,0.184
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.551
,,

0,1,2
Content Structure,,0.0
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,0.0
Technical Communication,,0.0
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.0
,,







0,1,2
Content Structure,,0.25
,Visual breaks,0.0
,Second person,0.002
,Additional resources,0.0
,Section headings,1.0
Technical Communication,,0.0
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.0
,,

0,1,2
Content Structure,,0.0
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,0.0
Technical Communication,,0.074
,Code inclusion,0.0
,Code explanation,0.027
,Pitfalls,0.195
,,







0,1,2
Content Structure,,0.25
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,1.0
Technical Communication,,0.521
,Code inclusion,0.801
,Code explanation,0.609
,Pitfalls,0.154
,,

0,1,2
Content Structure,,0.0
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,0.0
Technical Communication,,0.001
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.003
,,







0,1,2
Content Structure,,0.251
,Visual breaks,0.0
,Second person,0.003
,Additional resources,0.0
,Section headings,1.0
Technical Communication,,0.0
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.0
,,

0,1,2
Content Structure,,0.0
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,0.0
Technical Communication,,0.0
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.001
,,







0,1,2
Content Structure,,0.25
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,1.0
Technical Communication,,0.637
,Code inclusion,0.93
,Code explanation,0.578
,Pitfalls,0.402
,,

0,1,2
Content Structure,,0.0
,Visual breaks,0.0
,Second person,0.0
,Additional resources,0.0
,Section headings,0.0
Technical Communication,,0.002
,Code inclusion,0.0
,Code explanation,0.0
,Pitfalls,0.006
,,





