## Format Validation and Transformation

In [None]:
import json

# Input and output file paths for the format-validation step
input_file = "raw_data.jsonl"  # Raw JSONL input; replace with the path to your own file
output_file = "formatted_data.jsonl"  # Destination for the normalised prompt/completion records

# Function to validate and transform data
def validate_and_transform(input_path, output_path):
    """Normalise a raw JSONL file into ``{"prompt", "completion"}`` records.

    Each input line must be a JSON object holding a question under
    ``"question"`` or ``"prompt"`` and an answer under ``"answer"`` or
    ``"completion"``. Both values must be strings that are non-empty after
    stripping surrounding whitespace. Lines that fail to parse or validate
    are reported on stdout and skipped; they never abort the run.

    Args:
        input_path: Path of the raw JSONL file to read (UTF-8).
        output_path: Path of the JSONL file to write (UTF-8, overwritten).
    """
    with open(input_path, "r", encoding="utf-8") as infile, \
            open(output_path, "w", encoding="utf-8") as outfile:
        for line_number, line in enumerate(infile, start=1):
            try:
                # Parse the JSON line
                data = json.loads(line)

                # A bare list/string/number has no fields to look up.
                if not isinstance(data, dict):
                    raise ValueError("Expected a JSON object")

                # Accept either naming scheme for the two required fields
                question = data.get("question") or data.get("prompt")
                answer = data.get("answer") or data.get("completion")

                # Non-string values (numbers, lists, null) cannot be stripped
                # and would otherwise crash the whole run.
                if not isinstance(question, str) or not isinstance(answer, str):
                    raise ValueError("Missing or non-string 'question' or 'answer' fields")

                # Strip before the emptiness check so whitespace-only values
                # are rejected instead of being written as empty strings.
                question = question.strip()
                answer = answer.strip()
                if not question or not answer:
                    raise ValueError("Empty 'question' or 'answer' fields")

                # Transform into the expected format
                transformed_data = {
                    "prompt": question,
                    "completion": answer,
                }

                # Write the transformed data to the output file
                outfile.write(json.dumps(transformed_data, ensure_ascii=False) + "\n")

            # JSONDecodeError is a ValueError subclass, so it must come first.
            except json.JSONDecodeError:
                print(f"Invalid JSON format on line {line_number}. Skipping...")
            except ValueError as e:
                print(f"Validation error on line {line_number}: {e}. Skipping...")

    print(f"Format validation and transformation complete. Output saved to '{output_path}'.")

# Run the validation and transformation on the paths configured above;
# this reads input_file and (over)writes output_file.
validate_and_transform(input_file, output_file)

## Special Token and Artifact Removal

In [None]:
import json
import re

# Input and output file paths for the cleaning step
input_file = "formatted_data.jsonl"  # JSONL with "prompt"/"completion" fields; replace with your own file
output_file = "cleaned_data.jsonl"  # Destination for the cleaned records

# Function to clean text
def clean_text(text):
    """Strip tokenizer artefacts, markup and unwanted symbols from *text*.

    The steps run in a fixed order: remove ``[CLS]``/``[SEP]`` tokens, remove
    anything shaped like an HTML tag, turn newlines/tabs/carriage returns into
    spaces, collapse runs of two or more ``!``, ``?`` or ``.`` into a single
    period, remove Greek letters, remove every remaining character that is not
    a word character, whitespace or one of ``. , ! ? ' -``, collapse repeated
    whitespace, and trim the ends.

    Args:
        text: The string to clean. Any falsy value (``None``, ``""``) yields ``""``.

    Returns:
        The cleaned string.
    """
    if not text:
        return ""

    # Remove the special tokens [CLS] and [SEP]
    text = re.sub(r"\[CLS\]|\[SEP\]", "", text)

    # Remove HTML tags like <a>, <div>, etc.
    text = re.sub(r"<[^>]+>", "", text)

    # Replace newlines, tabs and carriage returns with a space
    text = re.sub(r"[\n\t\r]", " ", text)

    # Collapse runs of sentence punctuation ("!!!", "...", "?!") into one period
    text = re.sub(r"[!?.]{2,}", ".", text)

    # Remove Greek letters explicitly: for str patterns \w is Unicode-aware,
    # so the negated class below treats them as word characters and keeps them.
    # Ranges: Greek and Coptic (U+0370-U+03FF) and Greek Extended (U+1F00-U+1FFF).
    text = re.sub(r"[\u0370-\u03FF\u1F00-\u1FFF]", "", text)

    # Remove remaining non-standard symbols (anything that is not a word
    # character, whitespace, or one of . , ! ? ' -)
    text = re.sub(r"[^\w\s.,!?'-]", "", text)

    # Replace multiple whitespace characters with a single space
    text = re.sub(r"\s{2,}", " ", text)

    # Trim leading and trailing whitespace
    return text.strip()

# Function to clean and transform JSONL data
def clean_and_transform(input_path, output_path):
    """Run ``clean_text`` over the prompt and completion of every JSONL record.

    Reads *input_path* line by line, replaces the ``"prompt"`` and
    ``"completion"`` values of each record with their cleaned versions (a
    missing field is treated as an empty string) and writes the record to
    *output_path*. A line that cannot be parsed, or that fails for any other
    reason, is reported on stdout and left out of the output.
    """
    with open(input_path, "r", encoding="utf-8") as infile, \
            open(output_path, "w", encoding="utf-8") as outfile:
        for line_number, line in enumerate(infile, start=1):
            try:
                record = json.loads(line)

                # Clean both text fields in place, prompt first
                for field in ("prompt", "completion"):
                    record[field] = clean_text(record.get(field, ""))

                serialized = json.dumps(record, ensure_ascii=False)
                outfile.write(serialized + "\n")

            except json.JSONDecodeError:
                print(f"Invalid JSON format on line {line_number}. Skipping...")
            except Exception as e:
                print(f"Error on line {line_number}: {e}. Skipping...")

    print(f"Special token and artifact removal complete. Output saved to '{output_path}'.")

# Run the cleaning step on the paths configured above;
# this reads input_file and (over)writes output_file.
clean_and_transform(input_file, output_file)

## Token Distribution Analysis

In [None]:
# This cell targets Google Colab, where the Hugging Face token is stored as a Colab secret.
# Request access to Meta's Llama model repository on Hugging Face and create a token before running it.
from google.colab import userdata
import os

# Retrieve the token from Colab Secrets.
# The secret must exist in your Colab environment; replace 'hf_token' with the key you chose.
hf_token = userdata.get('hf_token')

# Set it as an environment variable, which Hugging Face libraries will automatically pick up
os.environ["HF_TOKEN"] = hf_token

In [None]:
from transformers import AutoTokenizer
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Hugging Face model ID
base_model_id = "meta-llama/Llama-3.1-8B-Instruct"

# Load the tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
print("Tokenizer loaded successfully.")

# Path to your JSONL file
jsonl_file_path = "cleaned_data.jsonl"  # Replace with your file path

# Store token counts
sample_token_counts = []
sample_lines_data = []  # Store full dataset lines, index-aligned with sample_token_counts

# Process JSONL file line by line
with open(jsonl_file_path, "r", encoding="utf-8") as file:
    for line_number, line in enumerate(file, start=1):
        # Report and skip malformed or blank lines, as the earlier pipeline
        # stages do, instead of aborting the whole analysis.
        try:
            data = json.loads(line)  # Parse JSON line
        except json.JSONDecodeError:
            print(f"Invalid JSON format on line {line_number}. Skipping...")
            continue

        instruction = data.get("prompt", "")  # Get instruction text
        response = data.get("completion", "")  # Get response text

        # Compute total token count (instruction + response).
        # Each field is encoded separately with add_special_tokens=True, so any
        # special tokens the tokenizer adds are counted once per field.
        total_tokens = len(tokenizer.encode(instruction, add_special_tokens=True)) + \
                       len(tokenizer.encode(response, add_special_tokens=True))

        sample_token_counts.append(total_tokens)
        sample_lines_data.append(data)  # Store full JSON object

# Convert token count list to NumPy array
sample_token_counts = np.array(sample_token_counts)

# Fail early with a readable message: the NumPy reductions below raise an
# opaque "zero-size array" ValueError when no samples were read.
if sample_token_counts.size == 0:
    raise ValueError(f"No samples found in '{jsonl_file_path}'; cannot compute token statistics.")

# Compute statistical insights
max_tokens = np.max(sample_token_counts)
max_index = np.argmax(sample_token_counts)  # Index of the max value
max_token_entry = sample_lines_data[max_index]  # Retrieve the corresponding JSON object

min_tokens = np.min(sample_token_counts)
min_index = np.argmin(sample_token_counts)  # Index of the min value
min_token_entry = sample_lines_data[min_index]  # Retrieve the corresponding JSON object

mean_tokens = np.mean(sample_token_counts)
median_tokens = np.median(sample_token_counts)
std_dev = np.std(sample_token_counts)
p90 = np.percentile(sample_token_counts, 90)
p95 = np.percentile(sample_token_counts, 95)

# Print statistics
print("📊 Token Count Statistics:")
print(f"- Min tokens: {min_tokens}")
print(f"- Max tokens: {max_tokens}")
print(f"- Mean tokens: {mean_tokens:.2f}")
print(f"- Median tokens: {median_tokens}")
print(f"- Standard deviation: {std_dev:.2f}")
print(f"- 90th percentile: {p90}")
print(f"- 95th percentile: {p95}")

# Print details of max and min token entries
print(f"📌 Line with the most tokens ({max_tokens} tokens):")
print(json.dumps(max_token_entry, indent=4, ensure_ascii=False))

print(f"📌 Line with the fewest tokens ({min_tokens} tokens):")
print(json.dumps(min_token_entry, indent=4, ensure_ascii=False))

# --- Visualization ---
# Draws two figures from sample_token_counts, using the p90/p95 values computed above.

# Set seaborn style
sns.set(style="whitegrid")

# Histogram (token count distribution) with a KDE overlay;
# dashed vertical lines mark the 90th and 95th percentiles.
plt.figure(figsize=(10, 5))
sns.histplot(sample_token_counts, bins=30, kde=True, color="blue")
plt.axvline(p90, color='r', linestyle='dashed', label=f'90th Percentile: {p90:.0f}')
plt.axvline(p95, color='orange', linestyle='dashed', label=f'95th Percentile: {p95:.0f}')
plt.xlabel("Token Count (instruction + response)")
plt.ylabel("Frequency")
plt.title("Histogram of Token Counts")
plt.legend()
plt.show()

# Boxplot of the same counts, to make outliers visible
plt.figure(figsize=(8, 4))
sns.boxplot(x=sample_token_counts, color="purple")
plt.title("Boxplot of Token Counts")
plt.xlabel("Token Count (instruction + response)")
plt.show()

## Quality Curation and Validation

In [None]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Read the OpenAI API key from the environment (None when the variable is unset)
API_KEY = os.getenv("OPENAI_API_KEY")

In [None]:
# System prompt sent with every refinement request. The literal is not a raw
# string, so each "\n" written inside it becomes a real newline at runtime.
SYSTEM_PROMPT = '''
You are an expert in Generative AI with deep knowledge of the domains of four subtopic: Foundation Models, Responsible AI, Agentic AI and Prompt Engineering.
You will be given a question related to one of these subtopics and an answer to that question. They will be used to instruction fine tune a large language model on Generative AI.
Your tasks are:
1. Fact-check the given answer: Verify the accuracy and comprehensiveness of the provided answer. If you find any inaccuracies, provide the best possible, verified answer to the question.
2. Refine the answer: If the answer is factually correct, your task is to refine it to make it more accurate and comprehensive.

Your answer will be evaluated based on two primary criteria: Accuracy and Comprehensiveness. You MUST ensure your answer have:
- Maximum accuracy: Factually accurate, well-researched and precise.
- High comprehensiveness: Every answer should be reasonably long and detailed. But MUST not too long.
- Target length: MUST be 350 to 400 words.
- Diverse viewpoints and key points: Include multiple key aspects and perspectives with the concise discussions on each.
- Provide ONE most relevant example if possible to support the answer.
- Well-structured format: Use Markdown format for clear formatting and better readability.
- No hallucination: Stick to verifiable facts.

For every answer, you MUST reponse in Markdown format. For example, your answer can follow this format which has some Markdown elements. But you do not need to strictly include all the following elements, just choose which are necessary:
"
## The Introduction or Overview of the answer (MAXIMUM 1 sentence): A straightforward introduction. \n\n
#### Key points for the answer (MAXIMUM 2 key points): \n\n
- **Key point 1**: Discussion on key point 1. \n 
- **Key point 2**: Discussion on key point 2. \n 
#### Different viewpoints or perspectives (MAXIMUM 2 viewpoints): \n\n
- ** Viewpoint 1**: Analysis on viewpoint 1. \n
- ** Viewpoint 2**: Analysis on viewpoint 2. \n
#### Example: (MAXIMUM 1 example): Most relevant example to support the answer.\n\n
## The final conclusion for the answer (MAXIMUM 1 sentence): A concise conclusion.\n\n
"

Important Guidelines for each answer: 
- Include Headings, Titles, Numbers, Bullets, Capital Letters, and Bold or Italics to improve organization and make the answer easier to navigate.
- MUST NOT include special characters like Greek alphabets, mathematical formulas, or any other non-standard symbols.
- When listing items or using bullet list, list MAXIMUM 2 items.
- When doing comparisons, compare on MAXIMUM 2 aspects. Examples are not needed for comparisons.
- For answers explaining many implementation stages, just include the names of the stages. MUST NOT break down into details of each stage.
- Introduction and conclusion is a MUST and should be included in every answer. 

Output only your answer without any additional text or explanation.
'''

In [None]:
import openai
import json
from IPython.display import Markdown

class GPT:
    """Small wrapper around the OpenAI chat-completions client.

    Holds a client, a model name and a system prompt, and exposes a single
    method that submits a question/answer pair for refinement.
    """

    def __init__(self, system_prompt=SYSTEM_PROMPT, api_key=API_KEY, model='gpt-4o'):
        """Create the OpenAI client and remember the model and system prompt."""
        self.client = openai.OpenAI(api_key=api_key)
        self.model = model
        self.system_prompt = system_prompt

    def refine_answer(self, prompt, completion):
        """Send the question and its answer to the chat model.

        Args:
            prompt: The question text.
            completion: The existing answer to be refined.

        Returns:
            The message content of the first returned choice, or ``None`` if
            any exception was raised (the error is printed, not re-raised).
        """
        try:
            user_prompt = f"Here is the given question:\n{prompt}\n\nHere is the given answer:\n{completion}"
            messages = [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": user_prompt},
            ]

            # Named api_response so the `completion` argument is not shadowed
            api_response = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=0.4,
                top_p=0.8,
            )
            first_choice = api_response.choices[0]
            return first_choice.message.content
        except Exception as e:
            print(f"Error refining response: {e}")
            return None

# Read the JSONL file, refine responses, and save to a new JSONL file
def process_jsonl(input_file, output_file, refiner=None):
    """Refine the completion of every record in a JSONL file.

    Reads *input_file* line by line, passes each record's ``"prompt"`` and
    ``"completion"`` to ``refiner.refine_answer`` and writes a record holding
    only the prompt and the refined completion to *output_file*. Records that
    cannot be parsed, are not JSON objects, have an empty completion, or for
    which the refiner returns nothing are reported on stdout and skipped.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to write (UTF-8, overwritten).
        refiner: Object exposing ``refine_answer(prompt, completion)``.
            Defaults to a freshly constructed ``GPT()``.
    """
    if refiner is None:
        refiner = GPT()

    with open(input_file, "r", encoding="utf-8") as infile, \
            open(output_file, "w", encoding="utf-8") as outfile:
        for line_number, line in enumerate(infile, start=1):
            try:
                # Parse the JSON line
                data = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON line: {e}")
                print(f"======================= Error Line {line_number}")
                continue

            # A bare list/string/number has no fields to look up.
            if not isinstance(data, dict):
                print(f"Line {line_number} is not a JSON object. Skipping...")
                continue

            instruction = data.get("prompt", "")
            response = data.get("completion", "")

            if not response:
                print(f"Empty completion on line {line_number}. Skipping...")
                continue

            # Refine the response
            refined_response = refiner.refine_answer(instruction, response)
            if not refined_response:
                print(f"No refined answer for line {line_number}. Skipping...")
                continue

            # Keep only the instruction and the refined response
            new_data = {"prompt": instruction, "completion": refined_response}
            # ensure_ascii=False matches the other pipeline stages and keeps
            # non-ASCII text readable in the output file.
            outfile.write(json.dumps(new_data, ensure_ascii=False) + "\n")

# Example usage: refine every record of the cleaned dataset and write the final dataset.
input_file = r"cleaned_data.jsonl"  # Replace with your input JSONL file path
output_file = r"dataset.jsonl"  # Replace with your desired output JSONL file path
process_jsonl(input_file, output_file)