## Environment Setup - LangSmith

In [None]:
# Install necessary Python packages using pip:
# 1. langsmith - for LangChain tracing, debugging, and evaluation
# 2. openai - to interact with OpenAI models like GPT-3.5, GPT-4, etc.
# 3. ollama - to connect with and use local LLMs like LLaMA/Mistral via the Ollama runtime
!pip install langsmith openai ollama

Collecting ollama
  Downloading ollama-0.5.1-py3-none-any.whl.metadata (4.3 kB)
Downloading ollama-0.5.1-py3-none-any.whl (13 kB)
Installing collected packages: ollama
Successfully installed ollama-0.5.1


In [None]:
# os is used to set environment variables; getpass prompts for the secret
# without echoing it or storing it in the notebook.
import os
from getpass import getpass

# Enable LangChain Tracing v2 (used for debugging, visualizing, and monitoring LangChain executions)
os.environ['LANGCHAIN_TRACING_V2'] = 'true'

# The API key must never be written into the notebook source: notebooks get
# shared and committed, and the key travels with them. Reuse a key that is
# already exported in the environment, otherwise ask for it interactively.
# NOTE: a key was previously hardcoded in this cell; treat it as compromised
# and revoke/rotate it in the LangSmith settings.
if not os.environ.get('LANGCHAIN_API_KEY'):
    os.environ['LANGCHAIN_API_KEY'] = getpass("Enter your LangSmith API key: ")

# Set the project name under which all traces will be grouped in LangSmith
os.environ['LANGCHAIN_PROJECT'] = 'Test'


## Manually Curated Dataset

In [None]:
# pathlib builds a portable output path; pandas handles the tabular data and CSV export
from pathlib import Path

import pandas as pd

# -------------------------
# Step 1: Define the QA pairs
# -------------------------

# List of input questions related to DBRX model
inputs = [
    "How many tokens was DBRX pre-trained on?",
    "Is DBRX a MOE model and how many parameters does it have?",
    "How many GPUs was DBRX trained on and what was the connectivity between GPUs?",
]

# Corresponding answers to the questions above (same order as `inputs`)
outputs = [
    "DBRX was pre-trained on 12 trillion tokens of text and code data.",
    "Yes, DBRX is a fine-grained mixture-of-experts (MoE) architecture with 132B total parameters.",
    "DBRX was trained on 3072 NVIDIA H100s connected by 3.2Tbps Infiniband",
]

# zip() silently drops unmatched trailing items, so fail loudly if the two
# lists ever get out of sync while the dataset is being edited.
if len(inputs) != len(outputs):
    raise ValueError(
        f"Expected one answer per question, got {len(inputs)} questions "
        f"and {len(outputs)} answers."
    )

# -------------------------
# Step 2: Create a structured dataset
# -------------------------

# Combine questions and answers into a list of dictionaries (each with 'question' and 'answer' keys)
qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]

# Convert the list of dictionaries into a pandas DataFrame (tabular format)
df = pd.DataFrame(qa_pairs)

# -------------------------
# Step 3: Save the dataset to a CSV file
# -------------------------

# Write next to the notebook instead of to a machine-specific absolute path,
# and create the folder on demand so the cell runs on a fresh checkout.
# Change `output_dir` if the file should live somewhere else.
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

# Kept as a plain string so any later code that treats `csv_path` as text keeps working
csv_path = str(output_dir / "DBRX_eval.csv")

# Write the DataFrame to a CSV file without including the index column
df.to_csv(csv_path, index=False)

# At this point, you can upload the generated CSV file to LangSmith for evaluation or dataset creation


In [None]:
# Import the LangSmith client for interacting with the LangSmith platform
from langsmith import Client

# Initialize the LangSmith client (uses environment variables for authentication)
client = Client()

# Define the name of the dataset to be created in LangSmith
dataset_name = "DBRX"

# -------------------------------
# Step 1: Create the dataset, or reuse it if it already exists
# -------------------------------

# Creating a dataset whose name is already taken fails, which would break this
# cell on every run after the first. Look the dataset up first so the cell can
# be re-executed safely.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="QA pairs about DBRX model.",  # Describes the purpose of the dataset
    )

# -------------------------------
# Step 2: Upload examples that are not in the dataset yet
# -------------------------------

# Collect the questions already stored so a re-run does not append duplicates
existing_questions = {
    (example.inputs or {}).get("question")
    for example in client.list_examples(dataset_id=dataset.id)
}
missing_pairs = [
    (q, a) for q, a in zip(inputs, outputs) if q not in existing_questions
]

# `inputs` and `outputs` must be lists of dictionaries, one per example
if missing_pairs:
    upload_result = client.create_examples(
        inputs=[{"question": q} for q, _ in missing_pairs],   # Convert input questions to dict format
        outputs=[{"answer": a} for _, a in missing_pairs],    # Convert output answers to dict format
        dataset_id=dataset.id,                                # Associate the examples with the dataset
    )
else:
    upload_result = None
    print(f"All {len(inputs)} examples already exist in dataset '{dataset_name}'; nothing uploaded.")

# Show the API response as the cell output (None when nothing was uploaded)
upload_result


{'example_ids': ['c8b8d557-a1c7-48a2-9a5c-8083b6efd974',
  'df818c21-02de-478c-93dc-013a21dbe990',
  '2e1ec321-d8b9-4fd9-87c9-0a87841696ec'],
 'count': 3}

In [None]:
# Define additional QA pairs to be appended to the existing dataset
new_questions = [
    "What is the context window of DBRX Instruct?",
]

new_answers = [
    "DBRX Instruct was trained with up to a 32K token context window.",
]

# ----------------------------------------
# Add new examples to the existing dataset
# ----------------------------------------

# Re-running this cell would otherwise append the same pair again, so first
# collect the questions the dataset already holds and skip those.
stored_questions = {
    (example.inputs or {}).get("question")
    for example in client.list_examples(dataset_id=dataset.id)
}
pairs_to_add = [
    (q, a) for q, a in zip(new_questions, new_answers) if q not in stored_questions
]

# Use the same client and dataset ID to append the new question-answer pairs
# This allows you to incrementally build your dataset over time
if pairs_to_add:
    append_result = client.create_examples(
        inputs=[{"question": q} for q, _ in pairs_to_add],   # Format the new question(s)
        outputs=[{"answer": a} for _, a in pairs_to_add],    # Format the new answer(s)
        dataset_id=dataset.id,                               # Link to the existing dataset
    )
else:
    append_result = None
    print("All new examples already exist in the dataset; nothing uploaded.")

# Show the API response as the cell output (None when nothing was uploaded)
append_result


{'example_ids': ['3d6cd2f6-edc5-4c90-a123-5186cfc39909'], 'count': 1}