In [None]:
import tiktoken

# Count how many tokens the prompt occupies for the target chat model.
# The earlier `tiktoken.get_encoding("cl100k_base")` assignment was dropped: it
# was overwritten on the very next line, so it never affected the result.
# NOTE(review): `prompt` is only assigned in later cells (the formatted prompt
# string near the end of the notebook), so on a fresh kernel run this cell
# after the prompt has been built.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
tokens = encoding.encode(prompt)
len(tokens)

In [None]:
from langchain.document_loaders import GitbookLoader

# Load the pages of the public Phoenix docs site (load_all_paths=True).
loader = GitbookLoader("https://docs.arize.com/phoenix/", load_all_paths=True)
documents = loader.load()
# Preview only the first few documents; echoing the whole list floods the
# cell output and bloats the saved notebook.
documents[:3]

In [None]:
# Number of documents returned by the most recent loader.load() call.
len(documents)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import GitbookLoader


def build_database(embeddings, docs_url="https://docs.arize.com/phoenix/"):
    """Load a GitBook site, split its pages, and index them in a FAISS store.

    Args:
        embeddings: Embedding model passed to ``FAISS.from_documents`` to
            vectorize each text chunk.
        docs_url: Root URL of the GitBook site to load. Defaults to the Phoenix
            documentation, so existing one-argument calls behave as before.

    Returns:
        The FAISS vector store built from the split documents.

    Raises:
        ValueError: If the loader returns no documents, since an index cannot
            be built from nothing.
    """
    loader = GitbookLoader(docs_url, load_all_paths=True)
    documents = loader.load()
    if not documents:
        # Fail with a clear message instead of an obscure error from the
        # vector store when there is nothing to embed.
        raise ValueError(f"No documents were loaded from {docs_url!r}")
    splitter = RecursiveCharacterTextSplitter()
    texts = splitter.split_documents(documents)
    return FAISS.from_documents(texts, embeddings)

def save_database(database, save_path: str) -> None:
    """Persist a vector store to disk by calling its ``save_local`` method.

    Args:
        database: Object exposing ``save_local``; the notebook passes the
            store returned by ``build_database``.
        save_path: Destination path handed to ``save_local``.
    """
    database.save_local(save_path)
    
def load_database(save_path: str, embeddings):
    """Load a FAISS vector store from ``save_path`` via ``FAISS.load_local``.

    Args:
        save_path: Path of a previously saved index (the notebook uses the
            same path it passed to ``save_database``).
        embeddings: Embedding model forwarded to ``FAISS.load_local``;
            presumably must match the model used to build the index — confirm.

    Returns:
        Whatever ``FAISS.load_local`` returns (the restored vector store).
    """
    # NOTE(review): FAISS.load_local is understood to unpickle part of the saved
    # index — only load indexes you created yourself; confirm against the
    # installed LangChain version.
    return FAISS.load_local(save_path, embeddings)

In [None]:
# Embedding model shared by the build and load cells below.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment (e.g. OPENAI_API_KEY) — confirm.
embeddings = OpenAIEmbeddings()
# Path passed to save_database / load_database for the persisted index.
database_path = "docs_index"

In [None]:
# Build the index from the docs site and persist it under database_path.
# NOTE(review): build_database loads the whole site and embeds every chunk, so
# this cell is presumably slow; skip it when a saved index already exists.
database = build_database(embeddings)
save_database(database, database_path)

In [None]:
# Reload the index from disk; this re-assigns `database` from the build cell.
database = load_database(database_path, embeddings)

In [None]:
# Obtain a retriever from the vector store using its default settings.
retriever = database.as_retriever()

In [None]:
# NOTE(review): `dataframe` is first assigned further down (pd.read_parquet), so
# this cell raises NameError on a fresh kernel unless that cell has already run.
dataframe

In [None]:
# TODO: this cell originally held the unfinished statement `from langchain.`,
# which is a SyntaxError and stops Restart & Run All. Add the intended import
# here, or delete the cell.

In [None]:
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector

# NOTE(review): `example_selector` and `example_prompt` are not defined anywhere
# in the visible notebook, so this cell raises NameError on a fresh kernel. The
# prefix/suffix describe an unrelated "item location" task — presumably left
# over from a tutorial; confirm whether this cell is still needed.
# NOTE(review): `prompt` is re-assigned to a formatted string in a later cell.
prompt = FewShotPromptTemplate(
    # Object that selects which examples are included
    example_selector=example_selector,
    
    # Prompt used for the individual examples
    example_prompt=example_prompt,
    
    # Text added to the top (prefix) and bottom (suffix) of the prompt
    prefix="Give the location an item is usually found in",
    suffix="Input: {noun}\nOutput:",
    
    # Names of the inputs the prompt receives
    input_variables=["noun"],
)


In [None]:
# Query the retriever with a sample question about the Phoenix docs.
docs = retriever.get_relevant_documents("how many phoenix datasets do i need to define?")

In [None]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import MarkdownTextSplitter

import os
from pathlib import Path

# Root of the local Phoenix checkout. Set the PHOENIX_REPO environment variable
# to point at your own clone; the default keeps the path this notebook was
# originally written against.
phoenix_repo = Path(os.environ.get("PHOENIX_REPO", "/Users/xandersong/phoenix"))

# Load the schema examples Markdown file and split it into chunks.
loader = TextLoader(str(phoenix_repo / "tutorials" / "schema_examples.md"))
documents = loader.load()
splitter = MarkdownTextSplitter()
texts = splitter.split_documents(documents)

In [None]:
# Show the text of the first chunk produced by the splitter.
print(texts[0].page_content)

In [None]:
# NOTE(review): repeats the loader.load() call from the cell that created
# `loader`, re-assigning `documents` — looks redundant; confirm before removing.
documents = loader.load()

In [None]:
# Few-shot examples shown to the model. Each entry pairs a short description
# and the code for a sample dataframe with the px.Schema that describes it.
# Every column named in a schema must exist in that entry's dataframe.
examples = [
    {
        "description": "dataframe with timestamp_column_name, prediction_score_column_name, prediction_label_column_name, and actual_label_column_name",
        "dataframe": """pd.DataFrame([
    [pd.to_datetime('2023-03-01 02:02:19'), 0.91, 'click', 'click'],
    [pd.to_datetime('2023-02-17 23:45:48'), 0.37, 'no_click', 'no_click'],
    [pd.to_datetime('2023-01-30 15:30:03'), 0.54, 'click', 'no_click'],
    [pd.to_datetime('2023-02-03 19:56:09'), 0.74, 'click', 'click'],
    [pd.to_datetime('2023-02-24 04:23:43'), 0.37, 'no_click', 'click']
], columns=['timestamp', 'prediction_score', 'prediction', 'target'])""",
        "schema": """px.Schema(
    timestamp_column_name="timestamp",
    prediction_score_column_name="prediction_score",
    prediction_label_column_name="prediction",
    actual_label_column_name="target",
)""",
    },
    {
        "description": "dataframe with prediction_label_column_name, actual_label_column_name, feature_column_names, tag_column_names",
        "dataframe": """pd.DataFrame({
    'fico_score': [578, 507, 656, 414, 512],
    'merchant_id': ['Scammeds', 'Schiller Ltd', 'Kirlin and Sons', 'Scammeds', 'Champlin and Sons'],
    'loan_amount': [4300, 21000, 18000, 18000, 20000],
    'annual_income': [62966, 52335, 94995, 32034, 46005],
    'home_ownership': ['RENT', 'RENT', 'MORTGAGE', 'LEASE', 'OWN'],
    'num_credit_lines': [110, 129, 31, 81, 148],
    'inquests_in_last_6_months': [0, 0, 0, 2, 1],
    'months_since_last_delinquency': [0, 23, 0, 0, 0],
    'age': [25, 78, 54, 34, 49],
    'gender': ['male', 'female', 'female', 'male', 'male'],
    'predicted': ['not_fraud', 'not_fraud', 'uncertain', 'fraud', 'uncertain'],
    'target': ['fraud', 'not_fraud', 'uncertain', 'not_fraud', 'uncertain']
})""",
        "schema": """px.Schema(
    prediction_label_column_name="predicted",
    actual_label_column_name="target",
    feature_column_names=[
        "fico_score",
        "merchant_id",
        "loan_amount",
        "annual_income",
        "home_ownership",
        "num_credit_lines",
        "inquests_in_last_6_months",
        "months_since_last_delinquency",
    ],
    tag_column_names=[
        "age",
        "gender",
    ],
)""",
    },
    {
        "description": "example with prediction_label_column_name, actual_label_column_name, (embedding_feature_column_names with vector_column_name)",
        "dataframe": """pd.DataFrame({
    'predicted': ['fraud', 'fraud', 'not_fraud', 'not_fraud', 'uncertain'],
    'target': ['not_fraud', 'not_fraud', 'not_fraud', 'not_fraud', 'uncertain'],
    'embedding_vector': [[-0.97, 3.98, -0.03, 2.92], [3.20, 3.95, 2.81, -0.09], [-0.49, -0.62, 0.08, 2.03], [1.69, 0.01, -0.76, 3.64], [1.46, 0.69, 3.26, -0.17]],
    'fico_score': [604, 612, 646, 560, 636],
    'merchant_id': ['Leannon Ward', 'Scammeds', 'Leannon Ward', 'Kirlin and Sons', 'Champlin and Sons'],
    'loan_amount': [22000, 7500, 32000, 19000, 10000],
    'annual_income': [100781, 116184, 73666, 38589, 100251],
    'home_ownership': ['RENT', 'MORTGAGE', 'RENT', 'MORTGAGE', 'MORTGAGE'],
    'num_credit_lines': [108, 42, 131, 131, 10],
    'inquests_in_last_6_months': [0, 2, 0, 0, 0],
    'months_since_last_delinquency': [0, 56, 0, 0, 3]
})""",
        "schema": """px.Schema(
    prediction_label_column_name="predicted",
    actual_label_column_name="target",
    embedding_feature_column_names={
        "transaction_embeddings": px.EmbeddingColumnNames(
            vector_column_name="embedding_vector"
        ),
    },
)""",
    },
    {
        "description": "dataframe with actual_label_column_name, (embedding_feature_column_names with vector_column_name and link_to_data_column_name)",
        "dataframe": """pd.DataFrame({
    'defective': ['okay', 'defective', 'okay', 'defective', 'okay'],
    'image': ['https://www.example.com/image0.jpeg', 'https://www.example.com/image1.jpeg', 'https://www.example.com/image2.jpeg', 'https://www.example.com/image3.jpeg', 'https://www.example.com/image4.jpeg'],
    'image_vector': [[1.73, 2.67, 2.91, 1.79, 1.29], [2.18, -0.21, 0.87, 3.84, -0.97], [3.36, -0.62, 2.40, -0.94, 3.69], [2.77, 2.79, 3.36, 0.60, 3.10], [1.79, 2.06, 0.53, 3.58, 0.24]]
})""",
        "schema": """px.Schema(
    actual_label_column_name="defective",
    embedding_feature_column_names={
        "image_embedding": px.EmbeddingColumnNames(
            vector_column_name="image_vector",
            link_to_data_column_name="image",
        ),
    },
)""",
    },
    {
        # This dataframe was previously a copy of the image example above, so
        # none of the columns referenced by the schema below existed in it.
        "description": "dataframe with actual_label_column_name, feature_column_names, tag_column_names, (embedding_feature_column_names with vector_column_name and raw_data_column_name)",
        "dataframe": """pd.DataFrame({
    'name': ['Magic Lamp', 'Ergo Desk Chair', 'Cloud Pillow', 'Epicurean Coffee Maker', 'Fresh Start Blender'],
    'text': ['Makes a great desk lamp!', 'This chair is pretty comfortable, but I wish it had better back support.', 'This may be the best pillow I have ever used. Highly recommend!', 'This device is awesome and so fast. I start my morning with a fresh cup every day.', 'Does not work. Do not buy.'],
    'text_vector': [[2.47, -0.01, -0.22, 0.93], [-0.39, 2.30, -0.23, -0.24], [-0.41, -2.26, 2.14, 0.80], [2.62, 1.14, 1.63, 1.10], [-0.65, 1.49, 3.24, 1.05]],
    'category': ['office', 'office', 'bedroom', 'kitchen', 'kitchen'],
    'sentiment': ['positive', 'neutral', 'positive', 'positive', 'negative']
})""",
        "schema": """px.Schema(
    actual_label_column_name="sentiment",
    feature_column_names=[
        "category",
    ],
    tag_column_names=[
        "name",
    ],
    embedding_feature_column_names={
        "product_review_embeddings": px.EmbeddingColumnNames(
            vector_column_name="text_vector",
            raw_data_column_name="text",
        ),
    },
)""",
    },
]

# Layout of one rendered example: a description line followed by the dataframe
# and the schema, each inside its own fenced python block.
example_layout = """Example: {description}
Dataframe:

```python
{dataframe}
```

Schema:

```python
{schema}
```
"""

# Render every example with the same layout and concatenate them in order.
rendered_examples = []
for example in examples:
    rendered_examples.append(
        example_layout.format(
            description=example["description"],
            dataframe=example["dataframe"],
            schema=example["schema"],
        )
    )
examples_prompt = "".join(rendered_examples)
print(examples_prompt)

In [None]:
import os
from pathlib import Path

# Root of the local Phoenix checkout. Set the PHOENIX_REPO environment variable
# to point at your own clone; the default keeps the path this notebook was
# originally written against.
phoenix_repo = Path(os.environ.get("PHOENIX_REPO", "/Users/xandersong/phoenix"))

# Read the API reference as UTF-8 explicitly so the result does not depend on
# the platform's default text encoding.
with open(phoenix_repo / "tutorials" / "api_reference.md", encoding="utf-8") as f:
    api_docs = f.read()

In [None]:
import pandas as pd

# Fetch the human-actions training set and describe each column by the Python
# type of its first value, one "column: type" line per column.
dataframe = pd.read_parquet(
    "https://storage.googleapis.com/arize-assets/phoenix/datasets/unstructured/cv/human-actions/human_actions_training.parquet"
)

sampled_dataframe = dataframe.head(1)
# repr(type(x)) looks like "<class 'pkg.Name'>"; the [8:-2] slice strips the
# "<class '" prefix and the "'>" suffix, leaving just the type name.
column_to_type = {
    column: repr(type(sampled_dataframe[column].iloc[0]))[8:-2]
    for column in sampled_dataframe.columns
}
type_lines = [f"{name}: {type_name}" for name, type_name in column_to_type.items()]
dataframe_column_to_type = "\n".join(type_lines)
print(dataframe_column_to_type)

In [None]:
# Prompt skeleton with exactly three placeholders: api, ex and df.
# The literal braces inside `px.Schema(...)` are doubled so the formatter emits
# them verbatim; written with single braces, "fill this in" was parsed as a
# fourth template variable and formatting failed because it was never supplied.
template = """Given an input dataframe, suggest a schema that describes that dataframe. I've included the API reference and some examples before giving a description of the input dataframe. Return a syntactic Python code snippet of the form `px.Schema({{fill this in}})` that is syntactic and can be copy-pasted. Do not use the backtick symbol (`) in your response.

API Documentation:
{api}

Examples:

{ex}

Input Dataframe Columns to Data Type:
{df}

Schema:
"""

print(template)

In [1]:
# NOTE(review): SystemMessagePromptTemplate is never imported in the visible
# notebook (the recorded output of this cell is a NameError), and
# from_template() is called with no arguments — looks like a leftover
# experiment; confirm and remove.
SystemMessagePromptTemplate.from_template()

NameError: name 'SystemMessagePromptTemplate' is not defined

In [None]:
# Fill the template with the API reference, the rendered examples, and the
# column/type listing to obtain the final prompt string.
schema_prompt_template = PromptTemplate(
    template=template,
    input_variables=["api", "ex", "df"],
)
prompt = schema_prompt_template.format(
    df=dataframe_column_to_type,
    ex=examples_prompt,
    api=api_docs,
)
print(prompt)

In [None]:
# NOTE(review): the cell that builds `prompt` already prints it; this repeat
# looks redundant — confirm before removing.
print(prompt)

In [None]:
from langchain.llms import OpenAI

# Model used for the completion; the commented line below is the GPT-4 alternative.
model_name = "gpt-3.5-turbo"
# model_name="gpt-4"
llm = OpenAI(model_name=model_name)
# Call the model with the rendered prompt and echo the result as the cell output.
output = llm(prompt)
output