In [1]:
import weaviate
from weaviate.classes.init import Auth
import os
from dotenv import load_dotenv

# Load variables from a local .env file *before* reading them; calling
# load_dotenv() after os.getenv() would leave both values as None whenever
# they are defined only in .env and not already exported in the shell.
load_dotenv()

weaviate_url = os.getenv("WEAVIATE_URL")
weaviate_key = os.getenv("WEAVIATE_API_KEY")

# Connect to the Weaviate Cloud cluster using API-key authentication.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url, auth_credentials=Auth.api_key(weaviate_key)
)

In [None]:
from weaviate.util import generate_uuid5
from datetime import datetime, timezone
import json
from tqdm import tqdm
from helpers import TA_DEMO_COLLECTION, confirm_to_delete
from weaviate.classes.config import Configure, DataType, Property


# Read the forum threads from disk. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default
# (which is not UTF-8 on some systems and can fail on non-ASCII post text).
with open("data/simplified_posts.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# The small demo collection only ingests the first 20 threads.
if TA_DEMO_COLLECTION == "ForumPostSmall":
    data = data[:20]

# Helper from helpers.py; presumably asks for confirmation before dropping an
# existing collection of the same name -- confirm against its implementation.
confirm_to_delete(client, TA_DEMO_COLLECTION)


# Create the demo collection: seven typed properties, two named vectors,
# replication factor 3, and inverted-index support for null-state and
# timestamp filtering.
client.collections.create(
    TA_DEMO_COLLECTION,
    description="This collection contains conversations from the Weaviate Forum.",
    inverted_index_config=Configure.inverted_index(
        index_timestamps=True,
        index_null_state=True,
    ),
    replication_config=Configure.replication(factor=3),
    vectorizer_config=[
        # "default" vector embeds the full conversation text plus the title.
        Configure.NamedVectors.text2vec_weaviate(
            source_properties=["conversation_full", "title"], name="default"
        ),
        # "title" vector embeds the title only.
        Configure.NamedVectors.text2vec_weaviate(
            source_properties=["title"], name="title"
        ),
    ],
    properties=[
        Property(
            name="user_id",
            data_type=DataType.INT,
            description="Unique identifier for the user creating the thread.",
        ),
        Property(
            name="conversation",
            data_type=DataType.TEXT,
            description="Text of the entire forum conversation thread, truncated to 20,000 characters maximum for context limit.",
        ),
        Property(
            name="conversation_full",
            data_type=DataType.TEXT,
            description="Full text of the entire forum conversation thread.",
        ),
        Property(
            name="date_created",
            data_type=DataType.DATE,
            description="Date and time when the thread was first created.",
        ),
        Property(
            name="has_accepted_answer",
            data_type=DataType.BOOL,
            description="Whether the thread has an accepted answer.",
        ),
        Property(
            name="title",
            data_type=DataType.TEXT,
            description="Title text of the forum thread.",
        ),
        Property(
            name="topic_id",
            data_type=DataType.INT,
            description="Unique identifier for the topic of the thread.",
        ),
    ],
)

posts = client.collections.get(TA_DEMO_COLLECTION)

# Import every thread in fixed-size batches of 200 objects.
with posts.batch.fixed_size(200) as batch:
    # Iterate the list directly so tqdm knows the total and can show real
    # progress (wrapping enumerate() hides the length from tqdm).
    for row in tqdm(data):
        # Work on a copy so the loaded `data` list is left untouched.
        properties = dict(row)

        # NOTE(review): the parsed timestamp is stamped as UTC regardless of
        # any offset in the source string -- assumes the source is UTC.
        properties["date_created"] = datetime.fromisoformat(
            row["date_created"]
        ).replace(tzinfo=timezone.utc)

        # Keep the untruncated text in `conversation_full` *before* shortening
        # `conversation`; assigning it afterwards would store the truncated
        # text in both properties.
        full_text = row["conversation"]
        properties["conversation_full"] = full_text

        # Long threads keep their first and last 10,000 characters joined by
        # an ellipsis (20,003 characters in total, including the "...").
        if len(full_text) > 20000:
            properties["conversation"] = (
                full_text[:10000] + "..." + full_text[-10000:]
            )

        # Deterministic UUID derived from topic_id.
        batch.add_object(
            properties=properties, uuid=generate_uuid5(row["topic_id"])
        )

# Report the first few failures, if any. Failed entries are error objects
# (not dicts), so they are read via attributes rather than subscripting.
failed_objects = posts.batch.failed_objects
if failed_objects:
    for obj in failed_objects[:5]:
        print(f"Failed to add object {obj.original_uuid}: {obj.message}")

# Number of objects now stored in the collection.
print(len(posts))

In [2]:
from weaviate.classes.config import DataType
from weaviate.agents.classes import Operations
from helpers import TECHNICAL_DOMAIN_CATEGORIES, ROOT_CAUSE_CATEGORIES, ACCESS_CONTEXT_CATEGORIES

# Operation: append an integer `technicalComplexity` property (1-5 rating),
# derived from the `conversation` property only.
add_technical_complexity = Operations.append_property(
    instruction="""
    Rate the technical complexity of the user's forum post query
    on a scale from 1 to 5, where 1 is very simple and 5 is very complex.
    """,
    view_properties=["conversation"],
    data_type=DataType.INT,
    property_name="technicalComplexity",
)

In [None]:
# Operation: append a text `technicalDomain` property, derived from the
# `conversation` and `title` properties.
# The category names are wrapped in list(...) so the prompt shows a plain
# "['a', 'b', ...]" list; interpolating .keys() directly would embed the
# "dict_keys([...])" wrapper in the instruction text.
add_technical_domain = Operations.append_property(
    property_name="technicalDomain",
    data_type=DataType.TEXT,
    view_properties=["conversation", "title"],
    instruction=f"""
    Identify the primary technical domain of the user's forum post query.
    The answer must be one of the following:
    {list(TECHNICAL_DOMAIN_CATEGORIES.keys())}

    The definitions of the categories are as follows:
    {TECHNICAL_DOMAIN_CATEGORIES}

    Remember that the answer must be one of these categories:
    {list(TECHNICAL_DOMAIN_CATEGORIES.keys())}
    """,
)

# Operation: append a text `rootCauseCategory` property, derived from the
# `conversation` and `title` properties.
# The category names are wrapped in list(...) so the prompt shows a plain
# "['a', 'b', ...]" list; interpolating .keys() directly would embed the
# "dict_keys([...])" wrapper in the instruction text.
add_root_cause_category = Operations.append_property(
    property_name="rootCauseCategory",
    data_type=DataType.TEXT,
    view_properties=["conversation", "title"],
    instruction=f"""
    Based on the text, what was the fundamental issue behind the user's question? The answer must be one of the following categories:
    {list(ROOT_CAUSE_CATEGORIES.keys())}

    The definitions of the categories are as follows:
    {ROOT_CAUSE_CATEGORIES}
    For example, if the user was confused about how to use a specific feature of Weaviate, the answer should be "conceptual_misunderstanding".

    Remember that the answer must be one of these categories:
    {list(ROOT_CAUSE_CATEGORIES.keys())}
    """,
)

# Operation: append a text `accessContext` property, derived from the
# `conversation` and `title` properties.
# The category names are wrapped in list(...) so the prompt shows a plain
# "['a', 'b', ...]" list; interpolating .keys() directly would embed the
# "dict_keys([...])" wrapper in the instruction text.
add_access_context = Operations.append_property(
    property_name="accessContext",
    data_type=DataType.TEXT,
    view_properties=["conversation", "title"],
    instruction=f"""
    Based on the text, how was the user trying to access Weaviate? The answer must be one of the following categories:

    {list(ACCESS_CONTEXT_CATEGORIES.keys())}

    The definitions of the categories are as follows:
    {ACCESS_CONTEXT_CATEGORIES}
    For example, if the user was using the Weaviate Python client library, the answer should be "python_client".

    Remember that the answer must be one of these categories:
    {list(ACCESS_CONTEXT_CATEGORIES.keys())}
    """,
)

# Operation: append a boolean `causedByOutdatedStack` property, derived from
# the `conversation` and `title` properties.
was_it_caused_by_outdated_stack = Operations.append_property(
    instruction="""
    Based on the text, was the user's question caused by an outdated version of Weaviate or its components, such as the client library being used?
    """,
    view_properties=["conversation", "title"],
    data_type=DataType.BOOL,
    property_name="causedByOutdatedStack",
)

# Operation: append a boolean `isDocumentationGap` property, derived from the
# `conversation` and `title` properties. The instruction lists the cases
# that must NOT count as a documentation gap.
was_it_a_documentation_gap = Operations.append_property(
    instruction="""
    Based on the text, identify whether the user's question was caused by a lack of documentation or unclear instructions regarding Weaviate.

    This does not include cases where the documentation exists, and the user did not find it, or did not read it.
    This also does not include cases where the user was asking about a feature that is not supported by Weaviate,
    or the user was asking about a feature that is not part of a first-party Weaviate product, such as a third-party integration or a custom implementation.
    This also does not include cases where there was a bug in the code, or the user was using an outdated version of Weaviate or its components.

    Only mark this as true if the user was asking about a feature or an aspect
    that is not covered by the documentation, or the documentation was unclear or incorrect.
    """,
    view_properties=["conversation", "title"],
    data_type=DataType.BOOL,
    property_name="isDocumentationGap",
)

# Operation: append a text `summary` property, derived from the
# `conversation` and `title` properties.
create_summary = Operations.append_property(
    instruction="""
    Briefly summarize the user's question and the resolution provided (if any) in a few sentences.
    """,
    view_properties=["conversation", "title"],
    data_type=DataType.TEXT,
    property_name="summary",
)

In [None]:
from helpers import TA_DEMO_COLLECTION
from weaviate.agents.classes import TransformationAgent

# Build the Transformation Agent for the demo collection with all seven
# append-property operations defined in the earlier cells.
ta = TransformationAgent(
    operations=[
        add_technical_complexity,
        add_technical_domain,
        add_root_cause_category,
        add_access_context,
        was_it_caused_by_outdated_stack,
        was_it_a_documentation_gap,
        create_summary,
    ],
    collection=TA_DEMO_COLLECTION,
    client=client,
)

# Run the operations; the response is kept because its workflow_id is what
# the status check in the next cell uses.
ta_response = ta.update_all()


In [None]:
from helpers import get_ta_status

# Look up the status of the workflow started by ta.update_all(); left as the
# cell's last expression so the notebook displays the helper's return value.
get_ta_status(
    workflow_id=ta_response.workflow_id,
    agent_instance=ta,
)

In [None]:
# Close the Weaviate client connection opened in the first cell.
client.close()