In [None]:
# File name of the input CSV. It is joined onto the notebook's input directory
# to locate the data, and its stem names the per-file output directory.
# Left empty here; presumably filled in before each run.
csv_fname = ""

In [None]:
# Notebook-wide switches read by the later cells.

# Gates reloading of the cached classification list, the stored data points
# and the stored per-data-point classifications.
reload_output_if_exists = True 
"""Indicates whether to reload any outputs from the Notebook if they already exist."""

# Selects the prompt used for classification and the output sub-directory
# ("self_classifications" vs "classifications").
self_classify = False
"""Indicates whether to use the LLM to determine its own classifications for the data, or to use the pre-defined `classifications`."""

In [None]:
# Pre-defined classification catalogue. Each entry has a "classification"
# name and a "description"; both are serialized into the classification
# prompt, and the description is the text that gets embedded for the
# similarity score. A later cell may replace this list with a cached copy
# from the input directory and adds an "embedding" key to each entry.
classifications = [
    {
        "classification": "Integration",
        "description": "Key issues include difficulties integrating capabilities of cloud services into solutions, ensuring seamless interoperability."
    },
    {
        "classification": "Breadth",
        "description": "Key issues include navigating an overwhelming range of service options."
    },
    {
        "classification": "Containers",
        "description": "Key issues include challenges in container orchestration, ensuring compatibility in containerized environments."
    },
    {
        "classification": "Compute",
        "description": "Key issues include optimizing compute resources (e.g., VMs, containers, serverless) for performance and efficient resource utilization."
    },
    {
        "classification": "Ease",
        "description": "Key issues include steep learning curves and inconsistent user experiences with cloud services."
    },
    {
        "classification": "Portal",
        "description": "Key issues include cumbersome navigation, non-intuitive interfaces, and a lack of intuitive design."
    },
    {
        "classification": "High Cost / Pricing",
        "description": "Key issues include unexpected costs, complex pricing models, and difficulty in predicting costs."
    },
    {
        "classification": "Skill",
        "description": "Key issues include gaps in technical expertise, steep training requirements, and the need for specialized skills."
    },
    {
        "classification": "Customer Support",
        "description": "Key issues include delayed support response times, inconsistent service quality, and limited access to knowledgeable support resources."
    },
    {
        "classification": "Migration",
        "description": "Key issues include navigating the complexities of migrating workloads between cloud providers, from on-premises environments, or hybrid environments."
    },
    {
        "classification": "Privacy",
        "description": "Key issues include data confidentiality concerns, compliance with data protection regulations, and ensuring data security."
    },
    {
        "classification": "Database",
        "description": "Key issues include database performance bottlenecks, data scalability challenges, and managing complex database configurations."
    },
    {
        "classification": "Documentation",
        "description": "Key issues include outdated or incomplete documentation, lack of examples, and difficulty in finding relevant information."
    },
    {
        "classification": "Getting Started",
        "description": "Key issues include overwhelming setup procedures, unclear onboarding processes, and lack of guidance for new users."
    },
    {
        "classification": "AI",
        "description": "Key issues include integration challenges with AI into existing solutions, ensuring AI models are accurate and efficient, and managing AI workloads."
    },
    {
        "classification": "Missing Features",
        "description": "Key issues include the absence of critical features, limitations in functionality, and gaps in service offerings."
    },
    {
        "classification": "Reliability",
        "description": "Key issues include service outages, inconsistent performance, and lack of redundancy in cloud services."
    },
    {
        "classification": "Innovate",
        "description": "Key issues include slow adoption of cutting-edge technologies, limited support for emerging features, and difficulty integrating new technologies."
    },
    {
        "classification": "Automation",
        "description": "Key issues include complexities in automating cloud services, ensuring automation scripts are reliable, and managing automated workflows."
    },
    {
        "classification": "Scaling",
        "description": "Key issues include challenges in scaling resources to meet demand, optimizing resource allocation, and ensuring performance at scale."
    },
    {
        "classification": "Lock-In",
        "description": "Key issues include dependency on proprietary cloud services, challenges in migrating away from cloud providers, and concerns about vendor lock-in."
    },
    {
        "classification": "Select Service",
        "description": "Key issues include confusion from an abundance of similar services, difficulty in selecting the right service for specific use cases, and lack of clear differentiation between services."
    },
    {
        "classification": "Networking",
        "description": "Key issues include complex network configurations, challenges in optimizing network performance, and managing network security."
    },
    {
        "classification": "IAM",
        "description": "Key issues include managing intricate identity and access management policies, ensuring secure access controls, and preventing unauthorized access."
    },
    {
        "classification": "Monitor",
        "description": "Key issues include setting up comprehensive observability, integrating effective logging and monitoring solutions, and ensuring timely alerts."
    },
    {
        "classification": "OS",
        "description": "Key issues include compatibility issues with operating systems, managing OS configurations, and ensuring OS security."
    },
    {
        "classification": "Multi-Cloud",
        "description": "Key issues include interoperability challenges between cloud providers, managing multiple cloud environments, and ensuring consistent performance across clouds."
    },
    {
        "classification": "None",
        "description": "No challenges."
    }
]
"""The classifications to use for the data. If `self_classify` is False, these classifications will be used to classify the data. Otherwise, they will be ignored."""

In [None]:
import os
# Absolute path four directory levels above the current working directory;
# used as the root for the configuration file and the notebook directories.
wdir = os.path.abspath('../../../../')

In [None]:

import json
from dotenv import dotenv_values
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai.types.chat.chat_completion_system_message_param import ChatCompletionSystemMessageParam
from openai.types.chat.chat_completion_user_message_param import ChatCompletionUserMessageParam
from typing import Optional, List
from pydantic import BaseModel, Field
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# Settings read from the repository's .env file. The keys used in this
# notebook are AOAI_ENDPOINT, EMBEDDING_DEPLOYMENTNAME and
# CHATCOMPLETION_DEPLOYMENTNAME.
env_vars = dotenv_values(f"{wdir}/configuration/.env")
notebooks_path = os.path.join(wdir, "src/console/notebooks/cic/")

# Input CSV location, plus a per-CSV output directory named after the CSV's stem.
input_dir = os.path.join(notebooks_path, "input")
csv_fpath = os.path.join(input_dir, csv_fname)
output_dir = os.path.join(notebooks_path, "output", os.path.splitext(os.path.basename(csv_fname))[0])
# Results of the two classification modes are kept in separate sub-directories.
classifications_output_dir = os.path.join(output_dir, "self_classifications" if self_classify else "classifications")

# Default credential chain with the listed credential types switched off.
# NOTE(review): presumably this leaves a local developer login such as the
# Azure CLI as the effective credential; confirm against the installed
# azure-identity version.
credential = DefaultAzureCredential(
    exclude_workload_identity_credential=True,
    exclude_developer_cli_credential=True,
    exclude_environment_credential=True,
    exclude_managed_identity_credential=True,
    exclude_powershell_credential=True,
    exclude_shared_token_cache_credential=True,
    exclude_interactive_browser_credential=True
)

# Token provider for the Cognitive Services scope, handed to the OpenAI client.
openai_token_provider = get_bearer_token_provider(credential, 'https://cognitiveservices.azure.com/.default')

# Azure OpenAI client shared by the embedding and chat-completion calls.
openai_client = AzureOpenAI(
    azure_endpoint=env_vars["AOAI_ENDPOINT"],
    azure_ad_token_provider=openai_token_provider,
    api_version="2024-12-01-preview"
)

In [None]:
# Timing Helpers
class Stopwatch:
    """Small perf_counter-based timer, usable directly or as a context manager.

    ``elapsed`` holds the seconds measured by the most recent start/stop cycle
    (it is overwritten, not accumulated) and ``is_running`` tells whether a
    measurement is in progress.
    """

    # Class-level defaults; an instance gets its own value on first assignment.
    elapsed = 0
    is_running = False

    def __enter__(self):
        """Begin timing when a ``with`` block is entered and return the timer."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """End timing when the ``with`` block exits; exceptions are not suppressed."""
        self.stop()

    def reset(self):
        """Return the timer to its initial, stopped state."""
        self.elapsed, self.is_running = 0, False

    def start(self):
        """Record the start time unless a measurement is already in progress."""
        if not self.is_running:
            self.is_running = True
            self.start_time = time.perf_counter()

    def stop(self):
        """Freeze ``elapsed`` at the time since the last start; no-op when idle."""
        if self.is_running:
            self.is_running = False
            self.elapsed = time.perf_counter() - self.start_time

    def get_current_elapsed(self):
        """Return the elapsed seconds, refreshing the value while still running."""
        if self.is_running:
            self.elapsed = time.perf_counter() - self.start_time
        return self.elapsed


# Storage Helpers
class CustomEncoder(json.JSONEncoder):
    """JSON encoder that can serialize objects exposing a dict-conversion method.

    For a value the standard encoder cannot handle, the first available of
    ``to_dict``, ``as_dict`` and ``model_dump`` (checked in that order) is
    called and its result is encoded instead.
    """

    def default(self, obj):
        """Return a serializable stand-in for ``obj`` or defer to the base class.

        The base implementation raises ``TypeError`` for unsupported objects.
        """
        for converter_name in ('to_dict', 'as_dict', 'model_dump'):
            if hasattr(obj, converter_name):
                return getattr(obj, converter_name)()
        return super().default(obj)


def create_directory(dir: str, clear_if_not_empty: bool = False) -> str:
    """Ensure ``dir`` exists and optionally delete the files directly inside it.

    Args:
        dir: Directory path to create (parents included) if it is missing.
        clear_if_not_empty: When true, remove every regular file found
            directly in ``dir``. Sub-directories are left untouched.

    Returns:
        The ``dir`` argument, unchanged.
    """
    os.makedirs(dir, exist_ok=True)

    if not clear_if_not_empty:
        return dir

    entry_paths = (os.path.join(dir, entry) for entry in os.listdir(dir))
    for entry_path in entry_paths:
        # Only plain files are deleted; directories fail the isfile check.
        if os.path.isfile(entry_path):
            os.remove(entry_path)

    return dir


def create_json_file(fpath: str, data: object, indent: int = 4) -> None:
    """Write ``data`` to ``fpath`` as JSON, creating parent directories first.

    Args:
        fpath: Destination file path. A bare file name (no directory part)
            is written relative to the current working directory.
        data: Any value serializable by ``json`` or by ``CustomEncoder``.
        indent: Indentation width passed to ``json.dump``.

    Raises:
        TypeError: If ``data`` contains a value the encoder cannot serialize.
        OSError: If the directory or file cannot be created.
    """
    parent_dir = os.path.dirname(fpath)
    # A bare file name has an empty dirname, and os.makedirs("") raises, so
    # only create the parent when there is one. exist_ok avoids a race between
    # an existence check and the creation.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(fpath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=indent, cls=CustomEncoder)
        

# OpenAI Helpers
def get_embedding(text: str):
    """Return the embedding of ``text`` from the configured embedding deployment.

    Sends ``text`` to the shared Azure OpenAI client using the deployment
    named by the ``EMBEDDING_DEPLOYMENTNAME`` setting and returns the
    ``embedding`` of the first item in the response.
    """
    response = openai_client.embeddings.create(
        input=text,
        model=env_vars["EMBEDDING_DEPLOYMENTNAME"],
    )
    first_item = response.data[0]
    return first_item.embedding

## Prompts

### Self-Classification

This prompting technique uses the LLM to determine the key observations/challenges associated with the provided data point. The criteria for determining classifications are based on common issues and opportunities for improving cloud products/services.

In [None]:
# System prompt for the self-classification mode, in which the model names its
# own key observations instead of picking from a pre-defined list.
# This text is sent verbatim: it is never passed through str.format as a
# template, so the JSON examples use single braces. Doubled "{{ }}" would reach
# the model literally.
# The observation limit is stated once (maximum of 5); the conflicting
# "no limit" instruction is removed.
self_classify_prompt = """You are a helpful AI assistant for determining a list of key observations from attributions and verbatims from customers of cloud providers (e.g., Azure, AWS, GCP).
The key observations will be used to help program managers, product owners, and engineers understand the most common issues and opportunities for improvement in their products and services.

## On your ability to determine key observations

- Use the provided customer project data to determine the top key observations, regardless of your own knowledge or information.
- Key observations should be specific and detailed to provide the most value.
- Ensure the key observations are based on the evidence provided.
- Each observation should be unique.
- Provide a maximum of 5 key observations.

## Key Observation Examples

{
  "short_name": "High Cost"
}

{
  "short_name": "Containers"
}

{
  "short_name": "AI"
}"""

### Pre-Defined Classification

This prompting technique takes a list of pre-defined classification names and descriptions to help the model narrow down the scope of the classification. The output will be based on the most relevant matches to the list.

In [None]:
# System prompt template for the pre-defined classification mode.
# The {classifications} placeholder is filled with str.format using a JSON
# dump of the classification names and descriptions.
classify_prompt = """Using the provided classification list, classify the attributions and verbatims from customers of cloud providers (e.g., Azure, AWS, GCP) into one or more of the classifications based on the given facts and data.

## On your ability to classify

- Use the data provided to classify, regardless of your own knowledge or information.
- You must only use the classification list provided.
- Ensure the data classifications are based on facts, both quantitative and qualitative.
- You must only classify data based on known facts. Do not assume or provide indication that a classification is associated if detail is not provided.
- Only return relevant data classifications that you are highly confident of.
- Provide a maximum of 5 classifications per data point.
- If no classification is applicable, provide an empty list. Ignore classifications that are not applicable.

## Classifications

{classifications}
"""

### Fact Generation

To support both classification techniques, this prompting technique will use the model to extract key facts from the provided context, and make logical inferences based on the context to create a fact sheet. This will be used in conjunction with the context to reduce the chances of hallucination, improving the quality of the output.

In [None]:
# User prompt template for the fact-generation step.
# Filled with str.format: {task} receives the task prompt text and {context}
# receives the rendered context for the data point being processed.
fact_prompt = """Below I will present you with a user's request, and potential relevant context to help you solve it.

Based on the user's request, use the context to answer the following survey to the best of your ability.

Here is the user's request:

```
{task}
```

Here is the context:

{context}

Here is the survey:

1. List any specific facts or figures that are GIVEN based on the request. It is possible that there are none.
2. List any facts that are recalled from memory, your knowledge, or well-reasoned assumptions, etc.

When answering this survey, keep in mind that facts will typically be specific details.
Provide as many facts as you can, even if they seem trivial or unimportant.
"""

In [None]:
# Template that pairs a data point with its fact sheet.
# Filled with str.format: {context} receives the JSON-encoded attribution and
# verbatim, and {facts} receives either a placeholder string (before fact
# generation) or the generated facts.
context_prompt = """Context: {context}

Facts: {facts}"""

## Process request

In [None]:
# One observation/classification produced by the model for a data point.
# `short_name` is later compared against the pre-defined classification names
# and, in self-classification mode, is the text that gets embedded.
# NOTE(review): documented with comments rather than a class docstring because
# a docstring would presumably be emitted into the generated JSON schema; verify.
class Observation(BaseModel):
    short_name: str = Field(description="Short name of the key observation/challenge.")
    key_insights: List[str] = Field(
        description="Detailed list of the key insights that support the observation/challenge, including qualitative and quantitative data.")
    
# Top-level structured-output model for the classification call: a wrapper
# holding the list of Observation items returned for one data point.
class Observations(BaseModel):
    observations: List[Observation] = Field(description="List of classifications.")
    
# Structured-output model for the fact-generation call. Its two fields mirror
# the two survey questions in the fact prompt (given facts, recalled facts).
# Both fields accept None but declare no default value.
class RequestFactsModel(BaseModel):
    given_facts: Optional[List[str]] = Field(
        description="Any specific facts or figures that are GIVEN based on the request. It is possible that there are none.")
    recalled_facts: Optional[List[str]] = Field(
        description="any facts that are recalled from memory, your knowledge, or well-reasoned assumptions, etc.")

### [Pre-Determined Classification] - Generate classification embeddings

When adopting the non-self classification technique, in order to determine the similarity of the LLM classifications to the context provided, first generate the embeddings for the classification descriptions.

These will be used later for cosine similarity calculations.

In [None]:
# Location of the cached classification list (including embeddings).
classifications_cache_fpath = os.path.join(input_dir, "classifications.json")

# Prefer the cached list when reloading is allowed, so embeddings are not
# regenerated on every run.
# NOTE: the cached file replaces the in-notebook `classifications`; delete it
# (or disable reloading) after editing the list above, otherwise the edits are
# ignored.
if reload_output_if_exists and os.path.exists(classifications_cache_fpath):
    # The context manager guarantees the file handle is closed after loading.
    with open(classifications_cache_fpath, "r") as cache_file:
        classifications = json.load(cache_file)

classifications_updated = False

# Embeddings are only needed for the pre-defined classification mode.
if not self_classify:
    for classification in classifications:
        if "embedding" not in classification:
            classification["embedding"] = get_embedding(classification["description"])
            classifications_updated = True

# Persist only when at least one new embedding was generated.
if classifications_updated:
    create_json_file(classifications_cache_fpath, classifications)

### Load previous data

To reduce reliance on consistently loading data out of an input CSV file, the data stored as JSON in the output directory is loaded back into the notebook.

In [None]:
# Data points restored from earlier runs (one JSON file per data point).
data = []

can_reload_data = reload_output_if_exists and os.path.exists(output_dir)
stored_json_names = (
    [name for name in os.listdir(output_dir) if name.endswith(".json")]
    if can_reload_data
    else []
)

for stored_json_name in stored_json_names:
    with open(os.path.join(output_dir, stored_json_name), 'r') as stored_file:
        data.append(json.load(stored_file))

print(f"Loaded {len(data)} existing data points")

### Load previous data classifications

Similarly, the classifications for each data point are loaded back into the notebook so that they are not processed again.

In [None]:
# Classification results restored from earlier runs, and the response ids
# they belong to (taken from the file names).
all_dp_observations = []
processed_response_ids = []

if reload_output_if_exists and os.path.exists(classifications_output_dir):
    for f in os.listdir(classifications_output_dir):
        if not f.endswith(".json"):
            continue

        with open(os.path.join(classifications_output_dir, f), "r") as file:
            dp_observations = json.load(file)

        # Each file stores ONE result dict for one data point, so it must be
        # appended as a whole; list.extend() on a dict would add its keys.
        all_dp_observations.append(dp_observations)
        # Files are named "<response_id>.json". splitext strips only that
        # final extension, so ids that contain dots stay intact.
        processed_response_ids.append(os.path.splitext(f)[0])

print(f"Loaded {len(all_dp_observations)} existing data point classifications")

### Load data points from CSV

For any data points not yet processed, we load from the CSV file. 

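The `skip_chunks` variable controls which data points are loaded for processing: the CSV is read in chunks of 10 rows, the first `skip_chunks` chunks are skipped, and only the next chunk is processed. This is for testing purposes only, to avoid processing thousands of data points.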

An embedding is created for each data point which will be later used to determine the similarity to the classification embeddings.

The data points are then saved out to the output directory as JSON.

In [None]:
def _as_text(value) -> str:
    """Return ``value`` as a string, mapping missing cells (NaN/None) to ""."""
    return "" if pd.isna(value) else str(value)


try:
    # Read the CSV lazily in chunks of 10 rows; only one chunk is processed per run.
    chunk_iterator = pd.read_csv(csv_fpath, chunksize=10)

    # Number of leading chunks to skip before the chunk that gets processed.
    skip_chunks = 0
    for _ in range(skip_chunks):
        next(chunk_iterator, None)

    df = next(chunk_iterator, None)
    if df is not None:
        # Build the set of already-known ids once instead of re-scanning
        # `data` for every row.
        known_response_ids = set(processed_response_ids) | {dp["response_id"] for dp in data}

        for _, row in df.iterrows():
            response_id = row["ResponseId"]
            if response_id in known_response_ids:
                continue

            # Cells may be NaN or numeric; normalise to text so that
            # startswith() and ", ".join() cannot fail on non-string values.
            q005 = _as_text(row["Q005"])
            q005_is_other = q005.startswith("Other")

            attribution = [
                _as_text(row["BrandAssigned"]),
                _as_text(row["SAM11"]),
                _as_text(row["TAXALN"]),
                "Other" if q005_is_other else q005,
                _as_text(row["Q005_996_TEXT"]) if q005_is_other else "",
                _as_text(row["Q009_"]),
                _as_text(row["Cloud_Usage"]),
                _as_text(row["Q082"]),
                _as_text(row["Q048b"]),
                # NOTE(review): any value other than the "#NULL!" marker
                # (including a missing cell) counts as ISV / Startup here;
                # confirm that this matches the survey export.
                "" if row["Q089a_2"] == "#NULL!" else "ISV",
                "" if (row["Q102a"] == "No" or row["Q102a"] == "#NULL!") else "Startup"
            ]

            dp = {
                "response_id": response_id,
                "attribution": ", ".join(attribution),
                "verbatim": str(row["Q024b"]),
            }

            dp["embedding"] = get_embedding(dp["verbatim"])
            data.append(dp)
            # Keep the set current so duplicate rows in this chunk are skipped.
            known_response_ids.add(response_id)

            create_json_file(os.path.join(output_dir, f"{dp['response_id']}.json"), dp)
except Exception as e:
    # Best-effort cell: report and continue with whatever was loaded. The
    # exception type is included because e.g. a KeyError message alone is
    # just the missing column name.
    print(f"Error: {type(e).__name__}: {e}")

In [None]:
# Keep only the data points that have no stored classification yet.
filtered_data = []
for candidate in data:
    if candidate["response_id"] in processed_response_ids:
        continue
    filtered_data.append(candidate)

print(f"Determined {len(filtered_data)} new data points for classification")

### Classification via LLM

For each data point, the LLM is used to determine the key observations/challenges associated with it.

First, the fact sheet is generated for the data point.

Next, the fact sheet and context are concatenated and passed to the LLM for classification.

After classification, the embedding of each data point classification is compared to the embedding of the original context to determine the similarity for that classification. _Note: We follow this approach rather than asking the LLM for a score, as LLMs are not well-suited for determining numeric similarity between texts. Prompting the LLM to return a score would result in wildly varying scores._

#### Self-Classification

For the self-classification scenario, the LLM is used to determine its own key observations/challenges associated with the provided data point. The criteria for determining classifications are based on common issues and opportunities for improving cloud products/services.

#### Pre-Defined Classification

For the pre-defined classification scenario, the `classifications` are used to narrow down the scope of the classification using the LLM. The output will be based on the most relevant matches to the list.

In [None]:
# Loop-invariant setup: the system prompt and the classification-name ->
# embedding lookup do not depend on the data point, so build them once.
if self_classify:
    system_prompt = self_classify_prompt
    classification_embeddings = {}
else:
    classifications_json = [{"classification": c["classification"], "description": c["description"]} for c in classifications]
    system_prompt = classify_prompt.format(classifications=json.dumps(classifications_json))
    classification_embeddings = {}
    for c in classifications:
        # setdefault keeps the first entry when a name occurs more than once.
        if "embedding" in c:
            classification_embeddings.setdefault(c["classification"], c["embedding"])


def process_observation(obs, dp_embedding):
    """Attach an embedding and a similarity score to one observation.

    Args:
        obs: Observation dict with at least a ``short_name`` key; it is
            updated in place with ``embedding`` and ``similarity``.
        dp_embedding: Embedding of the data point's verbatim.

    Returns:
        The same ``obs`` dict, updated.
    """
    embedding = None if self_classify else classification_embeddings.get(obs["short_name"])
    if embedding is None:
        # Self-classification mode, or the model returned a name that is not
        # in the pre-defined list: embed the name itself rather than aborting
        # the whole run on a failed lookup.
        embedding = get_embedding(obs["short_name"])
    similarity = cosine_similarity([dp_embedding], [embedding])
    obs["embedding"] = embedding
    obs["similarity"] = round(float(similarity[0][0]), 1)
    return obs


for dp in filtered_data:
    with Stopwatch() as sw:
        prompt_tokens = 0
        completion_tokens = 0

        dp_json = json.dumps({
            "attribution": dp["attribution"],
            "verbatim": dp["verbatim"]
        }, cls=CustomEncoder)

        # 1 - Determine any facts from the data
        context_content = context_prompt.format(context=dp_json, facts="To be determined")

        # NOTE(review): the fact step always uses the self-classification
        # prompt as its task, even in pre-defined mode. Confirm this is intended.
        planning_messages = [ChatCompletionUserMessageParam(role="user", content=fact_prompt.format(task=self_classify_prompt, context=context_content))]

        fact_completion = openai_client.beta.chat.completions.parse(
            model=env_vars["CHATCOMPLETION_DEPLOYMENTNAME"],
            messages=planning_messages,
            response_format=RequestFactsModel,
            temperature=0.3,
            top_p=0.3
        )

        facts = fact_completion.choices[0].message.parsed.model_dump()
        prompt_tokens += fact_completion.usage.prompt_tokens
        completion_tokens += fact_completion.usage.completion_tokens

        context_content = context_prompt.format(context=dp_json, facts=facts)

        # 2 - Classify the input data; only the system prompt differs between
        #     self-classification and pre-defined classification.
        execute_messages = [
            ChatCompletionSystemMessageParam(role="system", content=system_prompt),
            ChatCompletionUserMessageParam(role="user", content=context_content)
        ]

        observation_completion = openai_client.beta.chat.completions.parse(
            model=env_vars["CHATCOMPLETION_DEPLOYMENTNAME"],
            messages=execute_messages,
            response_format=Observations,
            temperature=0.3,
            top_p=0.3
        )

        dp_observations = observation_completion.choices[0].message.parsed.model_dump()
        prompt_tokens += observation_completion.usage.prompt_tokens
        completion_tokens += observation_completion.usage.completion_tokens

        # 3 - Get similarity score between the verbatim and the observations
        with ThreadPoolExecutor() as executor:
            dp_observations["observations"] = list(
                executor.map(lambda obs: process_observation(obs, dp["embedding"]), dp_observations["observations"])
            )

        execution_time = sw.get_current_elapsed()

        dp_observations["response_id"] = dp["response_id"]
        dp_observations["execution_time"] = execution_time
        dp_observations["prompt_tokens"] = prompt_tokens
        dp_observations["completion_tokens"] = completion_tokens

        all_dp_observations.append(dp_observations)

        # Persist per data point so an interrupted run can resume.
        create_json_file(os.path.join(classifications_output_dir, f"{dp['response_id']}.json"), dp_observations)

In [None]:
output_csv_fpath = os.path.join(output_dir, "output.csv")

# Index the classification results by response id once, instead of scanning
# the list for every data point. Entries that are not result dicts are
# ignored defensively; setdefault keeps the first result per id.
observations_by_response_id = {}
for entry in all_dp_observations:
    if isinstance(entry, dict) and "response_id" in entry:
        observations_by_response_id.setdefault(entry["response_id"], entry)

output_data = []

for dp in data:
    dp_classifications = observations_by_response_id.get(dp["response_id"])
    if dp_classifications is None:
        # A data point may be loaded but not classified yet (e.g. an earlier
        # run stopped midway); report it instead of aborting the export.
        print(f"No classifications found for response {dp['response_id']}; skipping")
        continue

    dp_keywords = ", ".join([obs["short_name"] for obs in dp_classifications["observations"]])
    dp_scores = ", ".join([f"{obs['short_name']}: {obs['similarity']}" for obs in dp_classifications["observations"]])

    output_data.append([dp["response_id"], dp["attribution"], dp["verbatim"], dp_keywords, dp_scores])

# The output directory may not exist yet when no data point has been stored.
os.makedirs(output_dir, exist_ok=True)

output_df = pd.DataFrame(output_data, columns=["ResponseId", "Attribution", "Verbatim", "Keyword", "Model Classification Scores"])
output_df.to_csv(output_csv_fpath, index=False)