In [None]:
# File name of the survey CSV to process; it is looked up inside the notebook's "input" folder
# and its base name (without extension) also names the output folder.
# NOTE(review): empty here — presumably filled in (or injected as a parameter) before running; confirm.
csv_fname = ""

In [18]:
# When True, JSON files already present in the output folders are loaded and reused
# instead of being recomputed.
reload_output_if_exists = True
# When True, the model derives its own observations (results go to "self_classifications");
# when False, it classifies against the pre-determined `classifications` table.
self_classify = False

In [19]:
# For processing with pre-determined classifications.
# Markdown table injected into `classify_prompt`; every row must be closed with a
# trailing `|` so the table stays well-formed when the model reads it.
classifications = """
| Classification | Description |
|---|---|
| Integration | Key issues include difficulty integrating capabilities of cloud services into solutions, ensuring seamless interoperability. |
| Breadth | Key issues include navigating an overwhelming range of service options. |
| Containers | Key issues include challenges in container orchestration, ensuring compatibility in containerized environments. |
| Compute | Key issues include optimizing compute resources (e.g., VMs, containers, serverless) for performance and efficient resource utilization. |
| Ease | Key issues include steep learning curves and inconsistent user experiences with cloud services. |
| Portal | Key issues include cumbersome navigation, non-intuitive interfaces, and a lack of intuitive design. |
| High Cost / Pricing | Key issues include unexpected costs, complex pricing models, and difficulty in predicting costs. |
| Skill | Key issues include gaps in technical expertise, steep training requirements, and the need for specialized skills. |
| Customer Support | Key issues include delayed support response times, inconsistent service quality, and limited access to knowledgeable support resources. |
| Migration | Key issues include navigating the complexities of migrating workloads between cloud providers, from on-premises environments, or hybrid environments. |
| Privacy | Key issues include data confidentiality concerns, compliance with data protection regulations, and ensuring data security. |
| Database | Key issues include database performance bottlenecks, data scalability challenges, and managing complex database configurations. |
| Documentation | Key issues include outdated or incomplete documentation, lack of examples, and difficulty in finding relevant information. |
| Getting Started | Key issues include overwhelming setup procedures, unclear onboarding processes, and lack of guidance for new users. |
| AI | Key issues include integration challenges with AI into existing solutions, ensuring AI models are accurate and efficient, and managing AI workloads. |
| Missing Features | Key issues include the absence of critical features, limitations in functionality, and gaps in service offerings. |
| Reliability | Key issues include service outages, inconsistent performance, and lack of redundancy in cloud services. |
| Innovate | Key issues include slow adoption of cutting-edge technologies, limited support for emerging features, and difficulty integrating new technologies. |
| Automation | Key issues include complexities in automating cloud services, ensuring automation scripts are reliable, and managing automated workflows. |
| Scaling | Key issues include challenges in scaling resources to meet demand, optimizing resource allocation, and ensuring performance at scale. |
| Lock-In | Key issues include dependency on proprietary cloud services, challenges in migrating away from cloud providers, and concerns about vendor lock-in. |
| Select Service | Key issues include confusion from an abundance of similar services, difficulty in selecting the right service for specific use cases, and lack of clear differentiation between services. |
| Networking | Key issues include complex network configurations, challenges in optimizing network performance, and managing network security. |
| IAM | Key issues include managing intricate identity and access management policies, ensuring secure access controls, and preventing unauthorized access. |
| Monitor | Key issues include setting up comprehensive observability, integrating effective logging and monitoring solutions, and ensuring timely alerts. |
| OS | Key issues include compatibility issues with operating systems, managing OS configurations, and ensuring OS security. |
| Multi-Cloud | Key issues include interoperability challenges between cloud providers, managing multiple cloud environments, and ensuring consistent performance across clouds. |
| None | No challenges. |
"""

In [20]:
import os
# Root directory used to locate the configuration and notebook folders.
# The relative path is resolved against the current working directory, so the
# result depends on where the kernel was started.
wdir = os.path.abspath('../../../../')

In [21]:

import json
from dotenv import dotenv_values
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai.types.chat.chat_completion import ChatCompletionMessage
from openai.types.chat.chat_completion_system_message_param import ChatCompletionSystemMessageParam
from openai.types.chat.chat_completion_user_message_param import ChatCompletionUserMessageParam
from typing import Optional, List
from pydantic import BaseModel, Field
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

In [22]:
# Settings (endpoint and deployment names) are read from the .env file under <wdir>/configuration.
env_vars = dotenv_values(f"{wdir}/configuration/.env")
notebooks_path = os.path.join(wdir, "src/console/notebooks/cic/")

# Input CSV lives in "<notebooks_path>/input"; outputs go to "<notebooks_path>/output/<csv base name>".
csv_fpath = os.path.join(notebooks_path, "input", csv_fname)
output_dir = os.path.join(notebooks_path, "output", os.path.splitext(os.path.basename(csv_fname))[0])
# Classification results are kept in a sub-folder whose name depends on the `self_classify` mode.
classifications_output_dir = os.path.join(output_dir, "self_classifications" if self_classify else "classifications")

# Every credential source listed below is switched off; only sources left enabled by default are tried.
# NOTE(review): presumably this leaves the Azure CLI login as the effective source — confirm
# against the installed azure-identity version.
credential = DefaultAzureCredential(
    exclude_workload_identity_credential=True,
    exclude_developer_cli_credential=True,
    exclude_environment_credential=True,
    exclude_managed_identity_credential=True,
    exclude_powershell_credential=True,
    exclude_shared_token_cache_credential=True,
    exclude_interactive_browser_credential=True
)

# Token provider for the Cognitive Services scope; passed to the client below instead of an API key.
openai_token_provider = get_bearer_token_provider(credential, 'https://cognitiveservices.azure.com/.default')

# Shared client used by the embedding helper and the chat-completion calls in later cells.
openai_client = AzureOpenAI(
    azure_endpoint=env_vars["AOAI_ENDPOINT"],
    azure_ad_token_provider=openai_token_provider,
    api_version="2024-12-01-preview"
)

In [23]:
# Timing Helpers
class Stopwatch:
    """Small wall-clock timer that can be driven manually or used as a context manager.

    `elapsed` holds the duration (in seconds, from `time.perf_counter`) measured by the
    most recent run; it is overwritten, not accumulated, each time the watch is stopped
    or polled while running.
    """

    # Class-level defaults; instances shadow them on first assignment.
    elapsed = 0
    is_running = False

    def __enter__(self):
        """Start timing and hand back the stopwatch itself."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Stop timing on leaving the `with` block; exceptions are not suppressed."""
        self.stop()

    def reset(self):
        """Return to the initial state: zero elapsed time, not running."""
        self.elapsed, self.is_running = 0, False

    def start(self):
        """Begin a new measurement; does nothing if one is already in progress."""
        if not self.is_running:
            self.is_running = True
            self.start_time = time.perf_counter()

    def stop(self):
        """End the current measurement and record its duration; no-op when idle."""
        if self.is_running:
            self.is_running = False
            self.elapsed = time.perf_counter() - self.start_time

    def get_current_elapsed(self):
        """Return the elapsed seconds, refreshing the value first if still running."""
        if self.is_running:
            self.elapsed = time.perf_counter() - self.start_time
        return self.elapsed


# Storage Helpers
class CustomEncoder(json.JSONEncoder):
    """JSON encoder that serialises objects exposing a dict-conversion method."""

    def default(self, obj):
        """Convert `obj` via the first of `to_dict`, `as_dict`, `model_dump` it provides.

        Objects offering none of these fall through to the base encoder, which
        raises `TypeError` for unsupported types.
        """
        # Order matters: it fixes which method wins when an object has several.
        for converter_name in ('to_dict', 'as_dict', 'model_dump'):
            if hasattr(obj, converter_name):
                return getattr(obj, converter_name)()
        return super().default(obj)


def create_directory(dir: str, clear_if_not_empty: bool = False) -> str:
    """Ensure `dir` exists (creating parents as needed) and return its path.

    Args:
        dir: Directory path to create.
        clear_if_not_empty: When True, delete the regular files directly inside
            `dir`; sub-directories are left untouched.

    Returns:
        The same `dir` path that was passed in.
    """
    os.makedirs(dir, exist_ok=True)

    if not clear_if_not_empty:
        return dir

    for entry_name in os.listdir(dir):
        entry_path = os.path.join(dir, entry_name)
        # Only plain files are removed; nested folders survive the clean-up.
        if os.path.isfile(entry_path):
            os.remove(entry_path)

    return dir


def create_json_file(fpath: str, data: object, indent: int = 4) -> None:
    """Serialise `data` as JSON to `fpath`, creating the parent directory if needed.

    Args:
        fpath: Destination file path; an existing file is overwritten.
        data: Any value `CustomEncoder` can serialise.
        indent: Indentation width passed to `json.dump`.
    """
    parent_dir = os.path.dirname(fpath)
    # A bare file name has an empty dirname; os.makedirs("") would raise, so only
    # create the parent when there actually is one.
    if parent_dir and not os.path.exists(parent_dir):
        create_directory(parent_dir)

    with open(fpath, 'w') as f:
        json.dump(data, f, indent=indent, cls=CustomEncoder)
        

# OpenAI Helpers
def get_embedding(text: str):
    """Request an embedding for `text` from the configured embedding deployment.

    Returns the `embedding` attribute of the first item in the response data.
    """
    response = openai_client.embeddings.create(
        model=env_vars["EMBEDDING_DEPLOYMENTNAME"],
        input=text,
    )
    first_item = response.data[0]
    return first_item.embedding

## Prompts

In [24]:
# Prompt used when the model derives its own observations (self_classify mode); it is
# also embedded as the "task" of the fact-gathering prompt.
# This text is sent as-is and is never passed through str.format(), so the braces in the
# examples are written singly; doubled braces would reach the model literally.
self_classify_prompt = """You are a helpful AI assistant for determining a list of key observations from attributions and verbatims from customers of cloud providers (e.g., Azure, AWS, GCP).
The key observations will be used to help program managers, product owners, and engineers understand the most common issues and opportunities for improvement in their products and services.

## On your ability to determine key observations

- Use the provided customer project data to determine the top key observations, regardless of your own knowledge or information.
- Key observations should be specific and detailed to provide the most value.
- Ensure the key observations are based on the evidence provided.
- Each observation should be unique.
- There is no limit to the number of key observations you can provide.

## Key Observation Examples

{
  "short_name": "High Cost",
}

{
  "short_name": "Containers",
}

{
  "short_name": "AI",
}"""

In [25]:
# System prompt for the pre-determined classification mode.
# It is a str.format() template: the {classifications} placeholder is filled with the
# markdown classification table before the prompt is sent.
classify_prompt = """Using the provided classification list, classify the attributions and verbatims from customers of cloud providers (e.g., Azure, AWS, GCP) into one or more of the classifications based on the given facts and data.

## On your ability to classify

- Use the data provided to classify, regardless of your own knowledge or information.
- You must only use the classification list provided.
- Ensure the data classifications are based on facts, both quantitative and qualitative.
- You must only classify data based on known facts. Do not assume or provide indication that a classification is associated if detail is not provided.
- Only return relevant data classifications that you are highly confident of.
- There is no limit to the number of data classifications you can provide.
- If no classification is applicable, provide an empty list. Ignore classifications that are not applicable.

## Classifications

{classifications}
"""

In [26]:
# User prompt for the fact-gathering step that runs before classification.
# It is a str.format() template with two placeholders: {task} (the request text being
# worked on) and {context} (the rendered datapoint context).
fact_prompt = """Below I will present you with a user's request, and potential relevant context to help you solve it.

Based on the user's request, use the context to answer the following survey to the best of your ability.

Here is the user's request:

```
{task}
```

Here is the context:

{context}

Here is the survey:

1. List any specific facts or figures that are GIVEN based on the request. It is possible that there are none.
2. List any facts that are recalled from memory, your knowledge, or well-reasoned assumptions, etc.

When answering this survey, keep in mind that facts will typically be specific details.
Provide as many facts as you can, even if they seem trivial or unimportant.
"""

In [27]:
# str.format() template that pairs a datapoint ({context}, the JSON-encoded attribution
# and verbatim) with its {facts}; {facts} is a placeholder string on the first pass and
# the extracted facts on the second.
context_prompt = """Context: {context}

Facts: {facts}"""

## Process request

In [28]:
# Structured-output schema for one observation/classification produced by the model.
# NOTE(review): documented with comments rather than a class docstring — presumably a
# docstring would be copied into the generated JSON schema sent to the model; confirm
# before adding one.
class Observation(BaseModel):
    # Label of the observation; later embedded and compared against the verbatim.
    short_name: str = Field(description="Short name of the key observation/challenge.")
    # Supporting evidence for the observation, as free-text items.
    key_insights: List[str] = Field(
        description="Detailed list of the key insights that support the observation/challenge, including qualitative and quantitative data.")
    
# Top-level response schema for the classification step: a wrapper holding the list of
# observations returned for a single datapoint.
class Observations(BaseModel):
    observations: List[Observation] = Field(description="List of classifications.")
    
# Response schema for the fact-gathering step; mirrors the two survey questions in
# `fact_prompt`. Both fields accept None as well as a list of strings.
class RequestFactsModel(BaseModel):
    # Facts stated directly in the request/context.
    given_facts: Optional[List[str]] = Field(
        description="Any specific facts or figures that are GIVEN based on the request. It is possible that there are none.")
    # Facts the model supplies from its own knowledge or assumptions.
    recalled_facts: Optional[List[str]] = Field(
        description="any facts that are recalled from memory, your knowledge, or well-reasoned assumptions, etc.")

In [29]:
# Embedded survey datapoints, one dict per response
# (keys: response_id, attribution, verbatim, embedding).
data = []

# Reuse datapoints cached by a previous run. isdir() (rather than exists()) keeps a stray
# file at this path from breaking os.listdir().
if reload_output_if_exists and os.path.isdir(output_dir):
    # os.listdir() order is arbitrary; sorting keeps `data` in the same order on every run.
    for cached_name in sorted(os.listdir(output_dir)):
        if cached_name.endswith(".json"):
            with open(os.path.join(output_dir, cached_name), 'r') as cached_file:
                data.append(json.load(cached_file))

In [30]:
def _cell_text(value) -> str:
    """Return a CSV cell as text; missing values (NaN/None) become an empty string."""
    return "" if pd.isna(value) else str(value)


def _build_attribution(row) -> str:
    """Join the attribution columns of one survey row into a comma-separated string.

    Cells are converted with `_cell_text` first, so empty or numeric cells no longer
    break `.startswith` or `", ".join`.
    """
    q005 = _cell_text(row["Q005"])
    is_other = q005.startswith("Other")
    parts = [
        row["BrandAssigned"],
        row["SAM11"],
        row["TAXALN"],
        # "Other..." answers are collapsed to "Other" and followed by their free-text value.
        "Other" if is_other else q005,
        row["Q005_996_TEXT"] if is_other else "",
        row["Q009_"],
        row["Cloud_Usage"],
        row["Q082"],
        row["Q048b"],
        "" if row["Q089a_2"] == "#NULL!" else "ISV",
        "" if (row["Q102a"] == "No" or row["Q102a"] == "#NULL!") else "Startup",
    ]
    return ", ".join(_cell_text(part) for part in parts)


# Only read and embed the CSV when nothing was reloaded from the output folder.
if len(data) == 0:
    try:
        # The chunked reader holds an open file handle; the context manager closes it
        # even though the loop below exits early.
        with pd.read_csv(csv_fpath, chunksize=10) as chunk_iterator:
            for df in chunk_iterator:
                for _, row in df.iterrows():
                    dp = {
                        "response_id": row["ResponseId"],
                        "attribution": _build_attribution(row),
                        "verbatim": str(row["Q024b"]),
                    }

                    dp["embedding"] = get_embedding(dp["verbatim"])
                    data.append(dp)

                    # Persist each datapoint immediately so an interrupted run can be resumed.
                    create_json_file(os.path.join(output_dir, f"{dp['response_id']}.json"), dp)

                break  # Exiting based on the first chunk for testing purposes

    except Exception as e:
        print(f"Error: {e}")

In [31]:
# Results gathered so far: one dict per datapoint, in the same shape the main loop appends.
all_dp_observations = []
# Response ids (as strings, taken from file names) that already have a saved result.
processed_response_ids = []

if reload_output_if_exists and os.path.exists(classifications_output_dir):
    for f in sorted(os.listdir(classifications_output_dir)):
        if f.endswith(".json"):
            with open(os.path.join(classifications_output_dir, f), "r") as file:
                dp_observations = json.load(file)
            # Each file holds one result dict; append() keeps it whole, whereas extend()
            # would add the dict's keys as separate list items.
            all_dp_observations.append(dp_observations)
            # splitext() drops only the ".json" suffix, so ids containing dots stay intact.
            processed_response_ids.append(os.path.splitext(f)[0])

# File-derived ids are strings, so compare against the string form of each response id.
already_processed = set(processed_response_ids)
filtered_data = [d for d in data if str(d["response_id"]) not in already_processed]

In [None]:
def _parse_structured(messages, response_format):
    """Run one structured-output chat completion.

    Args:
        messages: Chat messages to send.
        response_format: Pydantic model class the response is parsed into.

    Returns:
        A `(parsed_dict, usage)` tuple: the parsed model dumped to a dict, and the
        completion's token usage object.

    Raises:
        ValueError: If the completion carries no parsed payload (e.g. a refusal).
    """
    completion = openai_client.beta.chat.completions.parse(
        model=env_vars["CHATCOMPLETION_DEPLOYMENTNAME"],
        messages=messages,
        response_format=response_format,
        temperature=0.3,
        top_p=0.3
    )
    message = completion.choices[0].message
    if message.parsed is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f"Model returned no structured output: {getattr(message, 'refusal', None)}")
    return message.parsed.model_dump(), completion.usage


# The system prompt does not depend on the datapoint, so it is built once up front.
system_prompt = (
    self_classify_prompt
    if self_classify
    else classify_prompt.format(classifications=classifications)
)

for dp in filtered_data:
    with Stopwatch() as sw:
        dp_json = json.dumps({
            "attribution": dp["attribution"],
            "verbatim": dp["verbatim"]
        }, cls=CustomEncoder)

        # 1 - Determine any facts from the data
        context_content = context_prompt.format(context=dp_json, facts="To be determined")

        # NOTE(review): the fact-gathering task is always `self_classify_prompt`, even when
        # `self_classify` is False — confirm this is intended.
        planning_messages = [ChatCompletionUserMessageParam(role="user", content=fact_prompt.format(task=self_classify_prompt, context=context_content))]

        facts, fact_usage = _parse_structured(planning_messages, RequestFactsModel)

        # 2 - Self-classify, or classify against the provided classifications,
        #     depending on which system prompt was selected above
        context_content = context_prompt.format(context=dp_json, facts=facts)

        execute_messages = [
            ChatCompletionSystemMessageParam(role="system", content=system_prompt),
            ChatCompletionUserMessageParam(role="user", content=context_content)
        ]

        dp_observations, observation_usage = _parse_structured(execute_messages, Observations)

        prompt_tokens = fact_usage.prompt_tokens + observation_usage.prompt_tokens
        completion_tokens = fact_usage.completion_tokens + observation_usage.completion_tokens

        # 3 - Get similarity score between the verbatim and the observations
        for observation in dp_observations["observations"]:
            observation["embedding"] = get_embedding(observation["short_name"])
            similarity = cosine_similarity([dp["embedding"]], [observation["embedding"]])
            observation["similarity"] = round(similarity[0][0], 2)

        # Read the timer while it is still running so the value covers all three steps.
        execution_time = sw.get_current_elapsed()

        dp_observations["execution_time"] = execution_time
        dp_observations["prompt_tokens"] = prompt_tokens
        dp_observations["completion_tokens"] = completion_tokens

        all_dp_observations.append(dp_observations)

        # Saved per datapoint so a later run can skip everything already processed.
        create_json_file(os.path.join(classifications_output_dir, f"{dp['response_id']}.json"), dp_observations)