In [None]:
# File name of the input CSV. Later cells look it up under
# <notebooks_path>/input/ and reuse its base name (extension stripped) as the
# name of the output folder.
# NOTE(review): left empty here — presumably supplied per run before executing
# the notebook (e.g. by a parameterizing runner); confirm.
csv_fname: str = ""

In [2]:
import os
# Absolute path four directory levels above the current working directory;
# later cells use it as the base for the configuration/ and src/ paths.
wdir = os.path.abspath(os.path.join("..", "..", "..", ".."))

In [3]:
import json
from typing import Any

from dotenv import dotenv_values
import pandas as pd

In [4]:
# Storage Helpers
class CustomEncoder(json.JSONEncoder):
    """JSON encoder for objects that expose a dict-style conversion method.

    When the standard encoder meets an object it cannot serialize, this class
    tries, in order, the object's `to_dict`, `as_dict` and `model_dump`
    methods and encodes whatever the first available one returns.
    """

    def default(self, obj):
        """Return a serializable stand-in for `obj`.

        Falls back to the base implementation (which raises TypeError) when
        `obj` offers none of the supported conversion methods.
        """
        # Order matters: the first method present on the object wins.
        for converter_name in ('to_dict', 'as_dict', 'model_dump'):
            if hasattr(obj, converter_name):
                return getattr(obj, converter_name)()
        return super().default(obj)


def create_directory(dir: str, clear_if_not_empty: bool = False) -> str:
    """Make sure `dir` exists and optionally empty it of files.

    Args:
        dir: Directory path to create (missing parents are created too).
        clear_if_not_empty: When True, delete the regular files located
            directly inside `dir`. Subdirectories and their contents are
            left untouched.

    Returns:
        The same `dir` value that was passed in.
    """
    os.makedirs(dir, exist_ok=True)

    if not clear_if_not_empty:
        return dir

    # Lazily walk the direct children so each path is checked right before
    # it is removed; anything that is not a regular file is skipped.
    child_paths = (os.path.join(dir, entry) for entry in os.listdir(dir))
    for child_path in filter(os.path.isfile, child_paths):
        os.remove(child_path)

    return dir


def create_json_file(fpath: str, data: Any, indent: int = 4) -> None:
    """Serialize `data` as JSON into `fpath`, creating the parent directory if needed.

    Args:
        fpath: Destination file path. A bare file name (no directory part)
            is written relative to the current working directory.
        data: Object to serialize. Values the standard JSON encoder cannot
            handle are passed through CustomEncoder.
        indent: Indentation width forwarded to json.dump.
    """
    parent_dir = os.path.dirname(fpath)

    # dirname() is "" for a bare file name; os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one to create.
    if parent_dir and not os.path.exists(parent_dir):
        create_directory(parent_dir)

    # Explicit encoding keeps the output independent of the platform default.
    with open(fpath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=indent, cls=CustomEncoder)

In [5]:
# Settings loaded from <wdir>/configuration/.env.
env_vars = dotenv_values(f"{wdir}/configuration/.env")

# Base folder of these notebooks; the "input" and "output" folders live beneath it.
notebooks_path = os.path.join(wdir, "src/console/notebooks/cic/")

# The CSV to process is looked up in the "input" folder.
csv_fpath = os.path.join(notebooks_path, "input", csv_fname)

# Results are written to an "output" subfolder named after the CSV file
# without its extension.
csv_stem, _csv_ext = os.path.splitext(os.path.basename(csv_fname))
output_dir = os.path.join(notebooks_path, "output", csv_stem)

# credential = DefaultAzureCredential(
#     exclude_workload_identity_credential=True,
#     exclude_developer_cli_credential=True,
#     exclude_environment_credential=True,
#     exclude_managed_identity_credential=True,
#     exclude_powershell_credential=True,
#     exclude_shared_token_cache_credential=True,
#     exclude_interactive_browser_credential=True
# )

# openai_token_provider = get_bearer_token_provider(credential, 'https://cognitiveservices.azure.com/.default')

# openai_client = AzureOpenAI(
#     azure_endpoint=env_vars["AOAI_ENDPOINT"],
#     azure_ad_token_provider=openai_token_provider,
#     api_version="2024-12-01-preview"
# )

In [6]:
def _as_text(value) -> str:
    """Return a cell value as text, mapping missing values (None/NaN) to ""."""
    return "" if pd.isna(value) else str(value)


def _build_record(row) -> dict:
    """Turn one CSV row into a {response_id, attribution, verbatim} record.

    `row` is anything indexable by column name (e.g. the Series yielded by
    DataFrame.iterrows). Attribution parts are joined with ", ".
    """
    # Empty CSV cells arrive as NaN (a float); coerce to text before calling
    # string methods or joining, otherwise a single blank cell aborts the load.
    q005 = _as_text(row["Q005"])
    is_other = q005.startswith("Other")

    attribution = [
        _as_text(row["BrandAssigned"]),
        _as_text(row["SAM11"]),
        _as_text(row["TAXALN"]),
        # Any answer starting with "Other" is collapsed to "Other"; its
        # free-text companion column fills the next slot.
        "Other" if is_other else q005,
        _as_text(row["Q005_996_TEXT"]) if is_other else "",
        _as_text(row["Q009_"]),
        _as_text(row["Cloud_Usage"]),
        _as_text(row["Q082"]),
        _as_text(row["Q048b"]),
        # Flag slots: left empty only when the cell holds one of the listed markers.
        "" if row["Q089a_2"] == "#NULL!" else "ISV",
        "" if row["Q102a"] in ("No", "#NULL!") else "Startup",
    ]

    return {
        "response_id": row["ResponseId"],
        "attribution": ", ".join(attribution),
        "verbatim": row["Q024b"],
    }


data = []

try:
    # Read in chunks of 500 rows; the context manager closes the underlying
    # file even if processing fails partway through.
    with pd.read_csv(csv_fpath, chunksize=500) as chunk_reader:
        for chunk in chunk_reader:
            for _, row in chunk.iterrows():
                data.append(_build_record(row))
except Exception as e:
    # Best-effort: whatever was collected is still written below, but say so
    # explicitly so a truncated data.json is not mistaken for a full export.
    print(f"Error: {e}")
    print(f"Warning: processing stopped after {len(data)} rows; data.json will be incomplete.")

create_json_file(os.path.join(output_dir, "data.json"), data)