# Classify Documents

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to local .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

## Load Data
The next cell will load embeddings generated in notebook [01-get-embeddings.ipynb](./01-get-embeddings.ipynb).

In [28]:
import pandas as pd

# Read the tab-separated file produced by the embeddings notebook.
# index_col=False keeps the first column as ordinary data rather than using it
# as the row index.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests the
# file was written together with its index — confirm whether index_col=0 is wanted.
df_orig = pd.read_csv(
    "bbc-news-data-embedding.csv",
    sep='\t',
    index_col=False,
)

In [29]:
# Work on a separate (deep) copy so df_orig keeps the data exactly as loaded
df = df_orig.copy(deep=True)
df

Unnamed: 0,category,filename,title,content,embedding
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,"[-0.02130456641316414, -0.01682969368994236, -..."
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,"[-0.024546362459659576, -0.013037018477916718,..."
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,"[-0.021691035479307175, -0.03697184473276138, ..."
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,"[-0.021805426105856895, -0.016833839938044548,..."
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,"[-0.008381073363125324, -0.008448663167655468,..."
...,...,...,...,...,...
2241,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...,
2242,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...,
2243,tech,399.txt,Be careful how you code,A new European directive could put software w...,
2244,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...,


In [30]:
# Discard, in place, every row that contains at least one NaN, then report how
# many rows are left.
df.dropna(axis="index", how="any", inplace=True)
df.shape[0]

20

## Deploy a Model

In [21]:
import os
from azure.identity import DefaultAzureCredential
from azure.mgmt.cognitiveservices import CognitiveServicesManagementClient

# Service-principal settings that DefaultAzureCredential can read from the
# environment. A blank value here leaves whatever is already exported untouched
# (previously blank placeholders overwrote existing settings); a non-blank value
# overrides the environment. Do not commit real secrets in this cell.
SERVICE_PRINCIPAL_ENV = {
    'AZURE_CLIENT_ID': "",
    'AZURE_TENANT_ID': "",
    'AZURE_CLIENT_SECRET': "",
}
for env_name, env_value in SERVICE_PRINCIPAL_ENV.items():
    if env_value:
        os.environ[env_name] = env_value

# Names shared by the create and list calls below.
RESOURCE_GROUP = "cloud-shell-storage-eastus"
ACCOUNT_NAME = "tsisodia-openai"
DEPLOYMENT_NAME = "tsisodia-doc-classify"

deployment_id = None
client = CognitiveServicesManagementClient(
        credential=DefaultAzureCredential(),
        # Taken from AZURE_SUBSCRIPTION_ID when set; otherwise the original blank placeholder.
        subscription_id=os.environ.get("AZURE_SUBSCRIPTION_ID", ""),
    )

# Create (or update) the model deployment. begin_create_or_update starts a
# long-running operation and returns a poller; result() blocks until the
# operation has finished and raises if it failed, so the listing below is not
# taken while the deployment is still in progress.
poller = client.deployments.begin_create_or_update(
        resource_group_name=RESOURCE_GROUP,
        account_name=ACCOUNT_NAME,
        deployment_name=DEPLOYMENT_NAME,
        deployment={
            "properties": {"model": {"format": "OpenAI", "name": "gpt-35-turbo", "version": "0301"}},
            "sku": {"capacity": 2, "name": "Standard"},
        },
    )
created_deployment = poller.result()

# List every deployment on the account to confirm the new one is present.
response = client.deployments.list(
        resource_group_name=RESOURCE_GROUP,
        account_name=ACCOUNT_NAME,
    )
for item in response:
    print(item)

{'additional_properties': {}, 'id': '/subscriptions/99d7836d-942d-4759-80b6-2ed3b80b49e9/resourceGroups/cloud-shell-storage-eastus/providers/Microsoft.CognitiveServices/accounts/tsisodia-openai/deployments/tsisodia-gpt4o', 'name': 'tsisodia-gpt4o', 'type': 'Microsoft.CognitiveServices/accounts/deployments', 'sku': <azure.mgmt.cognitiveservices.models._models_py3.Sku object at 0x7fb753ec1220>, 'system_data': <azure.mgmt.cognitiveservices.models._models_py3.SystemData object at 0x7fb753ec13d0>, 'etag': '"f0fc90f3-1a6b-4d58-924c-041e474b364f"', 'properties': <azure.mgmt.cognitiveservices.models._models_py3.DeploymentProperties object at 0x7fb753ec16d0>}
{'additional_properties': {}, 'id': '/subscriptions/99d7836d-942d-4759-80b6-2ed3b80b49e9/resourceGroups/cloud-shell-storage-eastus/providers/Microsoft.CognitiveServices/accounts/tsisodia-openai/deployments/tsisodia-gpt', 'name': 'tsisodia-gpt', 'type': 'Microsoft.CognitiveServices/accounts/deployments', 'sku': <azure.mgmt.cognitiveservices

## Classify documents with their embeddings
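Reference: [Classification using embeddings](https://github.com/openai/openai-cookbook/blob/main/examples/Classification_using_embeddings.ipynb) (OpenAI Cookbook). Note that the cell below classifies each document by sending its `content` text in a prompt to the deployed model; the `embedding` column is not used here.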

In [31]:
import os

from azure.identity import DefaultAzureCredential
from azure.mgmt.cognitiveservices import CognitiveServicesManagementClient
from openai import AzureOpenAI

# Azure OpenAI client used by classify_document below.
# The key and endpoint are read from the environment so that no secret has to be
# typed into (and saved with) the notebook; when the variables are not set, the
# original placeholder values are used.
client = AzureOpenAI(
  api_key = os.environ.get("AZURE_OPENAI_API_KEY", ""),
  api_version = "2023-05-15",
  azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT", "https://<name>.openai.azure.com/")
)

# Function to classify a document using Azure OpenAI
def classify_document(
    document_text,
    categories=("BUSINESS", "ENTERTAINMENT", "POLITICS"),
    model='tsisodia-doc-classify',
):
    """Ask the deployed completion model which category a document belongs to.

    Uses the module-level ``client`` to send the request.

    Args:
        document_text: Text of the document to classify.
        categories: Category names offered to the model in the prompt. Defaults
            to the three names used by the original prompt.
            NOTE(review): the dataset preview also shows a ``tech`` category that
            is not in this default list — confirm the intended label set.
        model: Name of the deployment passed as ``model`` to the completions call.

    Returns:
        The first line of the model's completion with surrounding whitespace
        removed, or an empty string when the response carries no choice or no text.
    """
    # Constructing a prompt for the model
    prompt = (
        f"Classify the following document into one of these categories: "
        f"[{', '.join(categories)}]:\n\nDocument: {document_text}\n\nClassification:"
    )

    # Sending the prompt to Azure OpenAI; temperature=0 keeps the label as
    # repeatable as possible when the notebook is re-run
    response = client.completions.create(model=model, prompt=prompt, temperature=0)

    # Extract the classification from the response
    if not response.choices:
        return ""
    completion_text = (response.choices[0].text or "").strip()
    if not completion_text:
        return ""
    # The model may keep generating after the label; keep only the first line
    return completion_text.splitlines()[0].strip()

# Label every document: pass each row's content through classify_document and
# store the returned label in a new predicted_label column
df['predicted_label'] = df['content'].map(classify_document)

## Save Results

In [None]:
# Persist the labelled frame as CSV, leaving the row index out of the file
df.to_csv(path_or_buf='classified_documents.csv', index=False)