# Extract Key Information

## Set up Azure OpenAI

In [1]:
import os
import openai
from dotenv import load_dotenv

# Set up Azure OpenAI
# Read AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_API_KEY from the local credentials.env file
# (variables already present in the process environment are picked up by os.getenv too).
load_dotenv("credentials.env")

azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_api_key = os.getenv("AZURE_OPENAI_API_KEY")

# Fail fast: os.getenv returns None for unset variables, which would otherwise only
# surface later as an opaque error on the first API call.
missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("AZURE_OPENAI_ENDPOINT", azure_endpoint),
        ("AZURE_OPENAI_API_KEY", azure_api_key),
    )
    if not setting_value
]
if missing_settings:
    raise RuntimeError(
        "Missing Azure OpenAI settings: " + ", ".join(missing_settings)
        + ". Define them in credentials.env or in the environment."
    )

openai.api_type = "azure"
# Api base is the 'Endpoint' which can be found in Azure Portal where Azure OpenAI is created.
# It looks like https://xxxxxx.openai.azure.com/
openai.api_base = azure_endpoint
openai.api_version = "2023-03-15-preview"
openai.api_key = azure_api_key

In [1]:
# Disabled alternative setup: the whole cell is a bare string literal, so none of it runs.
# The text sketches authenticating with azure.identity, building an MLClient from
# workspace.json, and reading the "aoai-endpoint" and "key" secrets from the workspace's
# default Key Vault into aoai_endpoint / aoai_key.
"""from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient
from azureml.core import Workspace

try:
    credential = DefaultAzureCredential()
    # Check if given credential can get token successfully.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work
    credential = InteractiveBrowserCredential()

try:
    ml_client = MLClient.from_config(credential=credential, path="workspace.json")
except Exception as ex:
    raise Exception(
        "Failed to create MLClient from config file. Please modify and then run the above cell with your AzureML Workspace details."
    ) from ex
    # ml_client = MLClient(
    #     credential=credential,
    #     subscription_id="",
    #     resource_group_name="",
    #     workspace_name=""
    # )

ws = Workspace(
    subscription_id=ml_client.subscription_id,
    resource_group=ml_client.resource_group_name,
    workspace_name=ml_client.workspace_name,
)
print(ml_client)

keyvault = ws.get_default_keyvault()
aoai_endpoint=keyvault.get_secret(name="aoai-endpoint")
aoai_key=keyvault.get_secret(name="key")"""

Found the config file in: workspace.json


MLClient(credential=<azure.identity._credentials.default.DefaultAzureCredential object at 0x7f87be6f89a0>,
         subscription_id=fe38c376-b42a-4741-9e7c-f5d7c31e5873,
         resource_group_name=yelizkilinc-rg,
         workspace_name=aml-prod)


In [2]:
# Disabled companion to the Key Vault setup: also a bare string literal, so it never runs.
# The text would point openai at aoai_endpoint / aoai_key, names that are only assigned
# inside the other disabled (string-literal) cell.
"""import openai
# Set up Azure OpenAI
openai.api_type = "azure"
openai.api_base = aoai_endpoint # Api base is the 'Endpoint' which can be found in Azure Portal where Azure OpenAI is created. It looks like https://xxxxxx.openai.azure.com/
openai.api_version = "2023-03-15-preview"
openai.api_key = aoai_key"""

## Load Data

In [2]:
import pandas as pd

# Load the BBC news dataset; the file is tab-separated despite its .csv extension.
df_orig = pd.read_csv("data/bbc-news-data.csv", sep="\t", index_col=False)

In [4]:
# Work on a copy so the frame loaded from disk (df_orig) stays untouched.
df = df_orig.copy(deep=True)
df

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
...,...,...,...,...
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...
2223,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...


## Create Prompt

In [10]:
# Instruction placed ahead of the article text.
prompt_prefix = " \n  Extract keywords from the given context.\n"

# Use the first article (title + body) as the context, wrapped in "###" markers.
first_title = df['title'].loc[0]
first_content = df['content'].loc[0]
prompt = "".join(
    [prompt_prefix, "### Context: ", "\n", first_title, "\n", first_content, "###"]
)
prompt

' \n  Extract keywords from the given context.\n### Context: \nAd sales boost Time Warner profit\n Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.  The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.  Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service 

## Request to API

**GPT 3.5 Turbo**

In [11]:
# Parameters for the single-prompt completion request.
completion_params = dict(
    deployment_id="gpt-35-turbo",  # has to be deployment_id
    prompt=prompt,
    temperature=0,
    max_tokens=100,
    top_p=1.0,
    frequency_penalty=0.0,
    presence_penalty=0,
)
response = openai.Completion.create(**completion_params)

# Show the generated text of the first choice.
first_choice = response['choices'][0]
first_choice['text']

' Keywords: \nTimeWarner, profit, AOL, Google, SEC, internet\n\n### Solution:\nTimeWarner, profit, AOL, Google, SEC, internet\n\n### Explanation:\nThe given context is about the quarterly profits of TimeWarner. The company has benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which'

**GPT 4**

In [13]:
# Chat completion helper
def send_message(messages, model_name, max_response_tokens=500):
    """Send a chat conversation to the model and return the reply text.

    Args:
        messages: list of chat message dicts forwarded unchanged to the API.
        model_name: value passed as the ``engine`` argument of the request.
        max_response_tokens: value passed as ``max_tokens`` (default 500).

    Returns:
        The ``content`` of the message in the first returned choice.
    """
    request_args = dict(
        engine=model_name,
        messages=messages,
        temperature=0.5,
        max_tokens=max_response_tokens,
        top_p=0.9,
        frequency_penalty=0,
        presence_penalty=0,
    )
    completion = openai.ChatCompletion.create(**request_args)
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

# Pretty-printer for a chat transcript
def print_conversation(messages):
    """Print every message as an upper-cased ``[ROLE]`` line, its content, then a blank line."""
    for entry in messages:
        role_header = "[" + entry['role'].upper() + "]"
        print(role_header)
        # end="\n\n" terminates the content line and adds the blank separator line
        print(entry['content'], end="\n\n")


In [14]:
# System prompt, stripped of surrounding whitespace before use.
base_system_message = "You are a helpful assistant."
system_message = base_system_message.strip()

# First user turn: reuse the keyword-extraction prompt. Feel free to update this.
user_message = prompt

# Conversation sent to the model: one system turn followed by one user turn.
system_turn = {"role": "system", "content": system_message}
user_turn = {"role": "user", "name": "example_user", "content": user_message}
messages = [system_turn, user_turn]

max_response_tokens = 1000

response = send_message(messages, "gpt-4-32k", max_response_tokens)
# Record the reply so the printed transcript includes the assistant turn.
messages += [{"role": "assistant", "content": response}]

print_conversation(messages)

[SYSTEM]
You are a helpful assistant.

[USER]
 
  Extract keywords from the given context.
### Context: 
Ad sales boost Time Warner profit
 Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.  The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.  Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase sub

## Putting the Code Together

In [15]:
# Collect one keyword string per article; rows that are not processed (or fail) stay NaN.
colname = 'keywords'
results = pd.DataFrame(columns=[colname], index=df.index)

prompt_prefix = " \n  Extract key words from this text\n"

n_rows = 10  # number of leading articles sent to the API

# Pair each article with its real index label so the write below always hits the matching row.
for idx, title, content in zip(df.index[:n_rows], df['title'], df['content']):

    # build prompt
    prompt = prompt_prefix + title + "\n" + content

    try:
        # Request API
        response = openai.Completion.create(
            deployment_id="gpt-35-turbo",  # has to be deployment_id
            prompt=prompt,
            temperature=1,
            max_tokens=100,
            top_p=1.0,
            frequency_penalty=0.0,
            presence_penalty=1,
        )

        # Single .loc call: chained `results[colname].loc[idx] = ...` may write to a temporary copy.
        results.loc[idx, colname] = response['choices'][0]['text']
    except Exception as err:
        # Best effort: report which row failed and continue with the next article.
        print(f"Row {idx}: unexpected {err=}, {type(err)=}")

## Results

In [None]:
# Preview the first 10 rows of the results frame.
results.iloc[:10]

## Add Results to DataFrame

In [10]:
# Put the first 10 articles side by side with their keywords (concat aligns on the index).
df_results = pd.concat([df.head(10), results.head(10)], axis=1)
df_results

Unnamed: 0,category,filename,title,content,keywords
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,TimeWarner's fourth quarter profits were driv...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,"Agencies \ndollar , Greenspan , euro , US , d..."
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,Rosneft recently announced plans for a multi-...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,Vocabulary:\n * pre-tax=befor taxes \n * ma...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,"""The time is ripe for further consolidation a..."
5,business,006.txt,Japan narrowly escapes recession,Japan's economy teetered on the brink of a te...,Forecasters predict that the central bank's k...
6,business,007.txt,Jobs growth still slow in the US,The US created fewer jobs than expected in Ja...,Average hourly earnings rose by 0.2% in Janua...
7,business,008.txt,India calls for fair trade rules,"India, which attends the G7 meeting of seven ...","Key Words - India, finance minister, G7 meeti..."
8,business,009.txt,Ethiopia's crop production up 24%,Ethiopia produced 14.27 million tonnes of cro...,"Source : BBC Monitoring\n\nethiopia, crop pro..."
9,business,010.txt,Court rejects $280bn tobacco case,A US government claim accusing the country's ...,"However, some members of Congress are already..."


## Save Results

In [11]:
# Write the combined frame as a tab-separated file.
fname = 'output/keywords.csv'

# to_csv does not create missing folders, so make sure the target directory exists first.
output_dir = os.path.dirname(fname)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df_results.to_csv(fname, sep='\t')