# Summarise Documents

## Set up Azure OpenAI

In [3]:
import os
import openai
from dotenv import load_dotenv

# Set up Azure OpenAI
# Load the endpoint and key from credentials.env into the process environment.
load_dotenv("credentials.env")

# os.getenv returns None for an unset variable; stop here with a clear message
# instead of handing None to the openai module.
_missing = [
    name
    for name in ("AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_API_KEY")
    if not os.getenv(name)
]
if _missing:
    raise EnvironmentError(
        "Missing Azure OpenAI settings: "
        + ", ".join(_missing)
        + ". Define them in credentials.env or in the environment."
    )

openai.api_type = "azure"
# Api base is the 'Endpoint' which can be found in Azure Portal where Azure OpenAI is created.
# It looks like https://xxxxxx.openai.azure.com/
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = "2023-03-15-preview"
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")

In [None]:
"""from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient
from azureml.core import Workspace

try:
    credential = DefaultAzureCredential()
    # Check if given credential can get token successfully.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work
    credential = InteractiveBrowserCredential()

try:
    ml_client = MLClient.from_config(credential=credential, path="workspace.json")
except Exception as ex:
    raise Exception(
        "Failed to create MLClient from config file. Please modify and then run the above cell with your AzureML Workspace details."
    ) from ex
    # ml_client = MLClient(
    #     credential=credential,
    #     subscription_id="",
    #     resource_group_name="",
    #     workspace_name=""
    # )

ws = Workspace(
    subscription_id=ml_client.subscription_id,
    resource_group=ml_client.resource_group_name,
    workspace_name=ml_client.workspace_name,
)
print(ml_client)

keyvault = ws.get_default_keyvault()
aoai_endpoint=keyvault.get_secret(name="aoai-endpoint")
aoai_key=keyvault.get_secret(name="key")"""

In [2]:
"""import openai
# Set up Azure OpenAI
openai.api_type = "azure"
openai.api_base = aoai_endpoint # Api base is the 'Endpoint' which can be found in Azure Portal where Azure OpenAI is created. It looks like https://xxxxxx.openai.azure.com/
openai.api_version = "2023-03-15-preview"
openai.api_key = aoai_key"""

## Load Data

In [4]:
import pandas as pd

# Read the tab-separated BBC news file; index_col=False is passed through unchanged.
df_orig = pd.read_csv(
    "data/bbc-news-data.csv",
    sep='\t',
    index_col=False,
)

In [5]:
# Work on a deep copy so the frame read from disk (df_orig) is left as loaded.
df = df_orig.copy(deep=True)
df

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
...,...,...,...,...
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...
2223,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...


## Create prompt

In [8]:
# Text appended after the article: blank lines followed by a "Tl;dr" cue.
prompt_postfix = " \n  \n\nTl;dr\n"

# Prompt for the first article (index label 0): title, newline, content, postfix.
first_row = df.loc[0]
prompt = first_row['title'] + "\n" + first_row['content'] + prompt_postfix
prompt

'Ad sales boost Time Warner profit\n Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.  The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.  Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up 

## Request to API

**MODEL= GPT3.5 Turbo**

In [9]:
# Request API: collect the request settings first, then send them in one call.
completion_args = dict(
  deployment_id="gpt-35-turbo",
  prompt=prompt,
  temperature=1,
  max_tokens=1000,
  top_p=1.0,
  frequency_penalty=0.0,
  presence_penalty=1,
)
response = openai.Completion.create(**completion_args)

# Display the text of the first returned choice.
first_choice = response['choices'][0]
first_choice['text']

'Time Warner\'s 4Q net income up 76% at $1.13bn beating estimates due to one-time gains and strong growth in online advertising revenue. Bullish outlook for 2005.\n \n(Source: BBC)\n\nHowever, better than expected earnings from Google sent after-hours trading soaring 5% higher.\n\n\nProject2016Energy: GooG ticker\n\nConfirmed ***fucking side ways***\n\nBilly_Retin_N: Yeah man! I could only make out the timestamp lmao. \n\nProject2016Energy: My screen looked like crap using "Live Discussion" sorting so I had to change it. My fault\n\nFolkFreendsine: Switch to best\n\nProject2016Energy: Hell yeah ima much happier now\nTho on mobile it still locked me on Live Discussion -.-\n\nFolkFreendsine: Brave or chrome?\n\nProject2016Energy: Alien Blue Pro\n\nLetJSql: ALIEN BLUE FTW!\n\nmoan2cdeeo: What??? Alien blue was discontinued over a year ago\n\nexeiiakes: Reddit bought alien blue and renamed it reddit. At least mine is im on iOS idk about Android. I know they dropped support for people that 

**MODEL= GPT4**

In [10]:
# Helper that sends a list of chat messages to a chat-completion deployment.
# More info : https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/chatgpt?pivots=programming-language-chat-completions
def send_message(messages, model_name, max_response_tokens=500):
    """Request one chat completion and return the text of the reply.

    Args:
        messages: Conversation so far; forwarded unchanged as ``messages``.
        model_name: Forwarded as the ``engine`` argument of the request.
        max_response_tokens: Forwarded as ``max_tokens`` (default 500).

    Returns:
        The ``content`` field of the message in the first returned choice.
    """
    request = dict(
        engine=model_name,
        messages=messages,
        temperature=0.5,
        max_tokens=max_response_tokens,
        top_p=0.9,
        frequency_penalty=0,
        presence_penalty=0,
    )
    completion = openai.ChatCompletion.create(**request)
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

# Helper that prints a conversation in a readable format
def print_conversation(messages):
    """Print every message as an upper-cased ``[ROLE]`` header, its content, and a blank line."""
    for turn in messages:
        header = f"[{turn['role'].upper()}]"
        # The empty third item yields the blank separator line after the content.
        print(header, turn['content'], "", sep="\n")


In [11]:
# System prompt: the base instruction with surrounding whitespace removed.
base_system_message = "You are a helpful assistant."
system_message = base_system_message.strip()

# First user message sent to the model: the summarisation prompt built earlier. Feel free to update this.
user_message = prompt

# Upper bound on the length of the model's reply, in tokens.
max_response_tokens = 1000

# Conversation so far: one system message followed by the user message.
system_turn = {"role": "system", "content": system_message}
user_turn = {"role": "user", "name": "example_user", "content": user_message}
messages = [system_turn, user_turn]

# Query the "gpt-4-32k" deployment and store its reply as the assistant message.
response = send_message(messages, "gpt-4-32k", max_response_tokens)
assistant_turn = {"role": "assistant", "content": response}
messages.append(assistant_turn)

print_conversation(messages)

[SYSTEM]
You are a helpful assistant.

[USER]
Ad sales boost Time Warner profit
 Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.  The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.  Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner 

In [18]:
# Third entry of the conversation: the assistant reply appended after the system and user messages.
messages[2]['content']

"TimeWarner's quarterly profits have risen 76% to $1.13bn, largely due to increased sales of high-speed internet connections and higher advertising sales. The company also benefited from owning 8% of Google. However, its AOL division lost 464,000 subscribers in Q4, and its film division saw profits drop 27% due to box-office failures. Overall, TimeWarner's full-year profit was $3.36bn, up 27% from 2003, and revenue grew 6.4% to $42.09bn. The company expects 5% growth in operating earnings in 2005. TimeWarner is also restating its 2000 and 2003 accounts due to a US Securities Exchange Commission probe and has set aside $500m for potential legal charges."

### Putting the code together

In [19]:
# Number of articles to summarise in this run.
N_SUMMARIES = 10

# One 'summary' slot per article; rows that are not processed (or that fail) stay empty.
results = pd.DataFrame(columns=['summary'], index=df.index)

# prompt postfix
prompt_postfix = " \n  \n\nTl;dr\n"

# Walk the first N_SUMMARIES rows together with their own index labels, so the
# label used for writing below always matches the row that produced the prompt.
for idx, title, content in df[['title', 'content']].head(N_SUMMARIES).itertuples(name=None):

  # build prompt
  prompt = title + "\n" + content + prompt_postfix

  try:
    # Request API
    response = openai.Completion.create(
      deployment_id="gpt-35-turbo", # has to be deployment_id
      prompt=prompt,
      temperature=0,
      max_tokens=100,
      top_p=1.0,
      frequency_penalty=0.0,
      presence_penalty=1
    )

    # Store the response with a single .loc call: the chained form
    # results['summary'].loc[idx] = ... may assign into a temporary object.
    results.loc[idx, 'summary'] = response['choices'][0]['text']
  except Exception as err:
    # Best effort: report which row failed and continue with the next article.
    print(f"Row {idx}: unexpected {err=}, {type(err)=}")

### Results

In [20]:
# Preview the first ten rows of the summaries frame.
results.head(10)

Unnamed: 0,summary
0,Time Warner's Q4 2004 profits rose 76% to $1.1...
1,The dollar has hit its highest level against t...
2,Yukos' owners are asking Rosneft to repay a $9...
3,British Airways blames high fuel prices for a ...
4,Pernod Ricard is considering a takeover of All...
5,"Japan's economy grew by 0.1% in Q3, narrowly a..."
6,US created fewer jobs than expected in January...
7,"India's finance minister, Palaniappan Chidamba..."
8,Ethiopia produced 14.27 million tonnes of crop...
9,The US government's claim against tobacco comp...


### Adding results to dataframe

In [21]:
# Place the first ten articles next to their summaries, aligned on the index.
df_results = pd.concat([df.head(10), results.head(10)], axis=1)

# Only the last expression of a cell is displayed, so a bare `df_results.shape`
# here would be evaluated and thrown away; print it to make the check visible.
print(df_results.shape)
df_results

Unnamed: 0,category,filename,title,content,summary
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,Time Warner's Q4 2004 profits rose 76% to $1.1...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,The dollar has hit its highest level against t...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,Yukos' owners are asking Rosneft to repay a $9...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,British Airways blames high fuel prices for a ...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,Pernod Ricard is considering a takeover of All...
5,business,006.txt,Japan narrowly escapes recession,Japan's economy teetered on the brink of a te...,"Japan's economy grew by 0.1% in Q3, narrowly a..."
6,business,007.txt,Jobs growth still slow in the US,The US created fewer jobs than expected in Ja...,US created fewer jobs than expected in January...
7,business,008.txt,India calls for fair trade rules,"India, which attends the G7 meeting of seven ...","India's finance minister, Palaniappan Chidamba..."
8,business,009.txt,Ethiopia's crop production up 24%,Ethiopia produced 14.27 million tonnes of cro...,Ethiopia produced 14.27 million tonnes of crop...
9,business,010.txt,Court rejects $280bn tobacco case,A US government claim accusing the country's ...,The US government's claim against tobacco comp...


## Save results

In [22]:
fname = 'output/summaries.csv'

# to_csv does not create missing folders, so make sure the target directory exists first.
os.makedirs(os.path.dirname(fname), exist_ok=True)

# Write the articles and their summaries as a tab-separated file.
df_results.to_csv(fname, sep='\t')