In [1]:
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient

credential = DefaultAzureCredential()
client = SecretClient(vault_url="https://scienceenginea4747056219.vault.azure.net/", credential=credential)
# aoai_endpoint = client.get_secret("OpenAIAPIEndpointSCUS").value
# aoai_secret = client.get_secret("OpenAIAPIKeySCUS").value
aoai_endpoint = client.get_secret("OpenAIAPIEndpointEUS").value
aoai_secret = client.get_secret("OpenAIAPIKeyEUS").value

ModuleNotFoundError: No module named 'azure.profiles'

In [173]:
import json
import os
import time
from typing import List
from wasabi import msg
from time import sleep

import numpy as np
import requests
import pandas as pd

from urllib.request import urlopen
from bs4 import BeautifulSoup
import openai
import gzip
from transformers import GPT2TokenizerFast
from urllib.request import Request

# API keys and endpoints
BING_API_KEY = os.environ["BING_SEARCH_V7_SUBSCRIPTION_KEY"]
BING_ENDPOINT = "https://api.bing.microsoft.com/v7.0/news/search"
# openai api from azure
openai.api_type = "azure"
openai.api_base = aoai_endpoint
openai.api_version = "2022-12-01"
openai.api_key = aoai_secret
TOKENIZER = GPT2TokenizerFast.from_pretrained("gpt2")
ALLOWED_TEXT_TOKEN_LENGTH = 2500
GENERATION_TOKEN_LENGTH = 4096 - ALLOWED_TEXT_TOKEN_LENGTH

PROMPTS = {
    "topics": "Main news topic in a list: ", 
    "sentiment": "From this article, the overall sentiment is: ", 
    "entities": "Find the entities in a list: ", 
    "regulations": "From this article, is there any new rules or changes in regulations in a list: "
    }

FEW_SHOT_PROMPT_FOR_RELATIONS = {
    "demonstration": 
        """text: There is still some uncertainty that Musk - 
        also chief executive of electric car maker Tesla and rocket company SpaceX - 
        will pull off his planned buyout. \n relations: Tesla, Musk, top_members/employees; 
        Musk, Tesla, employee_of; Musk, SpaceX, employee_of; SpaceX, Musk, top_members/employees \n text: """,
    "prompt": "relations: "}

def encode_and_decode_text(text):
    encoded_text = TOKENIZER.encode(
        text=text,
        max_length=ALLOWED_TEXT_TOKEN_LENGTH,
        truncation=True,
        add_special_tokens=True,
        padding=False,
    )
    return TOKENIZER.decode(encoded_text, skip_special_tokens=False)
    
def query_bing_news(query: str, freshness: str) -> dict:
    # Construct a request
    mkt = "en-US"
    params = {"q": query, "mkt": mkt, "count": 100, "freshness": freshness}
    headers = {"Ocp-Apim-Subscription-Key": BING_API_KEY}
    
    # freshness has to be one of the 3 defined keys
    # if not, we use Week instead
    if freshness not in ["Month", "Week", "Day"]:
        raise UserWarning(f"Key 'freshness' is expected to be one of[Month, Week, Day], got '{freshness}' instead.")

    # Call the API
    try:
        response = requests.get(BING_ENDPOINT, headers=headers, params=params)
        response.raise_for_status()
        return response.json()
    except Exception as ex:
        raise ex
    
def count_words(input_string):
    words = input_string.split()
    return len(words)

def select_first_n_words(input_string, top_n_words=3500):
    words = input_string.split()
    return " ".join(words[:top_n_words])

def extract_main_text_from_webpage(url):
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    res = urlopen(req)
    content = res.read()
    # check if the site is using gzip for compression
    if res.headers["Content-Encoding"] == "gzip":
        content = gzip.decompress(content)
        
    soup = BeautifulSoup(content, features="html.parser", from_encoding="iso-8859-1")
    # print(soup)
    # remove all script and style elements
    [s.extract() for s in soup(["script", "style"])]

    # get text and join all chunks into a single string
    text = ' '.join(soup.stripped_strings)

    return text

def query_openai_api(text, prompt):
    response = openai.Completion.create(
        engine="aiarch-text-davinci-003",
        # engine="aiarch-text-davinci-002",
        prompt=f"{prompt} \n {encode_and_decode_text(text)} \n",
        temperature=0.9,
        max_tokens=GENERATION_TOKEN_LENGTH - len(TOKENIZER.encode(text=prompt,max_length=None,truncation=True,add_special_tokens=True,padding=False)) - 1,
        top_p=0.5,
        frequency_penalty=0,
        presence_penalty=0,
        best_of=1,
        stop=None)
    text = "\n".join([item for item in response["choices"][0]["text"].split("\n") if item != ""])
    return text

def query_openai_api_with_demonstration(text, prompt, demonstration):
    response = openai.Completion.create(
        engine="aiarch-text-davinci-003",
        # engine="aiarch-text-davinci-002",
        prompt=f"{demonstration} {encode_and_decode_text(text)} \n {prompt}",
        temperature=0.9,
        max_tokens=GENERATION_TOKEN_LENGTH - len(TOKENIZER.encode(text=f"{demonstration} {encode_and_decode_text(text)} \n {prompt}",max_length=None,truncation=True,add_special_tokens=True,padding=False)) - 1,
        top_p=0.5,
        frequency_penalty=0,
        presence_penalty=0,
        best_of=1,
        stop=None)
    return response["choices"][0]["text"]

In [167]:
res = query_openai_api_with_demonstration("Biden wants to drive down drug costs. Price controls won't work.", 
                                    FEW_SHOT_PROMPT_FOR_RELATIONS["prompt"], 
                                    FEW_SHOT_PROMPT_FOR_RELATIONS["demonstration"])
res

" Biden, drug costs, wants_to_lower; drug costs, price controls, won't_work"

In [174]:
def news_intelligence(topic: str, freshness: str, top_n=10):
    news = query_bing_news(topic, freshness=freshness)["value"][:top_n]
    msg.good(f"Found {len(news)} news articles for topic '{topic}'")
    # print(news)
    for i in range(len(news)):
        record = news[i]
        msg.info(f"Processing article {i+1}/{len(news)}")
        # print(record["url"])
        extracted_text = extract_main_text_from_webpage(record["url"])
        msg.info(f"Extracted text: {extracted_text[:20]}...")
        msg.info(f"Extracted {count_words(extracted_text)} words from article {i+1}/{len(news)}")
        # print(count_words(extracted_text))
        # print("extracted_text", extracted_text)
        for key, prompt in PROMPTS.items():
            msg.info(f"Querying OpenAI for '{key}'")
            try:
                record[key] = query_openai_api(extracted_text, prompt)
            except Exception as e:
                msg.fail(f"Error querying OpenAI for '{key}': {e}")
                record[key] = None
        # query the few shot example
        try:
            msg.info(f"Querying OpenAI for 'extracted_relations'")
            record["extracted_relations"] = query_openai_api_with_demonstration(extracted_text, 
                                                                                FEW_SHOT_PROMPT_FOR_RELATIONS["prompt"], 
                                                                                FEW_SHOT_PROMPT_FOR_RELATIONS["demonstration"])
        except Exception as e:
            msg.fail(f"Error querying OpenAI for 'extracted_relations': {e}")
            record["extracted_relations"] = None
        msg.good(f"Querying OpenAI successful")
    return news

In [169]:
# extract_main_text_from_webpage("https://firstwordpharma.com/story/5708013")

In [182]:
NEWS_QUERY = "pharma companies"

In [183]:
res = news_intelligence(NEWS_QUERY, freshness="Week", top_n=10)

[38;5;2m✔ Found 10 news articles for topic 'pharma companies'[0m
[38;5;4mℹ Processing article 1/10[0m
[38;5;4mℹ Extracted text: Biden wants to drive...[0m
[38;5;4mℹ Extracted 1109 words from article 1/10[0m
[38;5;4mℹ Querying OpenAI for 'topics'[0m
[38;5;4mℹ Querying OpenAI for 'sentiment'[0m
[38;5;4mℹ Querying OpenAI for 'entities'[0m
[38;5;4mℹ Querying OpenAI for 'regulations'[0m
[38;5;4mℹ Querying OpenAI for 'extracted_relations'[0m
[38;5;2m✔ Querying OpenAI successful[0m
[38;5;4mℹ Processing article 2/10[0m
[38;5;4mℹ Extracted text: Executive chairman o...[0m
[38;5;4mℹ Extracted 272 words from article 2/10[0m
[38;5;4mℹ Querying OpenAI for 'topics'[0m
[38;5;4mℹ Querying OpenAI for 'sentiment'[0m
[38;5;4mℹ Querying OpenAI for 'entities'[0m
[38;5;4mℹ Querying OpenAI for 'regulations'[0m
[38;5;4mℹ Querying OpenAI for 'extracted_relations'[0m
[38;5;2m✔ Querying OpenAI successful[0m
[38;5;4mℹ Processing article 3/10[0m
[38;5;4mℹ Extracted text: Le

In [192]:
df_res = pd.DataFrame(res)
df_res

Unnamed: 0,name,url,image,description,about,provider,datePublished,video,category,topics,sentiment,entities,regulations,extracted_relations
0,"$130,000 for medicine is outrageous. Blame the...",https://www.usatoday.com/story/opinion/columni...,{'thumbnail': {'contentUrl': 'https://www.bing...,"A lot of people, including President Joe Biden...",[{'readLink': 'https://api.bing.microsoft.com/...,"[{'_type': 'Organization', 'name': 'YAHOO!News...",2023-02-21T11:00:00.0000000Z,"{'name': 'Pfizer, French partner Valneva begin...",Health,1. Biden wants to drive down drug costs\n2. Pr...,The overall sentiment of this article is that ...,"Biden, Price Controls, Come Meet Us, Time to L...","No, there are no new rules or changes in regul...","Biden, drug_costs, wants_to_lower; Biden, pri..."
1,Executive chairman of Spanish pharma company G...,https://www.msn.com/en-gb/health/other/executi...,{'thumbnail': {'contentUrl': 'https://www.bing...,MADRID (Reuters) - Steven F. Mayer has resigne...,[{'readLink': 'https://api.bing.microsoft.com/...,"[{'_type': 'Organization', 'name': 'MSN'}]",2023-02-21T17:15:00.0000000Z,,Business,1. Executive chairman of Spanish pharma compan...,The overall sentiment of this article is neutral.,Entities: \n1. Steven F. Mayer \n2. Thomas Gla...,"No, there are no new rules or changes in regul...","Grifols, Steven F. Mayer, top_members/employe..."
2,Lexington County pharmaceutical company furlou...,https://www.postandcourier.com/columbia/busine...,{'thumbnail': {'contentUrl': 'https://www.bing...,A March 2022 inspection found issues with how ...,[{'readLink': 'https://api.bing.microsoft.com/...,"[{'_type': 'Organization', 'name': 'Post and C...",2023-02-20T23:10:00.0000000Z,,Business,1. Lexington County pharmaceutical company fur...,The overall sentiment of this article is negat...,1. Lexington County pharmaceutical company \n2...,"No, there are no new rules or changes in regul...","\nNephron Pharmaceuticals Corporation, employe..."
3,Sanders targets pharma companies over vaccine ...,https://www.msn.com/en-us/news/other/sanders-t...,{'thumbnail': {'contentUrl': 'https://www.bing...,Vermont Sen. Bernie Sanders took aim at pharma...,,"[{'_type': 'Organization', 'name': 'MSN'}]",2023-02-19T16:12:00.0000000Z,,,1. Sanders targets pharma companies over vacci...,The overall sentiment is one of criticism of p...,"Entities: \nSanders, Moderna, Stéphane Bancel,...","No, there are no new rules or changes in regul...","Sanders, Moderna, calls_for_testimony; \n ..."
4,Sales incentives worth ₹165.74 crore released ...,https://www.thehindu.com/business/sales-incent...,{'thumbnail': {'contentUrl': 'https://www.bing...,Four pharmaceutical companies have received th...,[{'readLink': 'https://api.bing.microsoft.com/...,"[{'_type': 'Organization', 'name': 'The Hindu'}]",2023-02-22T00:47:00.0000000Z,,,1. Sales incentives worth â¹165.74 crore rele...,"The overall sentiment is positive, as the arti...",Entities: \n-Dr. Reddy’s Laboratories Limited ...,"No, there are no new rules or changes in regul...","Department for Pharmaceuticals, four pharma c..."
5,Pharma Regulatory Management Systems Market Si...,https://www.marketwatch.com/press-release/phar...,,Pharma Regulatory Management Systems Market 20...,,"[{'_type': 'Organization', 'name': 'MarketWatc...",2023-02-20T23:30:00.0000000Z,,,1. Main news topic: Pharma Regulatory Manageme...,The overall sentiment of this article is posit...,Entities: \n- EXTEDO \n- NNIT \n- Instem (Sama...,"No, there are no new rules or changes in regul...",\n Pharma Regulatory Management Systems Market...
6,Pharmaceutical Equipment Market 2023: A Status...,https://www.marketwatch.com/press-release/phar...,,"Feb 21, 2023 (The Expresswire) -- ""Final Repor...",[{'readLink': 'https://api.bing.microsoft.com/...,"[{'_type': 'Organization', 'name': 'MarketWatc...",2023-02-21T08:24:00.0000000Z,,Business,Main News Topic in a List:\n1. Overview of the...,The overall sentiment of this article is posit...,"Entities: \nGE Healthcare, Siemens, Bosch, Sar...","No, there are no new rules or changes in regul...","Pharmaceutical Equipment, GE Healthcare, manu..."
7,Kobach goes after second pharmaceutical compan...,https://fox4kc.com/news/kobach-goes-after-seco...,{'thumbnail': {'contentUrl': 'https://www.bing...,Following a “significant victory for the pro-l...,[{'readLink': 'https://api.bing.microsoft.com/...,"[{'_type': 'Organization', 'name': 'WDAF-TV', ...",2023-02-21T21:26:00.0000000Z,,Health,1. Attorney General Kris Kobach puts pressure ...,The overall sentiment of this article is that ...,"Entities: \nKobach, CVS, Walgreens, Kansas, At...","No, there are no new rules or changes in regul...","\n Kobach, Walgreens, warned; Walgreens, Kobac..."
8,Analysts Offer Insights on Healthcare Companie...,https://markets.businessinsider.com/news/stock...,,There’s a lot to be optimistic about in the He...,,"[{'_type': 'Organization', 'name': 'Business I...",2023-02-21T16:30:00.0000000Z,,ScienceAndTechnology,1. Analysts Offer Insights on Healthcare Compa...,"The overall sentiment is optimistic, as analys...","Entities: Ligand Pharma (LGND), Seagen (SGEN),...","No, there are no new rules or changes in regul...","Ligand Pharma, Benchmark Co., analyst; Seagen..."
9,Pharmaceutical Company's Expansion to Frederic...,https://www.westword.com/news/agilent-expansio...,{'thumbnail': {'contentUrl': 'https://www.bing...,"Agilent Technologies chose Frederick, Colorado...",[{'readLink': 'https://api.bing.microsoft.com/...,"[{'_type': 'Organization', 'name': 'Westword',...",2023-02-20T12:00:00.0000000Z,,Business,Main News Topic:\n1. Pharmaceutical Company's ...,The overall sentiment of this article is posit...,"Entities: \nAgilent Technologies, Colorado Off...","No, there are no new rules or changes in regul...","Agilent Technologies, Brian Carothers, employ..."


In [185]:
df_res.to_csv(f"res_{NEWS_QUERY.replace(' ', '_')}.csv", index=False)

In [186]:
df_res["topics"].values.tolist()[0].split("\n")

['1. Biden wants to drive down drug costs',
 "2. Price controls won't work",
 '3. Pharmaceutical industry blamed for high drug costs',
 '4. FDA regulations drive up costs',
 '5. Hatch-Waxman Act allows drugmakers to maintain monopoly',
 '6. Medicare and Medicaid share blame for high prices',
 '7. Price controls discourage research and development',
 '8. Price controls have unintended consequences',
 '9. Price controls are popular but not effective',
 '10. Government intervention into free market not the answer']

In [187]:
df_res["extracted_relations"].values.tolist()[0].split("\n")

[' Biden, drug_costs, wants_to_lower; Biden, price_controls, does_not_support; FDA, drug_costs, increases; FDA, generic_drugs, approval_process; Hatch-Waxman_Act, drug_costs, increases; Medicare, drug_costs, increases; Medicaid, drug_costs, increases; Pharmaceutical_Industry, drug_costs, increases; government, drug_costs, increases; Johnson_&_Johnson, Prezista, 167_protections; Gilead, Truvada, 120_protections; Gilead, Viread, 118_protections; Biden, price_controls, supports; market, drug_costs, balances_supply_and_demand; Venezuela, price_controls, failure; India, price_controls, failure; US, pharmaceutical_innovation, leads; Canada, pharmaceutical_innovation, low_rank; health_care_industry, regulations, most_regulated; television_set_industry, regulations, not_overburdened; Biden, price_controls, popularity; market, investments, encourages; market, product_quality, inferior; market, illegal_markets, encourages; market, shortages, encourages; market, rationing, encourages; market, inn

In [188]:
df_res["entities"].values.tolist()[0].split("\n")

['Biden, Price Controls, Come Meet Us, Time to Legalize Weed?, U.S., Sports, Entertainment, Life, Money, Tech, Travel, Opinion, ONLY AT USA TODAY, Newsletters, For Subscribers, From the Archives, Support Local, Crossword, eNewspaper, Magazines, Investigations, Weather Forecast, Podcasts, Video, Humankind, Just Curious, Pet Health, Reviewed, Coupons, Pharmaceutical Industry, $130,000, medicine, government, pharmaceutical companies, President Biden, free market, FDA, Food and Drug Administration, generic drugs, Hatch-Waxman Act, patent, intellectual property, Chris Schlak, USA TODAY, family, multiple sclerosis, prescriptions, MS LifeLines, charity, Rebif, United States, corporate greed, State of the Union address, government intervention, tariffs, Johnson & Johnson, Janssen Global HIV, Center for Innovation, University of California College of the Law, San Francisco, Gilead, Truvada, Viread, Medicare, Medicaid, monopoly, pill coating, evergreening, Bureaucrats, pain, patients, Richard Ow

In [189]:
def graph_aggregation(graphs):
    res = query_openai_api("\n".join([g for g in graphs]), prompt="extract the relations from this:")
    return res

In [190]:
# graph_str = graph_aggregation(df_res["extracted_relations"])

In [191]:
# graph_str.split("\n")