# Install packages

In [None]:
# Install the LangChain packages and the FAISS backend that the cells below import.
# NOTE(review): no versions are pinned, so a fresh environment may resolve to
# releases whose import paths differ from the ones used here — consider pinning.
%pip install langchain
%pip install langchain_openai
%pip install langchain-core
%pip install langchain_community
%pip install faiss-gpu
%pip install --quiet langchain_experimental

# Set up LLM & Embedding Model

In [None]:
import json
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings

# Read the Azure OpenAI endpoint and API key from a local secrets file so that
# no credentials are hardcoded in the notebook.
with open("../secrets/openai_secrets.json", "r") as secrets_file:
  openai_secrets = json.load(secrets_file)

# Connection settings shared by the chat model and the embedding model.
azure_settings = dict(
    azure_endpoint=openai_secrets["openai_api_base"],
    openai_api_version="2023-03-15-preview",
    openai_api_key=openai_secrets["openai_api_key"],
    openai_api_type="azure",
)

# Chat model used for the credit-risk assessment.
llm = AzureChatOpenAI(
    deployment_name="gpt-4-32k",
    temperature=0,
    **azure_settings,
)

# Embedding model used for the vector stores and the semantic chunker.
embedding_model = AzureOpenAIEmbeddings(
    chunk_size=1,
    deployment="text-embedding-ada-002",
    **azure_settings,
)

# Convert articles to document objects

In [1]:
# Path to the article JSON read by the loading and company-counting cells below.
input_file_path = '../data/task2_json/articles_with_rating.json'

In [None]:
from langchain.schema import Document

def load_articles(file_path):
  """
  Loads articles from a JSON file and converts them into Document objects.

  One Document is produced per (article, company) pair: the company name is
  the page content, and the article's title, date, author and content are
  stored as metadata.

  Args:
  - file_path (str): Path to the JSON file containing article data. The file
    must hold a list of objects with the keys 'companies', 'title', 'date',
    'author' and 'content'.

  Returns:
  - List[Document]: A list of Document objects, each representing an article with company page content and other metadata.

  Raises:
  - OSError: If the file cannot be opened.
  - json.JSONDecodeError: If the file is not valid JSON.
  - KeyError: If an article is missing one of the required keys.
  """
  # Decode explicitly as UTF-8: the articles contain non-ASCII characters
  # (curly quotes, accented names) and the platform default encoding is not
  # UTF-8 everywhere.
  with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

  all_documents = []
  for article in data:
    for company in article['companies']:
      # Build a fresh metadata dict per Document so that no two Documents
      # share (and could accidentally mutate) the same dict.
      metadata = {
        'title': article['title'],
        'date': article['date'],
        'author': article['author'],
        'content': article['content'],
      }
      all_documents.append(Document(page_content=company, metadata=metadata))

  return all_documents

# Build the per-company Document list from the configured input file.
documents = load_articles(input_file_path)

In [None]:
# Spot-check two of the generated Documents (page content plus metadata).
print(documents[0])
print(documents[2])

page_content='NBC' metadata={'title': '‘I’m On The Horse’: Biden Defends Himself From Post-Debate Criticisms In NBC Interview', 'date': 'Jul 15, 2024,10:25pm EDT', 'author': 'Antonio Pequeño IV', 'content': "President Joe Biden said in a somewhat defensive interview with NBC’s Lester Holt on Monday he is back “on the horse” following a shaky debate performance against former President Donald Trump, expressing his confidence in his campaign for president and critiquing Trump’s rhetoric in the leadup to Election Day. Biden was interviewed on NBC Nightly News by Lester Holt. (Photo by Demetrius Freeman/The Washington ... [+] Post via Getty Images) Holt asked if Biden has a “sense of wanting to get back on the horse” following his debate performance last month, to which Biden replied, “I’m on the horse. Where have you been?” Biden cited major events he has attended in the last few weeks, saying he’s met thousands of people and spoken to “overwhelming crowds” in what he called demonstration

# Vectorise company names

In [None]:
# FAISS is provided by langchain_community (installed above); the legacy
# `langchain.vectorstores` import path is a deprecated shim.
from langchain_community.vectorstores import FAISS

# Embed every Document's page content (the company name) and index it, so that
# the articles for a company can be looked up by name similarity.
company_vector_store = FAISS.from_documents(documents, embedding_model)

# Get unique counts of companies

In [None]:
# Re-read the raw article JSON. Decode explicitly as UTF-8 so the result does
# not depend on the platform's default encoding.
with open(input_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Flatten every article's company list into one list, keeping duplicates so
# that each mention can be counted.
companies = [company for article in data for company in article['companies']]

In [None]:

from collections import Counter
# Tally how many times each company name appears across all articles.
company_counts = Counter()
company_counts.update(companies)

print(company_counts)

Counter({'Bank of Japan': 24, 'Bank of England': 19, 'Amazon': 16, 'Bundesbank': 14, 'Federal Reserve': 14, 'NBC': 13, 'Forbes': 13, 'Tesla': 12, 'Pfizer': 12, 'Hulu': 12, 'Truth Social': 8, 'RealClearPolitics': 7, 'Ford': 7, 'GM': 7, 'Stellantis': 7, 'Wood Mackenzie': 7, 'General Motors': 7, 'Disney+': 7, 'WWE': 6, 'GoFundMe': 6, 'Citadel': 6, 'Griffin Catalyst': 6, 'Citadel Securities': 6, 'Bethel Park Skilled Nursing and Rehabilitation Center': 6, 'HBO Max': 6, 'Eli Lilly': 6, 'Novo Nordisk': 6, 'Zealand Pharma': 6, 'Boehringer Ingelheim': 6, 'Viking Therapeutics': 6, 'Terns Pharmaceuticals': 6, 'Structure Therapeutics': 6, 'Altimmune': 6, 'Amgen': 6, 'Roche': 6, 'Carmot Therapeutics': 6, 'AstraZeneca': 6, 'Eccogene': 6, 'FX': 6, 'Netflix': 6, 'Disney': 6, 'ABC': 6, 'Fox': 6, 'Warner Bros. Studios': 6, 'Apple': 6, 'Warner Bros. Discovery': 6, 'Wedbush Securities': 6, 'Spirit AeroSystems': 6, 'Boeing': 6, 'Airbus': 6, 'Melius Research': 6, 'Northrop Grumman': 6, 'Gradiant': 6, 'TSMC'

# Text analysis
## Assess credit risk with RAG

1. Retrieve the documents for each company by running a similarity search on its name.
2. Combine all content across documents where the company name appears. 
3. Retrieve relevant information that is helpful to assess credit risk.
4. Get the credit risk rating and justification.

In [None]:
from langchain.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
from langchain_experimental.text_splitter import SemanticChunker

# Schema for the model's structured answer: an integer rating constrained to
# 1..10 plus a free-text justification. Both fields are required (`...`).
class CompanyCreditRisk(BaseModel):
    credit_risk_rating: int = Field(..., ge=1, le=10, description="Credit risk rating on a scale of 1 to 10")
    justification: str = Field(..., description="Justification for the credit risk rating.")

# Parses the model's reply into a CompanyCreditRisk instance, which also
# enforces the 1-10 bound declared on the rating field.
parser = PydanticOutputParser(pydantic_object=CompanyCreditRisk)

# Prompt with two placeholders: {input} (the retrieved passage) and
# {company_name}. The expected JSON keys are spelled out by hand in the prompt
# text and must stay in sync with the CompanyCreditRisk field names.
template = """
You are an expert financial analyst specializing in credit risk assessment. Given the following passage and company name, assess the company's credit risk on a scale from 1 to 10, where 1 indicates the lowest risk and 10 indicates the highest risk. Provide a brief justification for each rating based on the content of the article. Even if the passage does not contain specific information on credit risk, based on your knowledge and context, give it a score and justification.

Passage:
{input}

Company Name:
{company_name}

Format the answer as a valid JSON object where it is a dictionary with two keys:
- "credit_risk_rating": the rating score
- "justification": text to justify the score
"""

prompt = PromptTemplate(
    template=template,
    input_variables=["input", "company_name"],
)

# Pipeline: fill the prompt, call the chat model, parse the reply.
chain = prompt | llm | parser

In [None]:
# Maps company name -> (credit_risk_rating, justification). Reset on every run
# of this cell, so re-running it starts from an empty result.
company_with_rating = {}

all_companies = list(company_counts.keys())

# Loop-invariant pieces, built once instead of once per company.
text_splitter = SemanticChunker(embedding_model, breakpoint_threshold_type="percentile")
query = "Retrieve text with relevant information on a company's financial performance, including financial statements, payment history, credit ratings, loan history, debt levels, and any other factors that might influence the credit risk assessment."

for counter, company in enumerate(all_companies, start=1):
  # Progress line: running index and the company being processed.
  print(counter, company)

  # Retrieve as many documents as the company has mentions, searching the
  # company-name index by similarity to the name.
  num_doc_to_retrieve = company_counts[company]
  company_docs = company_vector_store.similarity_search(company, k=num_doc_to_retrieve)

  # Combine title and body of every retrieved article, each newline-terminated.
  content_documents = "".join(
    doc.metadata['title'] + "\n" + doc.metadata['content'] + "\n"
    for doc in company_docs
  )

  # Chunk the combined text semantically.
  content_documents_split = text_splitter.create_documents([content_documents])

  # Index the chunks and keep those most similar to the credit-risk query.
  content_vector_store = FAISS.from_documents(content_documents_split, embedding_model)
  relevant_docs = content_vector_store.similarity_search(query)

  # The selected chunks are concatenated without a separator, exactly as
  # before, so the text sent to the model is unchanged.
  relevant_info = "".join(doc.page_content for doc in relevant_docs)

  # Assess risk; `result` is a parsed CompanyCreditRisk.
  result = chain.invoke({"input": relevant_info, "company_name": company})

  print(f"{result.credit_risk_rating}")

  company_with_rating[company] = (result.credit_risk_rating, result.justification)


  

1 NBC
4
2 Forbes
2
3 RealClearPolitics
1
4 Tesla
3
5 Ford
7
6 GM
6
7 Stellantis
6
8 Truth Social
3
9 Turning Point USA
3
10 Wood Mackenzie
3
11 Bachan
9
12 Prelude Growth Partners
9
13 Whole Foods
8
14 Amazon
2
15 Walmart
8
16 Sonoma Brands Capital
9
17 Facebook
8
18 McCormick
9
19 Cholula
3
20 Heinz
9
21 Tabasco
9
22 General Motors
4
23 Frigidaire
5
24 National Cash Register
5
25 Delco Electronics
5
26 WWE
2
27 GoFundMe
2
28 Citadel
1
29 Heritage Foundation
2
30 Daily Wire
3
31 Griffin Catalyst
2
32 Citadel Securities
1
33 Bethel Park Skilled Nursing and Rehabilitation Center
2
34 HBO Max
2
35 Republican National Committee
4
36 Democratic Party
4
37 UPS
3
38 Teamsters National Black Caucus
4
39 AFL-CIO
4
40 United Auto Workers
4
41 AFSCME
3
42 National Education Association
3
43 Service Employees International Union
3
44 Eli Lilly
3
45 Novo Nordisk
4
46 Zealand Pharma
6
47 Boehringer Ingelheim
5
48 Viking Therapeutics
7
49 Terns Pharmaceuticals
7
50 Structure Therapeutics
7
51 Altimmu

In [None]:
# Persist the ratings as pretty-printed JSON; each (rating, justification)
# tuple is written as a two-element JSON array.
with open('../data/task3_json/company_with_rating.json', 'w', encoding='utf-8') as f:
  f.write(json.dumps(company_with_rating, indent=4))

# Appendix - Example for 1 Company

In [None]:
# Look up the documents for one example company by name similarity.
# k=6 matches the six 'Roche' mentions reported by the company counts above.
query = "Roche"
docs = company_vector_store.similarity_search(query, k=6)

# Print the matched company name of each hit to confirm the retrieval.
for doc in docs:
  print(doc.page_content)

Roche
Roche
Roche
Roche
Roche
Roche


In [None]:
# Stitch together the title and body of every retrieved article, each followed
# by a newline, into one text blob.
content_documents = "".join(
  doc.metadata['title'] + "\n" + doc.metadata['content'] + "\n"
  for doc in docs
)


print(content_documents)

Ozempic And Wegovy Rivals: Here Are The Companies Working On Competitor Weight Loss Drugs
The stellar success of blockbuster drugs like Ozempic, Wegovy, Mounjaro and Zepbound has companies eager to break into the lucrative weight loss drug market — here are the hopeful rivals preparing to challenge Eli Lilly and Novo Nordisk for dominance in the obesity drug gold rush. Drugmakers are vying for a slice of the growing weight loss drug market. Biotech Zealand Pharma is preparing to challenge Danish compatriot Novo on multiple fronts, including survodutide—an injectable it is jointly developing with Germany’s Boehringer Ingelheim that mimics the same GLP-1 gut hormone as Wegovy and Zepbound and another hormone called glucagon and has produced strong results in mid-stage clinical trials as both an anti-obesity drug and a treatment for fatty liver disease—and petrelintide, which mimics a different hunger regulating hormone, amylin, that CEO Adam Steensberg said could become “the backbone of 

In [None]:
from langchain_experimental.text_splitter import SemanticChunker

# Split the combined article text into chunks with the semantic chunker, using
# the embedding model and the "percentile" breakpoint threshold type.
text_splitter = SemanticChunker(embedding_model, breakpoint_threshold_type="percentile")
content_documents_split = text_splitter.create_documents([content_documents])
# Show the text of the first chunk.
print(content_documents_split[0].page_content)

Ozempic And Wegovy Rivals: Here Are The Companies Working On Competitor Weight Loss Drugs
The stellar success of blockbuster drugs like Ozempic, Wegovy, Mounjaro and Zepbound has companies eager to break into the lucrative weight loss drug market — here are the hopeful rivals preparing to challenge Eli Lilly and Novo Nordisk for dominance in the obesity drug gold rush. Drugmakers are vying for a slice of the growing weight loss drug market. Biotech Zealand Pharma is preparing to challenge Danish compatriot Novo on multiple fronts, including survodutide—an injectable it is jointly developing with Germany’s Boehringer Ingelheim that mimics the same GLP-1 gut hormone as Wegovy and Zepbound and another hormone called glucagon and has produced strong results in mid-stage clinical trials as both an anti-obesity drug and a treatment for fatty liver disease—and petrelintide, which mimics a different hunger regulating hormone, amylin, that CEO Adam Steensberg said could become “the backbone of 

In [None]:
# Number of chunks produced, then the full repr of the first chunk.
print(len(content_documents_split))
print(content_documents_split[0])

13
page_content='Ozempic And Wegovy Rivals: Here Are The Companies Working On Competitor Weight Loss Drugs
The stellar success of blockbuster drugs like Ozempic, Wegovy, Mounjaro and Zepbound has companies eager to break into the lucrative weight loss drug market — here are the hopeful rivals preparing to challenge Eli Lilly and Novo Nordisk for dominance in the obesity drug gold rush. Drugmakers are vying for a slice of the growing weight loss drug market. Biotech Zealand Pharma is preparing to challenge Danish compatriot Novo on multiple fronts, including survodutide—an injectable it is jointly developing with Germany’s Boehringer Ingelheim that mimics the same GLP-1 gut hormone as Wegovy and Zepbound and another hormone called glucagon and has produced strong results in mid-stage clinical trials as both an anti-obesity drug and a treatment for fatty liver disease—and petrelintide, which mimics a different hunger regulating hormone, amylin, that CEO Adam Steensberg said could become 

In [None]:
# Index the chunks in their own FAISS store so they can be searched by query.
content_vector_store = FAISS.from_documents(content_documents_split, embedding_model)

In [None]:
# Rebinds `query` (previously the company name) to the credit-risk retrieval
# query. No `k` is passed, so the store's default result count applies.
query = "Retrieve text with relevant information on a company's financial performance, including financial statements, payment history, credit ratings, loan history, debt levels, and any other factors that might influence the credit risk assessment."
relevant_docs = content_vector_store.similarity_search(query)

In [None]:
# Number of chunks returned by the search, then the repr of the top hit.
print(len(relevant_docs))
print(relevant_docs[0])

4
page_content='Early data on the pill suggests amycretin could outperform Wegovy. Lilly, which secured approval for Zepbound in November, is working on a suite of new drugs including orforglipron and retatrutide, which hope to build on the weight loss in Zepbound and are both in late-stage trials. It is still going to be years before Novo Nordisk and Eli Lilly have serious competition for their popular weight loss treatment on pharmacy shelves. While the process varies, it can take between 10 and 20 years to usher a new drug through all three phases of clinical trials and most drugs ultimately fail during clinical testing. There is no guarantee promising results on efficacy, safety and tolerability from earlier trials will be replicated in larger late-stage trials. Even companies with candidates already in the later stages of testing are still going to require several years to gather and process the data and, should things pan out, time to work with regulators to secure approval. Laun

In [None]:
# Concatenate the text of the selected chunks (no separator) into the passage
# that is handed to the model.
relevant_info = "".join(doc.page_content for doc in relevant_docs)



In [None]:
from langchain.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser

# Schema for the model's structured answer: an integer rating constrained to
# 1..10 plus a free-text justification.
# NOTE(review): this repeats the CompanyCreditRisk definition from the earlier
# analysis cell with the same fields; running this cell rebinds the name.
class CompanyCreditRisk(BaseModel):
    credit_risk_rating: int = Field(..., ge=1, le=10, description="Credit risk rating on a scale of 1 to 10")
    justification: str = Field(..., description="Justification for the credit risk rating.")

# NOTE(review): this cell rebinds `parser`, `template`, `prompt` and `chain`,
# which the earlier analysis cell also defines. The template here omits the
# earlier sentence telling the model to give a score even when the passage has
# no credit-risk information, so re-running the main loop after this cell would
# use a different prompt.
parser = PydanticOutputParser(pydantic_object=CompanyCreditRisk)

template = """
You are an expert financial analyst specializing in credit risk assessment. Given the following passage and company name, assess the company's credit risk on a scale from 1 to 10, where 1 indicates the lowest risk and 10 indicates the highest risk. Provide a brief justification for each rating based on the content of the article.

Passage:
{input}

Company Name:
{company_name}

Format the answer as a valid JSON object where it is a dictionary with two keys:
- "credit_risk_rating": the rating score
- "justification": text to justify the score
"""

prompt = PromptTemplate(
    template=template,
    input_variables=["input", "company_name"],
)

# Pipeline: fill the prompt, call the chat model, parse the reply.
chain = prompt | llm | parser

In [None]:
# Run the chain on the retrieved passage for the example company and print the
# parsed result (rating and justification).
result = chain.invoke({"input": relevant_info, "company_name": "Roche"})
print(result)

credit_risk_rating=2 justification="Roche is a Swiss heavyweight in the pharmaceutical industry and has recently inked a $2.7 billion deal for weight loss drug developer Carmot Therapeutics. This indicates that the company has significant financial resources and is actively investing in promising new ventures. Furthermore, the company operates in a sector where innovation is often replaced with mergers and acquisitions, suggesting that Roche is well-positioned to capitalize on this trend. However, the pharmaceutical industry is inherently risky, with high costs and long development times for new drugs, and there is no guarantee that new drugs will be successful. Despite these risks, Roche's size, financial resources, and strategic investments suggest a relatively low credit risk."


In [None]:
# Bare last expression: the notebook displays the integer rating on its own.
result.credit_risk_rating

2