In [1]:
import requests
import os

In [2]:


# Define the API endpoint and API key
url = "https://newsapi.org/v2/everything"


# Define the query parameters
params = {
    "q": "finance",
    "apiKey": os.environ.get("NEWS_API_KEY"),
}

# Make the GET request
response = requests.get(url, params=params)

# Check if the request was successful
if response.status_code == 200:
    data = response.json()  # Parse the response as JSON
    articles = data.get("articles", [])  # Extract the articles list
    
    list_news = []
    
    # Extract and print the required fields from each article
    for article in articles:
        try:
            news_object = {
                "title": article.get("title") or "",
                "description": article.get("description") or "",
                "url": article.get("url") or "",
                "publishedAt": article.get("publishedAt") or "",
                "content": article.get("content") or "",
            }
            
            content = article.get("content") or ""
            title = article.get("title") or ""
            url = article.get("url") or ""
            description = article.get("description") or ""
            published_at = article.get("publishedAt") or ""
            list_news.append(news_object)
            
        except Exception as e:
            print(f"An error occurred: {e}")
else:
    print(f"Failed to fetch articles. Status code: {response.status_code}")


In [3]:
import csv

# Define the CSV file name
csv_file = "news_articles.csv"

# Open the CSV file for writing
with open(csv_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["title", "description", "url", "publishedAt", "content"])
    writer.writeheader()  # Write the header row
    writer.writerows(list_news)  # Write the articles

print(f"CSV file '{csv_file}' has been created successfully.")

CSV file 'news_articles.csv' has been created successfully.


In [4]:
import pandas as pd

dataset = pd.read_csv("news_articles.csv")
dataset.sample(5)

Unnamed: 0,title,description,url,publishedAt,content
37,Stripe CFO joins the board of $3 billion AI st...,Steffan Tomlinson has experience taking tech s...,https://www.businessinsider.com/stripe-cfo-ste...,2024-12-17T16:01:01Z,Steffan Tomlinson (right) joined Vercel's boar...
74,Francis Ford Coppola Picks His Favorite Criter...,Upon stepping into the hallowed Criterion Clos...,https://www.openculture.com/?p=1121369,2024-12-26T09:00:54Z,Upon step­ping into the hal­lowed Cri­te­ri­on...
80,Canada Prime Minister Justin Trudeau to Step D...,Canadian Prime Minister Justin Trudeau is step...,https://time.com/7205012/justin-trudeau-resign...,2025-01-06T16:16:48Z,Canadian Prime Minister Justin Trudeau is step...
86,AI model for near-instant image creation on co...,Comments,https://www.surrey.ac.uk/news/surrey-announces...,2024-12-10T16:44:52Z,The technology is available immediately throug...
34,Canada's Justin Trudeau says he will resign as...,Trudeau's decision after almost a decade in po...,https://www.npr.org/2025/01/06/nx-s1-5249766/j...,2025-01-06T16:22:31Z,Canadian Prime Minister Justin Trudeau announc...


In [33]:
dataset.isnull().sum()

dataset.dropna(inplace=True)

In [34]:
dataset = dataset[~dataset.apply(lambda row: row.astype(str).str.contains('\[Removed\]').any(), axis=1)]
dataset.reset_index(drop=True, inplace=True)
print(dataset)

                                                title  \
0       Car loan scandal payout fears as row drags on   
1   The 5 best US cities to celebrate New Year's E...   
2   Dave Ramsey's 2 tips as people prepare to spen...   
3   Canadian Man Crowned as King of Spreadsheets a...   
4   Canada's finance minister resigns, posing bigg...   
..                                                ...   
89  Act Now to Lock In Up to 4.70% APY. Today's CD...   
90  Apple @ Work: The future of the Mac in the wor...   
91  Canada finance minister resigns as PM Trudeau ...   
92  Elon Musk Says U.S. Needs More Power – But Ris...   
93  4 Financial Habits To Break After Age 50 If Yo...   

                                          description  \
0   People who were unaware of commission paid whe...   
1   The personal finance website WalletHub compare...   
2   Dave Ramsey, a personal finance star, advised ...   
3   Guess you could say this guy really excels at ...   
4   The stunning move raised q

  dataset = dataset[~dataset.apply(lambda row: row.astype(str).str.contains('\[Removed\]').any(), axis=1)]


https://api.smith.langchain.com


In [35]:
# Define the CSV file name
csv_file = "news_articles.csv"

# Open the CSV file for writing
with open(csv_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=dataset.columns)
    writer.writeheader()  # Write the header row
    writer.writerows(dataset.to_dict('records'))  # Write the data

print(f"CSV file '{csv_file}' has been created successfully.")

CSV file 'news_articles.csv' has been created successfully.


In [16]:
import os

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = os.environ.get("LANGCHAIN_ENDPOINT")
os.environ["LANGCHAIN_API_KEY"] = os.environ.get("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = os.environ.get("LANGCHAIN_PROJECT")
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY")


In [23]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [24]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

In [25]:
# Load, chunk and index the contents of the blog.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

In [36]:
# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever(search_type="similarity", top_k=5)
# prompt = hub.pull("rlm/rag-prompt")

# def format_docs(docs):
#     return "\n\n".join(doc.page_content for doc in docs)


# rag_chain = (
#     {"context": retriever | format_docs, "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )

retrieved_docs = retriever.invoke("finance")
print(len(retrieved_docs))

4


355
