In [191]:
from langchain_ollama import OllamaLLM
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


In [204]:
# This notebook is an experiment. The first goal is to read news,
# then use an LLM to filter out unwanted content and display the filtered articles.
# The second goal is to try out JSON output using https://python.langchain.com/docs/concepts/structured_outputs/
#    Get a news feed from a free API such as https://newsapi.org; a developer API key is free for study purposes, and their documentation describes the JSON format.
# Take a look at the output using the command-line tool curl. Piping through tee shows the response on standard output and writes it to a file at the same time:
#  ```
# curl -s "https://newsapi.org/v2/top-headlines?country=us&apiKey=<useYourOwnAPIKey>" 2>&1 | tee 02022025news.json
# ```
# 1. Read the news from a local file as source data. This can be replaced with a real API call that fetches the news as shown above. TODO: use the API here.
import json

def read_json_file(file_path):
    """Load a JSON file and return its content as a pretty-printed JSON string.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed document serialized back to a string with 4-space
        indentation (``json.dumps`` defaults, so non-ASCII characters are
        emitted as ``\\uXXXX`` escapes).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read explicitly as UTF-8: without an encoding, open() uses the platform
    # locale (e.g. cp1252 on Windows) and can fail on or mis-decode non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return json.dumps(data, indent=4)

# Local snapshot of the news feed used as source data for this notebook.
file_path = './data/news/02022025news.json'
# Result of read_json_file on that snapshot; it is later passed to the chain as the prompt context.
input_json_articles = read_json_file(file_path)
# Uncomment to inspect the loaded articles:
#print(input_json_articles)


In [205]:
# 2. Define the JSON schema (draft-07) for the structured output:
#    an object with an "articles" array; every article must provide
#    the four string fields listed under "required".
outputJsonSchema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {
        "articles": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "url": {"type": "string"},
                    "urlToImage": {"type": "string"},
                    "publishedAt": {"type": "string"},
                },
                "required": ["title", "url", "urlToImage", "publishedAt"],
            },
        },
    },
    "required": ["articles"],
}



In [206]:
from langchain_ollama import ChatOllama

# Chat model served through Ollama. The JSON schema is passed as `format`
# so the reply is requested in that structure; temperature is set to 0.
llm = ChatOllama(model="llama3.2:latest", temperature=0, format=outputJsonSchema)


In [207]:
# 3. Create the prompt template

from langchain.prompts import PromptTemplate

# Prompt text with two placeholders: {context} (the news JSON) and {question}.
# Built from adjacent literals; the resulting string starts with a newline and
# ends with one, exactly like the equivalent triple-quoted form.
template = (
    "\n"
    "Answer the question based only on the context provided.\n"
    "\n"
    "Context: {context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = PromptTemplate.from_template(template)

# 4. Build the chain of operations: fill the prompt, call the model,
#    then reduce the model message to a plain string.
chain = prompt | llm | StrOutputParser()


In [208]:
# Filter request sent to the model together with the news context.
# The filter keywords are spelled correctly ("earthquake", "wildfire") so the
# model is not asked to match misspelled topics.
question = (
    "can you print out all the articles in the context except that the content field contains: "
    "natural disaster like earthquake, wildfire, accident like plane crash, train crash, "
    "crime, and war-related topics, and celebrity news?"
)

In [198]:
# Run the chain once: the loaded news JSON fills {context},
# the filter request fills {question}.
chain_inputs = {"context": input_json_articles, "question": question}
resp = chain.invoke(chain_inputs)

In [209]:
# Show the raw string returned by the chain (the model's JSON reply).
print(resp)

{"articles": [{"title": "Tech stocks have worst week in months after 'nobody saw DeepSeek coming' - MarketWatch", "url": "https://www.marketwatch.com/story/tech-stocks-see-worst-week-in-months-after-nobody-saw-deepseek-coming-amid-ai-mania-4de0101c", "urlToImage": "https://images.mktw.net/im-48393816/social", "publishedAt": "2025-02-01T01:12:00Z"}, {"title": "Heat\u2019s Terry Rozier declines to discuss federal gambling probe - New York Post ", "url": "https://nypost.com/2025/01/31/betting/terry-rozier-declines-to-talk-gambling-probe/", "urlToImage": "https://nypost.com/wp-content/uploads/sites/2/2025/01/2194241155.jpg?quality=75&strip=all&w=1024", "publishedAt": "2025-02-01T00:58:00Z"}, {"title": "College basketball picks, schedule: Predictions for Tennessee vs. Florida and more Top 25 games for Saturday - CBS Sports", "url": "https://www.cbssports.com/college-basketball/news/college-basketball-picks-schedule-predictions-for-tennessee-vs-florida-and-more-top-25-games-for-saturday/", "

In [210]:
# Parse the LLM's JSON output into HTML so it can be displayed in the notebook.
def parse_news(json_string):
    """Render the articles of a news JSON document as an HTML list.

    Args:
        json_string: JSON text of an object with an ``articles`` array. Each
            article may carry ``title``, ``url``, ``urlToImage`` and
            ``publishedAt``.

    Returns:
        An HTML ``<ul>`` string with one ``<li>`` per article, or
        ``"<ul></ul>"`` when ``articles`` is missing, null or empty. All
        article values are HTML-escaped. The link and the image are left out
        for an article that has no ``url`` / ``urlToImage``.

    Raises:
        json.JSONDecodeError: If ``json_string`` is not valid JSON.
    """
    # Function-scope import keeps this cell runnable on its own (stdlib only).
    from html import escape

    def _text(value):
        # Missing/null fields become empty text instead of the literal "None".
        # quote=True also escapes ' and ", which matters because the values
        # are placed inside single-quoted attributes below.
        return "" if value is None else escape(str(value), quote=True)

    data = json.loads(json_string)
    articles = data.get('articles') or []

    # Collect fragments and join once instead of repeated string concatenation.
    parts = ["<ul>"]
    for article in articles:
        title = _text(article.get('title'))
        url = _text(article.get('url'))
        image_url = _text(article.get('urlToImage'))
        published_at = _text(article.get('publishedAt'))

        parts.append(f"<li><h2>{title}</h2>")
        if url:
            parts.append(f"<p><a href='{url}'>{url}</a></p>")
        if image_url:
            parts.append(
                f"<img width='250' height='250' src='{image_url}' alt='Image for {title}' />"
            )
        parts.append(f"<p>Published At: {published_at}</p></li>")
    parts.append("</ul>")
    return "".join(parts)
# Turn the chain's JSON reply into the HTML fragment shown in the next cell.
output_html = parse_news(resp)
# Uncomment to inspect the generated markup:
#print(output_html)

In [201]:
# Import HTML from IPython.display, the documented public display API,
# rather than from the IPython.core.display implementation module.
from IPython.display import HTML

# Last expression of the cell: the notebook renders it as rich HTML.
HTML(output_html)