In [2]:
from pymongo import MongoClient
#from sklearn.neighbors import KDTree
import numpy as np
from dotenv import load_dotenv
import os
from openai import OpenAI
# Load secrets from the shared env file (relative path) so that no credential
# is hard-coded in the notebook.
load_dotenv("../../vars.env")

# Both lookups yield None when the variable is not present in the environment.
openai_api = os.getenv("OPENAI_API")
uri = os.getenv("MONGODB_URI")


def fetch_embeddings_from_mongo(expected_dim=1536):
    """
    Load one ``[embedding, url]`` pair per distinct article URL from MongoDB.

    Connects with the module-level ``uri``, reads the ``articles`` collection
    of the ``news`` database and keeps the first document seen for each URL.
    The client is always closed before returning.

    :param expected_dim: Required length of ``semantic_embedding``. Documents
        with a different length are reported and skipped, because a vector of
        the wrong size would make ``similarity_score`` raise later on. Pass
        ``None`` to disable the check.
    :return: List of ``[semantic_embedding, url]`` entries.
    """
    client = MongoClient(uri)
    try:
        try:
            client.admin.command('ping')
            print("Pinged your deployment. You successfully connected to MongoDB!")
        except Exception as e:
            # Best-effort connectivity check: report the problem and let the
            # query below surface a hard failure if the server is unreachable.
            print(e)

        collection = client["news"]["articles"]

        embeddings = []
        urls = set()
        incomplete = 0
        # Only these two fields are used, so do not transfer whole documents.
        projection = {"_id": 0, "url": 1, "semantic_embedding": 1}
        for doc in collection.find({}, projection):
            url = doc.get('url')
            embedding = doc.get('semantic_embedding')
            if url is None or embedding is None:
                incomplete += 1
                continue
            if expected_dim is not None and len(embedding) != expected_dim:
                print(f"Skipping {url}: embedding length {len(embedding)} != {expected_dim}")
                continue
            if url not in urls:
                urls.add(url)
                embeddings.append([embedding, url])

        if incomplete:
            print(f"Skipped {incomplete} document(s) without 'url' or 'semantic_embedding'")
        return embeddings
    finally:
        # Release the connection pool even when the query raises.
        client.close()



def find_closest_article(embedding, article_embeddings):
    """
    Return the URL of the article whose embedding is most similar to ``embedding``.

    :param embedding: Query embedding vector.
    :param article_embeddings: Iterable of entries whose item 0 is an
        embedding and item 1 the matching URL.
    :return: URL of the best match (the earliest one on ties), or ``False``
        when no candidate has a comparable score (empty input, or every
        score is NaN).
    """
    best_score = None
    best_url = False
    for other_embed in article_embeddings:
        # Score each candidate exactly once.
        score = similarity_score(embedding, other_embed[0], verbose=False)
        # A NaN score compares false against everything; if it were accepted
        # as the running best, no later candidate could ever replace it.
        if np.isnan(score):
            continue
        if best_score is None or score > best_score:
            best_score = score
            best_url = other_embed[1]

    return best_url



def find_closest_k_articles(embedding, article_embeddings, k):
    """
    Rank articles by similarity to ``embedding`` and return the best ``k``.

    :param embedding: Query embedding vector.
    :param article_embeddings: Iterable of entries whose item 0 is an
        embedding and item 1 the matching URL.
    :param k: Maximum number of results; values below 1 yield empty lists.
    :return: ``(urls, scores)`` - two parallel lists ordered from most to
        least similar, each at most ``k`` long.
    """
    scored = []
    for other_embed in article_embeddings:
        score = similarity_score(embedding, other_embed[0], verbose=False)
        # NaN has no defined ordering and would make the sort result arbitrary.
        if np.isnan(score):
            continue
        scored.append((score, other_embed[1]))

    # Stable descending sort: candidates with equal scores keep input order.
    scored.sort(key=lambda pair: pair[0], reverse=True)

    # A negative k would otherwise slice from the end and silently return
    # all but the last |k| matches.
    top = scored[:max(k, 0)]
    top_k_articles = [url for _, url in top]
    top_k_scores = [score for score, _ in top]

    return top_k_articles, top_k_scores



def get_embedding(text, engine = 'text-embedding-ada-002'):
    """
    Embed ``text`` through the OpenAI embeddings endpoint.

    :param text: String to embed; newlines are flattened to spaces first.
    :param engine: Name of the embedding model to request.
    :return: The ``embedding`` attribute of the first item in the response data.
    """
    # The key is handed over explicitly: it comes from the module-level
    # ``openai_api`` value rather than from the client's own defaults.
    openai_client = OpenAI(api_key=openai_api)

    single_line = text.replace("\n", " ")
    response = openai_client.embeddings.create(input=[single_line], model=engine)
    first_item = response.data[0]
    return first_item.embedding



def similarity_score(x, y, verbose = True):
    """
    Cosine similarity of two vectors.

    :param x: First vector (any array-like NumPy accepts).
    :param y: Second vector, same length as ``x``.
    :param verbose: When true, also print the score with four decimals.
    :return: Dot product of ``x`` and ``y`` divided by the product of their
        Euclidean norms.
    """
    vec_x, vec_y = np.array(x), np.array(y)
    dot_product = vec_x.T @ vec_y
    norm_product = np.linalg.norm(vec_x) * np.linalg.norm(vec_y)
    sim_score = dot_product / norm_product
    if verbose:
        print(f"Similarity between the two embeddings is: {sim_score:.4f}")
    return sim_score



def find_related_articles(query, k):
    """
    Print the ``k`` stored articles most similar to a free-text ``query``.

    The query is embedded with ``get_embedding`` and ranked against the
    module-level ``article_embeddings`` by ``find_closest_k_articles``; one
    line with URL and score is printed per hit. Nothing is returned.

    :param query: Text to search for.
    :param k: Number of results to print.
    """
    ranked_urls, ranked_scores = find_closest_k_articles(
        get_embedding(query), article_embeddings, k
    )
    for url, score in zip(ranked_urls, ranked_scores):
        print(f"URL: {url} , SCORE: {score}")



# Load the stored article embeddings once; find_related_articles reads this
# module-level list on every call.
article_embeddings = fetch_embeddings_from_mongo()


Pinged your deployment. You successfully connected to MongoDB!


In [5]:
# Single-best-match check: embed a broad topic query and look up the one
# closest article.
query = "The economy"
query_embedding = get_embedding(query)

# u is the URL returned by find_closest_article for this query.
u = find_closest_article(query_embedding, article_embeddings)
print(u)

https://www.msn.com/en-us/money/markets/this-record-breaking-market-just-keeps-going-higher-and-higher-heres-why/ar-BB1gXTTG


In [6]:
# Same query as the single-best-match check, now printing the five
# highest-scoring articles with their scores.
query = "The economy"
find_related_articles(query, 5)

URL: https://www.msn.com/en-us/money/markets/this-record-breaking-market-just-keeps-going-higher-and-higher-heres-why/ar-BB1gXTTG , SCORE: 0.8154835962439331
URL: https://www.msn.com/en-us/money/companies/as-kansas-leaders-tout-successful-year-in-economic-development-workers-are-too-few/ar-AA1mgCmG , SCORE: 0.8101414616747808
URL: https://www.msn.com/en-us/money/markets/s-p-500-closes-at-a-new-all-time-high-as-fresh-data-drives-optimism-for-rate-cuts/ar-BB1gXT2g , SCORE: 0.8095715629212235
URL: https://www.msn.com/en-us/money/markets/top-economist-steve-hanke-says-stocks-will-drop-a-recession-will-hit-and-inflation-will-sink-below-2-this-year/ar-AA1naUjZ , SCORE: 0.8033216799882013
URL: https://www.msn.com/en-us/money/markets/morning-bid-americas-china-chill-ahead-of-holiday-retail-readout/ar-AA1n91mI , SCORE: 0.8032214782009546


In [7]:
# Named-person query, top five matches.
# NOTE(review): the name is usually spelled "Antony Blinken" - confirm whether
# the spelling used here is intentional.
query = "Secretary of State Anthony Blinken"
find_related_articles(query, 5)

URL: https://www.msn.com/en-us/news/world/blinken-senior-chinese-official-discuss-n-korea-taiwan-russias-war-in-ukraine/ar-AA1mTpit , SCORE: 0.86952086087249
URL: https://www.msn.com/en-us/news/world/blinken-meets-with-netanyahu-in-hopes-of-stemming-israel-gaza-conflict-s-spread/ar-AA1mFSV7 , SCORE: 0.8542217001010897
URL: https://www.msn.com/en-us/news/world/israel-hamas-war-blinken-heads-to-middle-east-for-gaza-talks/ar-AA1mv3G8 , SCORE: 0.8395089521868522
URL: https://www.msn.com/en-us/news/world/blinken-on-diplomatic-push-in-israel-as-it-says-gaza-war-to-continue-through-2024/ar-AA1mFbuW , SCORE: 0.8386108620424871
URL: https://www.msn.com/en-us/news/world/blinken-promises-ukraine-enduring-us-support-in-war-with-russia/ar-AA1n4GIW , SCORE: 0.8351010974565222


In [8]:
# Abstract, thematic query without a named entity, top five matches.
query = "The fall of the West"
find_related_articles(query, 5)

URL: https://www.msn.com/en-us/news/opinion/israel-s-war-on-gaza-and-the-west-s-credibility-crisis/ar-AA1n3HOO , SCORE: 0.809326834478905
URL: https://www.msn.com/en-us/news/world/ending-us-war-aid-will-bring-a-big-crisis-worse-than-just-a-weakened-ukraine-zelenskyy-warns/ar-AA1n8UWk , SCORE: 0.800653842531181
URL: https://www.msn.com/en-us/news/world/russia-failed-to-capitalize-on-victories-as-ukraine-war-hits-stalemate-uk/ar-AA1n5b4G , SCORE: 0.8004979420269253
URL: https://www.msn.com/en-us/news/world/russian-state-tv-pundit-warns-life-keeps-getting-worse-amid-ukraine-war/ar-AA1mXrPW , SCORE: 0.8000145406880936
URL: https://www.msn.com/en-us/news/world/our-enemies-will-vanish-an-up-close-look-at-the-russia-ukraine-war-review/ar-AA1mvJGa , SCORE: 0.796763040962815


In [9]:
# One-word category query; asks for ten results instead of five.
query = "Sports"
find_related_articles(query, 10)

URL: https://www.msn.com/en-us/foodanddrink/recipes/38-game-day-potluck-recipes/ar-AA1j0Dd0 , SCORE: 0.7936452971400206
URL: https://www.msn.com/en-us/sports/soccer/asian-cup-holds-moments-silence-for-israel-hamas-war-victims-ahead-of-palestinian-teams-game/ar-AA1mXqV2 , SCORE: 0.7909492692723754
URL: https://www.msn.com/en-us/sports/other/arizona-digest-2024-wnba-all-star-tickets-on-sale-jan-30-for-phoenix-event/ar-AA19RWb9 , SCORE: 0.7821994963269334
URL: https://www.msn.com/en-us/news/world/part-of-that-war-machine-hurting-my-country-and-my-people-says-ukrainian-tennis-star-of-russian-and-belarusian-opponents/ar-BB1gX8Zl , SCORE: 0.7786099994427168
URL: https://www.msn.com/en-us/news/world/turkey-detains-israeli-soccer-player-for-displaying-gaza-war-message/ar-AA1mZphV , SCORE: 0.7773383727820247
URL: https://www.msn.com/en-us/news/world/reuters-news-schedule-at-1000-pm-gmt600-am-sgt/ar-BB1gVN7x , SCORE: 0.7752082868783389
URL: https://www.msn.com/en-us/news/us/after-year-of-culture

In [10]:
# Full natural-language question that combines several topics, top ten matches.
query = "If Donald Trump wins in 2024, what will happen to Israel and Gaza? In particular, how will American involvement change?"
find_related_articles(query, 10)

URL: https://www.msn.com/en-us/news/politics/8-questions-for-the-start-of-what-will-be-a-turbulent-election-2024/ar-AA1mCGtP , SCORE: 0.8340842021721607
URL: https://www.msn.com/en-us/news/opinion/ending-israels-gaza-operation-is-also-the-surest-way-to-avoid-a-regional-war/ar-AA1n8Du7 , SCORE: 0.8273386940975127
URL: https://www.msn.com/en-us/news/politics/trump-vs-biden-former-president-retakes-lead-in-2024-election-poll-but-sees-lower-support-from-independent-voters/ar-AA1n9gy8 , SCORE: 0.8264759484855435
URL: https://www.msn.com/en-us/news/politics/2024-presidential-election-race-will-be-unlike-any-other-strategists-say/ar-AA1mjV9l , SCORE: 0.8215862416954768
URL: https://www.msn.com/en-us/news/world/after-100-days-israel-hamas-war-threatens-to-spill-beyond-gaza-disrupt-global-trade/ar-AA1mW1ZP , SCORE: 0.8184613228697036
URL: https://www.msn.com/en-us/news/politics/with-asa-hutchinson-out-of-the-race-these-are-the-major-2024-presidential-candidates/ar-AA17x678 , SCORE: 0.8180530792

In [11]:
# Two-topic query (regional war plus global economy), top three matches.
query = "The War in the Middle East and its consequences on the global economy"
find_related_articles(query, 3)

URL: https://www.msn.com/en-us/money/markets/qatari-finance-minister-says-gaza-war-to-slow-middle-east-economies/ar-AA1nb9MM , SCORE: 0.8338022737901251
URL: https://www.msn.com/en-us/news/world/after-100-days-israel-hamas-war-threatens-to-spill-beyond-gaza-disrupt-global-trade/ar-AA1mW1ZP , SCORE: 0.8251484015762895
URL: https://www.msn.com/en-us/news/world/iran-says-attacks-by-its-allies-won-t-stop-until-israel-s-war-in-gaza-ends/ar-AA1n8TN2 , SCORE: 0.8217586811895186


In [12]:
# Single-result probe for a specific institution.
# NOTE(review): in the recorded run the only hit is a Reuters news-schedule
# page (score ~0.77), which presumably means the corpus holds no article on
# this topic - verify.
query = "MIT Media Lab"
find_related_articles(query, 1)

URL: https://www.msn.com/en-us/news/world/reuters-news-schedule-at-1000-pm-gmt600-am-sgt/ar-BB1gVN7x , SCORE: 0.7719512504008126


In [13]:
# Factual question used as the query, top three matches.
query = "How many Russians have died in the war in Ukraine so far?"
find_related_articles(query, 3)

URL: https://www.msn.com/en-us/news/world/russias-losses-in-ukraine-as-of-january-18-around-800-troops-and-over-40-uavs/ar-AA1nasDg , SCORE: 0.8784565004048006
URL: https://www.msn.com/en-us/news/world/russias-intense-attacks-on-ukraine-has-sharply-increased-civilian-casualties-in-december-un-says/ar-AA1n5IMR , SCORE: 0.863747088928059
URL: https://www.msn.com/en-us/news/world/thursday-january-4-russia-s-war-on-ukraine-news-and-information-from-ukraine/ar-AA1mudbf , SCORE: 0.8633812131288521


In [14]:
# Maximally generic one-word query, top three matches.
# NOTE(review): the recorded output lists the same headline twice under two
# different article ids with an identical score, so de-duplicating by URL
# alone apparently does not remove re-published copies - confirm.
query = "News"
find_related_articles(query, 3)

URL: https://www.msn.com/en-us/news/world/reuters-news-schedule-at-1000-pm-gmt600-am-sgt/ar-BB1gVN7x , SCORE: 0.8184959288543778
URL: https://www.msn.com/en-us/news/world/global-impact-earthquake-plane-collision-not-best-start-to-year-for-japan-as-it-faces-pressure-at-home-and-abroad/ar-AA1mYQyU , SCORE: 0.7908025578868985
URL: https://www.msn.com/en-us/news/world/global-impact-earthquake-plane-collision-not-best-start-to-year-for-japan-as-it-faces-pressure-at-home-and-abroad/ar-AA1mZ7MS , SCORE: 0.7908025578868985


In [4]:
# Uses a long clarifying question (rather than a short topic) as the query
# and keeps only the best match.
query = "Current Conflict Details: Can you specify which conflict in the Middle East you are referring to? The region has experienced various conflicts, and each has its unique implications on the global economy."
find_related_articles(query, 1)

URL: https://www.msn.com/en-us/money/markets/qatari-finance-minister-says-gaza-war-to-slow-middle-east-economies/ar-AA1nb9MM , SCORE: 0.7991254674677261


In [6]:
# Long clarifying question about economic trends as the query, best match only.
query = "Global Economic Trends: What are the major global economic trends as of January 2024? This includes any significant shifts in global markets, major economic downturns or booms, and the status of international trade relations."
find_related_articles(query, 1)

URL: https://www.msn.com/en-us/money/markets/morning-bid-americas-china-chill-ahead-of-holiday-retail-readout/ar-AA1n91mI , SCORE: 0.8235217032419531


In [7]:
# Long clarifying question about technology as the query, best match only.
# NOTE(review): this query says "January 2025" while the economic-trends query
# says "January 2024" - confirm which date is intended.
query = "Technological Advancements: What significant technological advancements or breakthroughs have occurred by January 2025? This can include developments in AI, biotechnology, space exploration, or any other field that has had a substantial impact on society."
find_related_articles(query, 1)

URL: https://www.msn.com/en-us/money/markets/banner-year-for-us-stock-market-in-2023-top-performing-stocks-from-diverse-sectors/ar-AA1mclbZ , SCORE: 0.8024571062683201


In [8]:
# Long clarifying question about politics and society as the query, best match only.
query = "Political and Social Climate: What is the political and social climate like in major world regions, including any significant conflicts, alliances, or social movements that have emerged?"
find_related_articles(query, 1)

URL: https://www.msn.com/en-us/news/world/global-impact-earthquake-plane-collision-not-best-start-to-year-for-japan-as-it-faces-pressure-at-home-and-abroad/ar-AA1mYQyU , SCORE: 0.7910561687787119


In [5]:
""""
You are Nostradamus' protege. I am going to ask you a hypothetical question about things 
you were not trained on and you will hypothesize an answer based on your vast knowledge base.
However, in order to aid your answer, when I give you the question, you must provide 
exactly 5 Google search prompts that, given the results, will best help you answer the 
original question. Do not elaborate as to why you chose these questions, and separate 
each question by a semicolon. The questions should be filled with keywords that will help you
generate the best responses to your queries. I recommend that if you do not know the
context for the question, that you use questions for this. The question is as follows:

What if the Houthis in Yemen retaliate against the American for their strikes?


1. "Houthi capabilities for retaliation against US"; 
2. "Historical examples of Houthi retaliation"; 
3. "Impact of US strikes on Yemen"; 
4. "US military presence in the Middle East 2024"; 
5. "Recent Houthi statements on US policy".
"""
# Follow-up search prompt for the Houthi what-if question in the notes above.
query = "Possibilities of Houthi allies' support in retaliation"
print(f"Finding articles similar to query: {query}")
# find_related_articles prints its own results and returns None, so call it
# directly; wrapping it in print() only appended a stray "None" line.
find_related_articles(query, 3)

Finding articles similar to query: Possibilities of Houthi allies' support in retaliation
URL: https://www.msn.com/en-us/news/world/israel-gaza-war-live-updates-us-carries-out-new-strike-on-houthi-radar-site-israel-speaks-at-icj/ar-AA1mQHLo , SCORE: 0.8431067032603126
URL: https://www.msn.com/en-us/news/world/israel-gaza-war-live-updates-us-hits-houthi-sites-in-yemen-israel-steps-up-west-bank-raids/ar-AA1na07t , SCORE: 0.8297850625621296
URL: https://www.msn.com/en-us/news/world/iran-says-attacks-by-its-allies-won-t-stop-until-israel-s-war-in-gaza-ends/ar-AA1n8TN2 , SCORE: 0.8279648668673546
None


In [11]:
import sys
import re
import requests
from bs4 import BeautifulSoup



def fetch_article_contents(article_id, timeout=10):
    '''
    Returns article's author and contents of the article

    :param article_id: Article id appended to the MSN asset endpoint URL,
        e.g. the value returned by fetch_article_id.
    :param timeout: Seconds to wait for the endpoint before giving up.
    :return: ``(author, text)`` on success. ``text`` is the text of every
        ``<p>`` element in the payload's ``body`` joined by blank lines, and
        ``author`` is the first listed author's name or ``'Not found'``.
        ``None`` when the request fails or the response is not valid JSON.
    '''
    asset_url = "https://assets.msn.com/content/view/v2/Detail/en-us/" + article_id

    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get(asset_url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decoding error is not a RequestException subclass.
        print(f"Error fetching article: {e}")
        return None

    # Treat a missing or null body the same way.
    html_content = data.get('body') or 'No content found'

    authors = data.get('authors') or []
    author = authors[0].get('name', 'Not found') if authors else 'Not found'

    soup = BeautifulSoup(html_content, 'lxml')
    paragraphs = [p.get_text(separator=' ', strip=True) for p in soup.find_all('p')]
    return author, '\n\n'.join(paragraphs)




# Matches the "/ar-<id>" path segment of an article URL; group 1 is the id.
_ARTICLE_ID_PATTERN = re.compile(r'/ar-([A-Za-z0-9]+)')


def fetch_article_id(article_url):
    """
    Extract the article id from an article URL.

    :param article_url: URL containing a ``/ar-<id>`` segment.
    :return: The alphanumeric id that follows ``/ar-``.
    :raises ValueError: If the URL has no ``/ar-<id>`` segment. ``ValueError``
        is an ``Exception`` subclass, so existing ``except Exception``
        handlers keep working.
    """
    match = _ARTICLE_ID_PATTERN.search(article_url)
    if match is None:
        raise ValueError(f"No article ID found in URL: {article_url!r}")
    return match.group(1)
    
# Fetch author and body text for one of the articles returned by the Houthi
# query; the returned tuple is displayed as the cell output.
fetch_article_contents(fetch_article_id("https://www.msn.com/en-us/news/world/israel-gaza-war-live-updates-us-hits-houthi-sites-in-yemen-israel-steps-up-west-bank-raids/ar-AA1na07t"))

('Frances Vinall, Adela Suliman, Miriam Berger, Adam Taylor, Karen DeYoung, Sammy Westfall',
 'This live coverage has ended. For the latest updates, please go here .\n\nThe United States fired its second round of strikes in less than 24 hours against Houthi militants in Yemen on Thursday morning, targeting “a couple of anti-ship missiles that we had reason to believe were being prepared for imminent fire into the southern Red Sea,” National Security Council spokesman John Kirby said. Israel stepped up raids in the occupied West Bank on Thursday, saying security forces had been operating in the Tulkarm refugee camp for over 35 hours.\n\n8:38 PM: U.S. launches new round of strikes against Yemen’s Houthis; Houthis attack another ship\n\nThe United States fired another round of strikes Thursday morning against Houthi militants in Yemen , targeting “a couple of anti-ship missiles that we had reason to believe were being prepared for imminent fire into the southern Red Sea,” National Securit

In [12]:
# Same lookup for the top-ranked article of the Houthi query.
fetch_article_contents(fetch_article_id("https://www.msn.com/en-us/news/world/israel-gaza-war-live-updates-us-carries-out-new-strike-on-houthi-radar-site-israel-speaks-at-icj/ar-AA1mQHLo"))

('Andrew Jeong, Frances Vinall, Victoria Bisset, Emily Rauhala, Adam Taylor',

In [13]:
# Same lookup for a third live-updates article, identified directly by its URL.
fetch_article_contents(fetch_article_id("https://www.msn.com/en-us/news/world/israel-gaza-war-live-updates-medicine-for-hostages-enters-gaza-strip-as-part-of-aid-deal-qatar-says-houthis-to-return-to-us-terror-watchlist/ar-AA1n6rGc"))

('Rachel Pannett, Adela Suliman, Shira Rubin, Bryan Pietsch, Sammy Westfall',
 'This live coverage has ended. For the latest updates, please go here .\n\nThe U.S. Navy launched a new wave of missile strikes against Houthi militants in Yemen, U.S. officials said late Wednesday. The attacks targeted about a dozen sites, making the latest round the largest by the U.S. military since President Biden last week approved dozens of strikes in a single night. Qatar said a shipment of medicine and aid had entered Gaza as part of a deal that would transfer medicine to Israeli hostages in exchange for aid to civilians in Gaza.\n\n11:13 PM: Aunt of former hostage accuses Netanyahu of scuttling hostage releases\n\nThe great-aunt of a freed Hamas hostage on Wednesday rebuked Israeli Prime Minister Benjamin Netanyahu for allowing more than 130 remaining hostages to languish in Hamas captivity in Gaza, accusing him of scuttling deals that would have seen more hostages freed for the sake of advancing hi