In [4]:
from pymongo import MongoClient
from sklearn.neighbors import KDTree
import numpy as np


from dotenv import load_dotenv
import os
# Load environment variables from the env file one directory above the notebook.
# The code below reads MONGODB_URI and OPENAI_API via os.environ.
load_dotenv("../vars.env")



def fetch_embeddings_from_mongo(expected_dim=1536):
    """Load one ``[embedding, url]`` pair per distinct article URL from MongoDB.

    Connects with the URI in the ``MONGODB_URI`` environment variable and reads
    the ``articles`` collection of the ``news`` database.

    :param expected_dim: Required length of each ``semantic_embedding`` vector.
        Documents whose embedding is missing or has a different length are
        reported (their URL is printed) and skipped, because mixing vector
        lengths makes the later dot-product comparison fail.
    :return: List of ``[embedding, url]`` lists, in cursor order, keeping only
        the first valid document seen for each URL.
    """
    uri = os.environ.get("MONGODB_URI")
    client = MongoClient(uri)
    try:
        # Connectivity check is best-effort: a failure is reported but does not
        # abort here; the query below will raise if the server is unreachable.
        try:
            client.admin.command('ping')
            print("Pinged your deployment. You successfully connected to MongoDB!")
        except Exception as e:
            print(e)

        collection = client["news"]["articles"]

        # Only the two fields used below are requested from the server.
        projection = {"_id": 0, "url": 1, "semantic_embedding": 1}

        embeddings = []
        urls = set()
        for doc in collection.find({}, projection):
            url = doc.get('url')
            vector = doc.get('semantic_embedding')
            if url is None:
                # Nothing to identify or return the article by.
                continue
            if vector is None or len(vector) != expected_dim:
                # Report the malformed document and leave it out of the result.
                print(url)
                continue
            if url not in urls:
                urls.add(url)
                embeddings.append([vector, url])
        return embeddings
    finally:
        # Release the connection pool even if the query fails.
        client.close()



def build_kd_tree(embeddings):
    """Build a scikit-learn ``KDTree`` over a matrix of embedding vectors.

    :param embeddings: 2-D array-like of shape (n_articles, n_dimensions)
        holding only the numeric vectors. The ``[embedding, url]`` pairs
        returned by ``fetch_embeddings_from_mongo`` must be unpacked into a
        plain vector matrix before being passed here.
    :return: A ``KDTree`` with ``leaf_size=40`` using the Euclidean metric.
    """
    # KDTree needs a named metric and does not offer cosine distance.
    # NOTE(review): Euclidean ranking matches cosine ranking only if the stored
    # vectors are unit length; confirm that holds for these embeddings.
    return KDTree(embeddings, leaf_size=40, metric='euclidean')



def find_closest_article(embedding, article_embeddings):
    """Return the URL of the stored article most similar to ``embedding``.

    :param embedding: Query vector.
    :param article_embeddings: Iterable of entries where index 0 is the article
        vector and index 1 is its URL.
    :return: URL of the highest-scoring entry according to
        ``similarity_score``; on ties the earliest entry wins. Returns
        ``False`` when ``article_embeddings`` is empty.
    """
    closest_url = False   # returned unchanged when there are no candidates
    closest_score = None  # None means "no candidate scored yet"

    for candidate in article_embeddings:
        # Score each candidate exactly once.
        score = similarity_score(embedding, candidate[0], verbose=False)
        # Track "first candidate" via the score, not the URL's truthiness, so a
        # falsy URL (e.g. an empty string) is not overwritten by a worse match.
        if closest_score is None or score > closest_score:
            closest_score = score
            closest_url = candidate[1]

    return closest_url



def find_closest_k_articles(embedding, article_embeddings, k):
    """Rank stored articles by similarity to ``embedding`` and keep the best ``k``.

    :param embedding: Query vector.
    :param article_embeddings: Iterable of entries where index 0 is the article
        vector and index 1 is its URL.
    :param k: Number of leading results to keep (applied as a slice).
    :return: Tuple ``(urls, scores)`` of two parallel lists ordered from the
        highest similarity score downwards.
    """
    # Score every candidate, pairing each score with the article's URL.
    scored = [
        (similarity_score(embedding, entry[0], verbose=False), entry[1])
        for entry in article_embeddings
    ]

    # Highest score first; the sort is stable, so ties keep their input order.
    ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
    best = ranked[:k]

    urls = [pair[1] for pair in best]
    scores = [pair[0] for pair in best]
    return urls, scores



from openai import OpenAI
# OpenAI API key taken from the OPENAI_API environment variable (None if unset);
# get_embedding passes it to the OpenAI client.
openai_api = os.environ.get("OPENAI_API")

def get_embedding(text, engine = 'text-embedding-ada-002'):
    """
    Embed a piece of text through OpenAI's embeddings endpoint.

    A new client is built on every call using the module-level ``openai_api``
    key, and newlines in the text are replaced by spaces before the request.

    :param text: String to embed.
    :param engine: Name of the embedding model passed to the API.
    :return: The embedding of the first (only) item in the response.
    """
    api_client = OpenAI(api_key=openai_api)

    single_line = text.replace("\n", " ")
    response = api_client.embeddings.create(input=[single_line], model=engine)
    first_item = response.data[0]
    return first_item.embedding



def similarity_score(x, y, verbose = True):
    """Cosine similarity between two vectors.

    :param x: First vector (any array-like accepted by ``np.asarray``).
    :param y: Second vector, same length as ``x``.
    :param verbose: When true, print the score to four decimal places.
    :return: ``x·y / (|x| |y|)``. If either vector has zero norm (including
        empty input) the similarity is undefined; ``0.0`` is returned instead
        of NaN so that callers sorting by score keep a well-defined order.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    if denominator == 0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        sim_score = 0.0
    else:
        sim_score = x.T@y / denominator
    if verbose:
        print(f"Similarity between the two embeddings is: {sim_score:.4f}")
    return sim_score



def find_related_articles(query, k):
    """Print the ``k`` stored articles most similar to a free-text query.

    The query is embedded with ``get_embedding`` and compared against the
    module-level ``article_embeddings`` list; one ``URL … , SCORE …`` line is
    printed per hit, best match first. Nothing is returned.

    :param query: Text to search for.
    :param k: Number of results to print.
    """
    embedded_query = get_embedding(query)
    ranked_urls, ranked_scores = find_closest_k_articles(
        embedded_query, article_embeddings, k
    )
    for hit in zip(ranked_urls, ranked_scores):
        url, score = hit
        print(f"URL: {url} , SCORE: {score}")



# Load every stored [embedding, url] pair once; find_related_articles reads this
# module-level list on each query.
article_embeddings = fetch_embeddings_from_mongo()


Pinged your deployment. You successfully connected to MongoDB!


In [5]:
# Embed a short topical query and print the URL of the single best-matching
# stored article.
query = "The economy"
query_embedding = get_embedding(query)

u = find_closest_article(query_embedding, article_embeddings)
print(u)

https://www.msn.com/en-us/money/markets/this-record-breaking-market-just-keeps-going-higher-and-higher-heres-why/ar-BB1gXTTG


In [6]:
# Same query as the previous cell, now printing the five best matches with
# their similarity scores.
query = "The economy"
find_related_articles(query, 5)

URL: https://www.msn.com/en-us/money/markets/this-record-breaking-market-just-keeps-going-higher-and-higher-heres-why/ar-BB1gXTTG , SCORE: 0.8154835962439331
URL: https://www.msn.com/en-us/money/companies/as-kansas-leaders-tout-successful-year-in-economic-development-workers-are-too-few/ar-AA1mgCmG , SCORE: 0.8101414616747808
URL: https://www.msn.com/en-us/money/markets/s-p-500-closes-at-a-new-all-time-high-as-fresh-data-drives-optimism-for-rate-cuts/ar-BB1gXT2g , SCORE: 0.8095715629212235
URL: https://www.msn.com/en-us/money/markets/top-economist-steve-hanke-says-stocks-will-drop-a-recession-will-hit-and-inflation-will-sink-below-2-this-year/ar-AA1naUjZ , SCORE: 0.8033216799882013
URL: https://www.msn.com/en-us/money/markets/morning-bid-americas-china-chill-ahead-of-holiday-retail-readout/ar-AA1n91mI , SCORE: 0.8032214782009546


In [7]:
# Query naming a specific person; prints the five best matches.
query = "Secretary of State Anthony Blinken"
find_related_articles(query, 5)

URL: https://www.msn.com/en-us/news/world/blinken-senior-chinese-official-discuss-n-korea-taiwan-russias-war-in-ukraine/ar-AA1mTpit , SCORE: 0.86952086087249
URL: https://www.msn.com/en-us/news/world/blinken-meets-with-netanyahu-in-hopes-of-stemming-israel-gaza-conflict-s-spread/ar-AA1mFSV7 , SCORE: 0.8542217001010897
URL: https://www.msn.com/en-us/news/world/israel-hamas-war-blinken-heads-to-middle-east-for-gaza-talks/ar-AA1mv3G8 , SCORE: 0.8395089521868522
URL: https://www.msn.com/en-us/news/world/blinken-on-diplomatic-push-in-israel-as-it-says-gaza-war-to-continue-through-2024/ar-AA1mFbuW , SCORE: 0.8386108620424871
URL: https://www.msn.com/en-us/news/world/blinken-promises-ukraine-enduring-us-support-in-war-with-russia/ar-AA1n4GIW , SCORE: 0.8351010974565222


In [8]:
# Abstract-phrase query; prints the five best matches.
query = "The fall of the West"
find_related_articles(query, 5)

URL: https://www.msn.com/en-us/news/opinion/israel-s-war-on-gaza-and-the-west-s-credibility-crisis/ar-AA1n3HOO , SCORE: 0.809326834478905
URL: https://www.msn.com/en-us/news/world/ending-us-war-aid-will-bring-a-big-crisis-worse-than-just-a-weakened-ukraine-zelenskyy-warns/ar-AA1n8UWk , SCORE: 0.800653842531181
URL: https://www.msn.com/en-us/news/world/russia-failed-to-capitalize-on-victories-as-ukraine-war-hits-stalemate-uk/ar-AA1n5b4G , SCORE: 0.8004979420269253
URL: https://www.msn.com/en-us/news/world/russian-state-tv-pundit-warns-life-keeps-getting-worse-amid-ukraine-war/ar-AA1mXrPW , SCORE: 0.8000145406880936
URL: https://www.msn.com/en-us/news/world/our-enemies-will-vanish-an-up-close-look-at-the-russia-ukraine-war-review/ar-AA1mvJGa , SCORE: 0.796763040962815


In [9]:
# One-word query; prints the ten best matches.
# NOTE(review): in the recorded output the top hit is a recipes page, so very
# short queries appear to match loosely; worth confirming.
query = "Sports"
find_related_articles(query, 10)

URL: https://www.msn.com/en-us/foodanddrink/recipes/38-game-day-potluck-recipes/ar-AA1j0Dd0 , SCORE: 0.7936452971400206
URL: https://www.msn.com/en-us/sports/soccer/asian-cup-holds-moments-silence-for-israel-hamas-war-victims-ahead-of-palestinian-teams-game/ar-AA1mXqV2 , SCORE: 0.7909492692723754
URL: https://www.msn.com/en-us/sports/other/arizona-digest-2024-wnba-all-star-tickets-on-sale-jan-30-for-phoenix-event/ar-AA19RWb9 , SCORE: 0.7821994963269334
URL: https://www.msn.com/en-us/news/world/part-of-that-war-machine-hurting-my-country-and-my-people-says-ukrainian-tennis-star-of-russian-and-belarusian-opponents/ar-BB1gX8Zl , SCORE: 0.7786099994427168
URL: https://www.msn.com/en-us/news/world/turkey-detains-israeli-soccer-player-for-displaying-gaza-war-message/ar-AA1mZphV , SCORE: 0.7773383727820247
URL: https://www.msn.com/en-us/news/world/reuters-news-schedule-at-1000-pm-gmt600-am-sgt/ar-BB1gVN7x , SCORE: 0.7752082868783389
URL: https://www.msn.com/en-us/news/us/after-year-of-culture

In [10]:
# Long, multi-part question used as the query; prints the ten best matches.
query = "If Donald Trump wins in 2024, what will happen to Israel and Gaza? In particular, how will American involvement change?"
find_related_articles(query, 10)

URL: https://www.msn.com/en-us/news/politics/8-questions-for-the-start-of-what-will-be-a-turbulent-election-2024/ar-AA1mCGtP , SCORE: 0.8340842021721607
URL: https://www.msn.com/en-us/news/opinion/ending-israels-gaza-operation-is-also-the-surest-way-to-avoid-a-regional-war/ar-AA1n8Du7 , SCORE: 0.8273386940975127
URL: https://www.msn.com/en-us/news/politics/trump-vs-biden-former-president-retakes-lead-in-2024-election-poll-but-sees-lower-support-from-independent-voters/ar-AA1n9gy8 , SCORE: 0.8264759484855435
URL: https://www.msn.com/en-us/news/politics/2024-presidential-election-race-will-be-unlike-any-other-strategists-say/ar-AA1mjV9l , SCORE: 0.8215862416954768
URL: https://www.msn.com/en-us/news/world/after-100-days-israel-hamas-war-threatens-to-spill-beyond-gaza-disrupt-global-trade/ar-AA1mW1ZP , SCORE: 0.8184613228697036
URL: https://www.msn.com/en-us/news/politics/with-asa-hutchinson-out-of-the-race-these-are-the-major-2024-presidential-candidates/ar-AA17x678 , SCORE: 0.8180530792

In [11]:
# Query combining two topics; prints the three best matches.
query = "The War in the Middle East and its consequences on the global economy"
find_related_articles(query, 3)

URL: https://www.msn.com/en-us/money/markets/qatari-finance-minister-says-gaza-war-to-slow-middle-east-economies/ar-AA1nb9MM , SCORE: 0.8338022737901251
URL: https://www.msn.com/en-us/news/world/after-100-days-israel-hamas-war-threatens-to-spill-beyond-gaza-disrupt-global-trade/ar-AA1mW1ZP , SCORE: 0.8251484015762895
URL: https://www.msn.com/en-us/news/world/iran-says-attacks-by-its-allies-won-t-stop-until-israel-s-war-in-gaza-ends/ar-AA1n8TN2 , SCORE: 0.8217586811895186


In [12]:
# Query for a named organisation; prints only the single best match.
# NOTE(review): the recorded hit is a generic news-schedule page, so presumably
# the collection holds no article on this subject; verify.
query = "MIT Media Lab"
find_related_articles(query, 1)

URL: https://www.msn.com/en-us/news/world/reuters-news-schedule-at-1000-pm-gmt600-am-sgt/ar-BB1gVN7x , SCORE: 0.7719512504008126


In [13]:
# Factual question used as the query; prints the three best matches.
query = "How many Russians have died in the war in Ukraine so far?"
find_related_articles(query, 3)

URL: https://www.msn.com/en-us/news/world/russias-losses-in-ukraine-as-of-january-18-around-800-troops-and-over-40-uavs/ar-AA1nasDg , SCORE: 0.8784565004048006
URL: https://www.msn.com/en-us/news/world/russias-intense-attacks-on-ukraine-has-sharply-increased-civilian-casualties-in-december-un-says/ar-AA1n5IMR , SCORE: 0.863747088928059
URL: https://www.msn.com/en-us/news/world/thursday-january-4-russia-s-war-on-ukraine-news-and-information-from-ukraine/ar-AA1mudbf , SCORE: 0.8633812131288521


In [14]:
# Very generic one-word query; prints the three best matches.
# NOTE(review): the recorded output lists the same headline under two different
# URLs with an identical score, so de-duplicating by URL alone evidently does
# not remove re-published copies of an article.
query = "News"
find_related_articles(query, 3)

URL: https://www.msn.com/en-us/news/world/reuters-news-schedule-at-1000-pm-gmt600-am-sgt/ar-BB1gVN7x , SCORE: 0.8184959288543778
URL: https://www.msn.com/en-us/news/world/global-impact-earthquake-plane-collision-not-best-start-to-year-for-japan-as-it-faces-pressure-at-home-and-abroad/ar-AA1mYQyU , SCORE: 0.7908025578868985
URL: https://www.msn.com/en-us/news/world/global-impact-earthquake-plane-collision-not-best-start-to-year-for-japan-as-it-faces-pressure-at-home-and-abroad/ar-AA1mZ7MS , SCORE: 0.7908025578868985
