In [12]:
from bs4 import BeautifulSoup
import requests
import sqlite3
import random
from sentence_transformers import SentenceTransformer
from numpy import dot
from numpy.linalg import norm
import time
import numpy as np
# Site root that relative "/wiki/..." hrefs are prefixed with.
BASE_URL = 'https://en.wikipedia.org'

# A single requests session is reused for every download so the headers
# below are sent with each request.
session = requests.Session()
session.headers["User-Agent"] = "WikiSpeedrun/1.0"
session.headers["Accept-Language"] = "en-US,en;q=0.9"
session.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"

# Sentence-embedding model used to turn page titles into vectors.
model = SentenceTransformer("all-MiniLM-L6-v2")


def get_html_content(url, timeout=10):
    """Download ``url`` through the shared session and return the body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text, or ``None`` when the request fails (connection
        error, timeout, or a 4xx/5xx status). Failures are printed rather
        than raised.
    """
    try:
        response = session.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they share the error path.
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"An error has occured {e}")
        return None

In [13]:
def get_links_and_titles(soup):
    """Collect internal ``/wiki/`` links and their titles from a parsed page.

    Anchors are kept when their ``href`` starts with ``/wiki/`` and does not
    start with one of the excluded namespace prefixes listed below.

    Args:
        soup: Parsed document exposing ``find_all("a")``.

    Returns:
        A ``(titles, links)`` tuple of two parallel lists. Each link is the
        href prefixed with ``BASE_URL``; each title is the anchor's ``title``
        attribute, or ``"No title"`` when the attribute is missing.
    """
    # Link prefixes to skip.
    excluded_prefixes = (
        "/wiki/Special:",
        "/wiki/Help:",
        "/wiki/Wikipedia:",
        "/wiki/Category:",
        "/wiki/File:",
        "/wiki/Template:",
        "/wiki/Portal:",
    )

    links = []
    titles = []
    for a in soup.find_all("a"):
        if not a.has_attr("href"):
            continue
        href = a["href"]
        # str.startswith accepts a tuple and returns True if any prefix matches.
        if href.startswith("/wiki/") and not href.startswith(excluded_prefixes):
            links.append(BASE_URL + href)
            titles.append(a.get("title", "No title"))
    return titles, links

In [14]:
# Baseline: score every link title on the start page against the target title
# using a per-row cosine-similarity loop, and time the embedding + scoring step.
start_url = "https://en.wikipedia.org/wiki/Computer_programming"
end_url = "https://en.wikipedia.org/wiki/Potato"

html = get_html_content(start_url)
# get_html_content returns None on failure; stop with a clear message instead
# of handing None to the parser.
if html is None:
    raise RuntimeError(f"Could not download start page: {start_url}")
soup = BeautifulSoup(html, 'html.parser')
# Last URL segment with underscores as spaces, e.g. ".../wiki/Potato" -> "Potato".
end_title = end_url.split("/wiki/")[-1].replace("_", " ")
titles, links = get_links_and_titles(soup)

# perf_counter is the clock intended for measuring short intervals.
start_time = time.perf_counter()
end_embed = model.encode([end_title], convert_to_numpy=True)[0]

embeds = model.encode(titles, convert_to_numpy=True)
# The target's norm is identical for every row, so compute it once.
end_embed_norm = norm(end_embed)
cosine_similarities = [dot(embed, end_embed) / (norm(embed) * end_embed_norm) for embed in embeds]

end_time = time.perf_counter()
print(f"Time taken to compute embeddings and similarities: {end_time - start_time} seconds")

Time taken to compute embeddings and similarities: 0.23340511322021484 seconds


In [21]:
# Vectorised variant: sanitise and unit-normalise the embeddings, then score
# all link titles against the target with a single matrix-vector product.
start_url = "https://en.wikipedia.org/wiki/Computer_programming"
end_url = "https://en.wikipedia.org/wiki/Potato"

html = get_html_content(start_url)
# get_html_content returns None on failure; stop with a clear message instead
# of handing None to the parser.
if html is None:
    raise RuntimeError(f"Could not download start page: {start_url}")
soup = BeautifulSoup(html, 'html.parser')
# Last URL segment with underscores as spaces, e.g. ".../wiki/Potato" -> "Potato".
end_title = end_url.split("/wiki/")[-1].replace("_", " ")
titles, links = get_links_and_titles(soup)

# perf_counter is the clock intended for measuring short intervals.
start_time = time.perf_counter()

end_embed = model.encode([end_title], convert_to_numpy=True)[0].astype(np.float32)
embeds = model.encode(titles, convert_to_numpy=True).astype(np.float32)

# Small constant added to each norm so the divisions below never divide by zero.
eps = 1e-12

# 1. Replace any NaN/inf in the embeddings with 0 before normalising.
end_embed = np.nan_to_num(end_embed, nan=0.0, posinf=0.0, neginf=0.0)
embeds = np.nan_to_num(embeds, nan=0.0, posinf=0.0, neginf=0.0)

# 2. Scale the target vector and each row of `embeds` to (near) unit length.
end_norm = end_embed / (np.linalg.norm(end_embed) + eps)
embeds_norm = embeds / (np.linalg.norm(embeds, axis=1, keepdims=True) + eps)

# 3. With unit-length vectors, cosine similarity is just the dot product.
cosine_similarities = embeds_norm @ end_norm

end_time = time.perf_counter()
print(f"Time taken to compute embeddings and similarities: {end_time - start_time} seconds")

Time taken to compute embeddings and similarities: 0.2331089973449707 seconds


  cosine_similarities = embeds_norm @ end_norm
  cosine_similarities = embeds_norm @ end_norm
  cosine_similarities = embeds_norm @ end_norm


In [20]:
# Sanity check: report if either embedding array holds a non-finite value
# (NaN, +inf or -inf).
if not np.isfinite(end_embed).all():
    print("end_embed has NaN or Inf values")
if not np.isfinite(embeds).all():
    print("embeds has NaN or Inf values")