In [12]:
from datasets import load_dataset, concatenate_datasets, DatasetDict

subjects = [
    "algebra",
    "counting_and_probability",
    "geometry",
    "intermediate_algebra",
    "number_theory",
    "prealgebra",
    "precalculus"
]

datasets = [load_dataset("EleutherAI/hendrycks_math", subject) for subject in subjects]

train_datasets = [ds["train"] for ds in datasets]
test_datasets = [ds["test"] for ds in datasets]

combined_train = concatenate_datasets(train_datasets)
combined_test = concatenate_datasets(test_datasets)

combined_dataset = DatasetDict({
    "train": combined_train,
    "test": combined_test
})


combined_dataset["train"] = combined_dataset["train"].shuffle(seed=42)
combined_dataset["test"] = combined_dataset["test"].shuffle(seed=42)
combined_dataset["train"][0]


{'problem': 'What is the number of units in the distance between $(2,5)$ and $(-6,-1)$?',
 'level': 'Level 2',
 'type': 'Algebra',
 'solution': 'We use the distance formula: $\\sqrt{(-6 - 2)^2 + (-1 - 5)^2},$ so then we find that $\\sqrt{64 + 36} = \\boxed{10}$.\n\n- OR -\n\nWe note that the points $(2, 5)$, $(-6, -1)$, and $(2, -1)$ form a right triangle with legs of length 6 and 8. This is a Pythagorean triple, so the length of the hypotenuse must be $\\boxed{10}$.'}

In [57]:
import wikipedia

topics = [
    "Algebra", "Calculus", "Derivative", "Integral",
    "Matrix (mathematics)", "Probability", "Statistics",
    "Geometry", "Trigonometry", "Number theory", "linear algebra", "Linear Regression"
]

wiki_corpus = []

for topic in topics:
    try:
        summary = wikipedia.page(topic, auto_suggest=False).content
        wiki_corpus.append({"title": topic, "content": summary})
        print(f"Collected: {topic}")
    except Exception as e:
        print(f"Skipped {topic}: {e}")

print(len(wiki_corpus), "articles collected.")


Collected: Algebra
Collected: Calculus
Collected: Derivative
Collected: Integral
Collected: Matrix (mathematics)
Collected: Probability
Collected: Statistics
Collected: Geometry
Collected: Trigonometry
Collected: Number theory
Collected: linear algebra
Collected: Linear Regression
12 articles collected.


In [58]:
wiki_corpus[2]

{'title': 'Derivative',
 'content': 'In mathematics, the derivative is a fundamental tool that quantifies the sensitivity to change of a function\'s output with respect to its input. The derivative of a function of a single variable at a chosen input value, when it exists, is the slope of the tangent line to the graph of the function at that point. The tangent line is the best linear approximation of the function near that input value. The derivative is often described as the instantaneous rate of change, the ratio of the instantaneous change in the dependent variable to that of the independent variable. The process of finding a derivative is called differentiation.\nThere are multiple different notations for differentiation. Leibniz notation, named after Gottfried Wilhelm Leibniz, is represented as the ratio of two differentials, whereas prime notation is written by adding a prime mark. Higher order notations represent repeated differentiation, and they are usually denoted in Leibniz 

In [67]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

sentences = [word_tokenize(doc["content"].lower()) for doc in wiki_corpus]

c = Word2Vec(
    sentences,
    vector_size=200,
    window=5,
    min_count=3,
    workers=4,
    sg=1 
)

def doc_vector(model, text):
    words = [w for w in word_tokenize(text.lower()) if w in model.wv]
    if not words:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[words], axis=0)

doc_vectors = np.array([doc_vector(c, doc["content"]) for doc in wiki_corpus])
titles = [doc["title"] for doc in wiki_corpus]
contents = [doc["content"] for doc in wiki_corpus]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/wangyuning/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [60]:
import faiss

d = c.vector_size 
index = faiss.IndexFlatL2(d)
index.add(doc_vectors.astype('float32'))

print(f"Indexed {index.ntotal} documents.")


Indexed 12 documents.


In [76]:
query = "cosine"
query_vec = doc_vector(c, query).astype('float32')

k = 5 
D, I = index.search(np.array([query_vec]), k)

for i, idx in enumerate(I[0]):
    print(f"Rank {i+1}: {titles[idx]} (distance={D[0][i]:.4f})")


Rank 1: Probability (distance=0.1671)
Rank 2: Trigonometry (distance=0.1720)
Rank 3: Number theory (distance=0.1768)
Rank 4: Integral (distance=0.2261)
Rank 5: linear algebra (distance=0.2279)
