Load previously downloaded Wikipedia docs.

In [11]:
import pickle
with open("docs.pickle", "rb") as file:
    docs = pickle.load(file)
print(f"{len(docs)} documents")

288 documents


Split all docs to make them fit as context (or input) of a local llm.

In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
splits = text_splitter.split_documents(docs)
print(f"{len(splits)} splits")

1699 splits


Create embedding vectors for all splits using some Ollama-served llm.

In [29]:
from tqdm.notebook import tqdm
from langchain_community.embeddings import OllamaEmbeddings

embeddings = OllamaEmbeddings(model="llama3")
vecs = []
for split in tqdm(splits):
    vecs.append(embeddings.embed_documents([split])[0])
print(f"embedding space dim: {len(vecs[0])}")

  0%|          | 0/1699 [00:00<?, ?it/s]

embedding space dim: 4096


In [30]:
with open("vecs.pickle", "wb") as file:
    pickle.dump(vecs, file)

In [89]:
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

#clusters = DBSCAN(eps=.5, min_samples=3).fit(vecs)
clusters = KMeans(n_clusters=10).fit(vecs)
labels = clusters.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of clusters: 10
Estimated number of noise points: 0


In [90]:
print([l for l in labels if l > 0])

[6, 6, 8, 3, 3, 6, 3, 3, 6, 8, 7, 5, 8, 6, 6, 9, 6, 7, 8, 7, 6, 7, 6, 9, 8, 9, 9, 6, 6, 6, 6, 4, 7, 3, 6, 5, 7, 9, 8, 7, 8, 8, 8, 1, 9, 8, 3, 7, 5, 8, 3, 6, 4, 6, 3, 5, 4, 4, 5, 9, 9, 1, 6, 5, 8, 5, 6, 7, 9, 5, 9, 5, 6, 8, 8, 9, 6, 6, 8, 8, 7, 9, 7, 9, 8, 9, 3, 3, 3, 5, 5, 6, 9, 5, 9, 7, 7, 7, 5, 6, 7, 3, 9, 4, 7, 3, 3, 5, 7, 6, 6, 1, 7, 4, 7, 3, 3, 7, 6, 3, 3, 7, 7, 3, 3, 5, 4, 3, 4, 6, 3, 4, 7, 7, 5, 7, 7, 7, 9, 6, 7, 9, 7, 7, 7, 5, 8, 4, 7, 4, 6, 4, 4, 6, 9, 6, 9, 7, 6, 6, 7, 3, 6, 3, 5, 7, 7, 7, 7, 6, 2, 6, 9, 8, 6, 8, 6, 4, 7, 3, 4, 3, 7, 8, 4, 3, 3, 9, 7, 7, 9, 9, 6, 9, 3, 7, 7, 6, 7, 6, 7, 3, 4, 9, 1, 1, 1, 1, 9, 6, 6, 7, 7, 3, 5, 4, 7, 3, 7, 3, 3, 7, 7, 1, 7, 9, 4, 4, 7, 7, 7, 6, 8, 7, 6, 4, 7, 5, 9, 1, 4, 2, 4, 3, 6, 6, 9, 4, 4, 7, 7, 4, 5, 9, 6, 9, 9, 9, 9, 9, 9, 6, 9, 9, 9, 8, 7, 8, 9, 4, 7, 5, 9, 8, 3, 4, 4, 4, 7, 9, 7, 9, 8, 6, 6, 7, 7, 9, 6, 7, 7, 5, 6, 9, 9, 3, 3, 7, 7, 7, 7, 9, 1, 9, 3, 7, 7, 7, 3, 7, 4, 4, 6, 4, 4, 7, 9, 3, 3, 7, 3, 3, 6, 4, 7, 7, 7, 9, 4, 4, 4, 3, 4, 

In [62]:
# get back texts for labeled vecs stored in vector store
import chromadb

chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name="wikidocs")
ids=[str(i) for i in range(len(splits))]
collection.add(
    documents=[d.page_content for d in splits],
    embeddings=vecs,
    metadatas=[d.metadata for d in splits],
    ids=ids
)
print(f"{collection.count()} docs added to Chroma")

1699 docs added to Chroma


In [92]:
# retrieve docs
unique_labels = set([l for l in labels if l > 0])
query_embeddings = []
for label in unique_labels:
    ix = list(labels).index(label)
    query_embeddings.append(vecs[ix])

In [105]:
representatives = collection.query(
    query_embeddings=query_embeddings,
    n_results=5,
    include=["documents"]
)

5


In [107]:
# ask LLM for single term/tag
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

llm = Ollama(model="llama3")
prompt = ChatPromptTemplate.from_messages([
    ("system", "Summarize in maximum three words. No other output."),
    ("user", "{input}")
])
output_parser = StrOutputParser()
chain = prompt | llm | output_parser

for response in representatives["documents"]:
    for text in response:
        print(chain.invoke({ "input": text }))

Phenomenology Origin
Naturalism History
Phenomenal Understanding
Reality Study
Islamic Philosophy
Logic Definition
Logic Summary
Correct Argument Form
Formal Logic
Logic Argument
Ancient Greek Thinkers
Pythagoras Philosophy
Epistemology & Ontology
Sophist thinkers relativism
Ancient Computing Tools
Doctor of Philosophy
Love of Wisdom
PhD Research Degree
PhD Degree Requirements
Social Science Field
Professionalization Process
Philosophy Professionalization
Social Science Roots
Psychology Integration
New philosophers.
Love of wisdom
Word Origin
Philosophy Basics
Indian Philosophical
Reality Study
System: Three-Word Summary: "Process Started"
A definition
Launch Successful
Launchpad
Three-word summary: Definition Library
Western Philosophy
Political Science
AI Model Collapse
Music Genre Categories
AI Recommendation
Academic Philoso
Professionalization Shift
Political Science History
Human Self-Mastery
Philosophy's evolution


In [99]:
representatives["documents"][8]

['work in the now specializing and restricted field of academic philosophy. These new philosophers functioned in independent departments of philosophy [...] They were making real gains in their research, creating a body of philosophic work that remains central to our study even now. These new philosophers also set their own standards for success, publishing in the recognized organs of philosophy that were being founded at the time: The Monist (1890), The International Journal of Ethics (1890), The Philosophical Review (1892), and The Journal of Philosophy, Psychology,']