In [2]:
import importlib

from dotenv import load_dotenv; load_dotenv()
from txtai.embeddings import Embeddings

from lib import db
# Re-import lib.db so edits made to the module on disk take effect without
# restarting the kernel; the reloaded module object is the cell's output.
importlib.reload(db)

<module 'lib.db' from '/Users/dennis/dev/kg1/research/lib/db/__init__.py'>

In [3]:
# Fetch up to 10000 rows of each kind and keep only their "text" field.
# Intermediate row lists get their own names so `chunks` / `docs` are only
# ever bound to lists of text values.
chunk_rows = db.get_text_node_chunks(limit=10000)
chunks = [row["text"] for row in chunk_rows]

doc_rows = db.get_text_nodes(limit=10000)
docs = [row["text"] for row in doc_rows]

In [None]:
# Candidate category labels handed to the graph's topic configuration.
topic_categories = [
    "Society & Culture",
    "Science & Mathematics",
    "Health",
    "Education & Reference",
    "Computers & Internet",
    "Sports",
    "Business & Finance",
    "Entertainment & Music",
    "Family & Relationships",
    "Politics & Government",
]

# Graph settings: edge limit / minimum score plus topic labelling.
graph_settings = {
    "limit": 15,
    "minscore": 0.1,
    "topics": {"categories": topic_categories},
}

# `graph.attribute` is registered under the name "graph"; the expressions below
# call it to expose category / topic / topicrank as query columns.
embeddings_config = {
    "path": "sentence-transformers/all-MiniLM-L6-v2",
    "content": True,
    "functions": [
        {"name": "graph", "function": "graph.attribute"},
    ],
    "expressions": [
        {"name": "category", "expression": "graph(indexid, 'category')"},
        {"name": "topic", "expression": "graph(indexid, 'topic')"},
        {"name": "topicrank", "expression": "graph(indexid, 'topicrank')"},
    ],
    "graph": graph_settings,
}

embeddings = Embeddings(embeddings_config)

# Index every chunk as an (id, text, tags) triple; ids are list positions and
# the third slot is left as None. Kept as a generator so nothing is materialised.
indexed_chunks = ((position, text, None) for position, text in enumerate(chunks))
embeddings.index(indexed_chunks)

In [None]:
# Keep a short handle on the index's graph and show how many topics it holds.
graph = embeddings.graph
len(graph.topics)

In [None]:
# Peek at the first ten topic names (iterating the mapping yields its keys).
list(graph.topics)[:10]

In [None]:
# Print the text of the first hit for the rank-0 entry of one specific topic.
# NOTE(review): the topic name is hard-coded and presumably comes from a
# previous indexing run — confirm it still exists after re-indexing.
topic_query = "select text from txtai where topic = 'llms_llm_i_we' and topicrank = 0"
topic_hits = embeddings.search(topic_query, 2)
print(topic_hits[0]["text"])

In [None]:
# Print the first five topics next to the category stored at the same position
# in graph.categories.
first_topics = list(graph.topics)[:5]
for position, topic_name in enumerate(first_topics):
    print(graph.categories[position], topic_name)

In [None]:
# Similarity query against the index for the term 'LLM', capped at 10 results.
similarity_query = "select * from txtai where similar('LLM')"
embeddings.search(similarity_query, 10)

In [None]:
# Topic names in graph order, used to report each topic's position.
topics = list(graph.topics)

centrality = graph.centrality()

# Report the topic (and its position in `topics`) for the first five node ids.
# NOTE(review): presumably centrality() is ordered most-central first — confirm.
leading_node_ids = list(centrality)[:5]
for node_id in leading_node_ids:
    topic = graph.attribute(node_id, "topic")
    print(f"{topic} ({topics.index(topic)})")

In [15]:
import openai
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, OpenAI

# OpenAI-backed topic representation (gpt-4o, chat mode). It is constructed
# here but not passed to the BERTopic instance below.
client = openai.OpenAI()
representation_model = OpenAI(client, model="gpt-4o", chat=True)

# The model is fitted with the KeyBERT-inspired representation; to use the
# OpenAI one instead, pass representation_model=representation_model.
keybert_representation = KeyBERTInspired()
topic_model = BERTopic(representation_model=keybert_representation)

# Fit on the chunk texts; the call returns two values, unpacked as
# `topics` and `probs`.
topics, probs = topic_model.fit_transform(chunks)

In [16]:
# Display the per-topic summary table of the fitted model (the output below
# has Topic, Count, Name, Representation and Representative_Docs columns).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2364,-1_text_models_attention_llms,"[text, models, attention, llms, llm, model, se...","[Pretty simple, right?\nWe’ll start at the las..."
1,0,493,0_developer_frontend_devops_backend,"[developer, frontend, devops, backend, dev, no...",[- Node.js\n- React\n- Firebase\n- GraphQL\nJa...
2,1,218,1_python_interpreter_syntax_wtfpython,"[python, interpreter, syntax, wtfpython, opera...",[False\n>>> WTF() is WTF() # identities are al...
3,2,172,2_causal_causality_causation_inference,"[causal, causality, causation, inference, obse...","[Given these challenges, how do we combine cau..."
4,3,157,3_utterance_faux_tests_knowledge,"[utterance, faux, tests, knowledge, comprehens...",[To manipulate the likelihood that the speaker...
...,...,...,...,...,...
212,211,11,211_layers_residuals_residual_neural,"[layers, residuals, residual, neural, normaliz...",[There are two ways to place layer normalizati...
213,212,11,212_writers_emacs_shellmaker_toolkit,"[writers, emacs, shellmaker, toolkit, adventur...",[74 | 90 | 2022 | $ rm Important.txt (uh oh) |...
214,213,11,213_notebooklms_notebooklm_notebook_ai,"[notebooklms, notebooklm, notebook, ai, brains...",[NotebookLM: an AI notebook for everyone\nNote...
215,214,10,214_autochain_autochainchainchain_autochainmod...,"[autochain, autochainchainchain, autochainmode...","[If you have experience with LangChain, you al..."


In [17]:
# Show the (word, score) pairs that represent topic 0.
topic_model.get_topic(0)

[('developer', 0.56650937),
 ('frontend', 0.510833),
 ('devops', 0.50750554),
 ('backend', 0.48928258),
 ('dev', 0.4727406),
 ('nodejs', 0.43792847),
 ('github', 0.4359008),
 ('wordpress', 0.40875238),
 ('web', 0.40300047),
 ('development', 0.40279642)]

In [19]:
# Display the per-document table for the chunks: each row pairs a document
# with its assigned topic, that topic's name/representation and a probability.
topic_model.get_document_info(chunks)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,LSP-AI is an open source language server that ...,-1,-1_text_models_attention_llms,"[text, models, attention, llms, llm, model, se...","[Pretty simple, right?\nWe’ll start at the las...",text - models - attention - llms - llm - model...,0.000000,False
1,- VS Code\n- NeoVim\n- Emacs\n- Helix\n- Subli...,140,140_editor_editors_backend_collaborative,"[editor, editors, backend, collaborative, coll...","[Google Docs, or whatever you need. It&#x27;s ...",editor - editors - backend - collaborative - c...,0.797016,False
2,tl;dr LSP-AI abstracts complex implementation ...,140,140_editor_editors_backend_collaborative,"[editor, editors, backend, collaborative, coll...","[Google Docs, or whatever you need. It&#x27;s ...",editor - editors - backend - collaborative - c...,0.848727,False
3,LSP-AI aims to fill this gap by providing a la...,-1,-1_text_models_attention_llms,"[text, models, attention, llms, llm, model, se...","[Pretty simple, right?\nWe’ll start at the las...",text - models - attention - llms - llm - model...,0.000000,False
4,-\nSimplified Plugin Development:\n- LSP-AI ab...,140,140_editor_editors_backend_collaborative,"[editor, editors, backend, collaborative, coll...","[Google Docs, or whatever you need. It&#x27;s ...",editor - editors - backend - collaborative - c...,0.875554,False
...,...,...,...,...,...,...,...,...
9995,"Swyx: Stand-in. [00:59:09]\nGeorge: Well, no, ...",25,25_uber_swyx_api_george,"[uber, swyx, api, george, just, scam, like, li...",[Swyx: it's a scam. [00:39:49]\nGeorge: If the...,uber - swyx - api - george - just - scam - lik...,1.000000,False
9996,George: maybe the Elon style way of thinking a...,-1,-1_text_models_attention_llms,"[text, models, attention, llms, llm, model, se...","[Pretty simple, right?\nWe’ll start at the las...",text - models - attention - llms - llm - model...,0.000000,False
9997,"have. Right, so I don't need to put, you know,...",5,5_ai_opensource_cloud_ibm,"[ai, opensource, cloud, ibm, ibms, developers,...",[I’m now of the opinion that this is the time ...,ai - opensource - cloud - ibm - ibms - develop...,0.709685,False
9998,"Swyx: In terms of merging, like, isn't it, sho...",25,25_uber_swyx_api_george,"[uber, swyx, api, george, just, scam, like, li...",[Swyx: it's a scam. [00:39:49]\nGeorge: If the...,uber - swyx - api - george - just - scam - lik...,0.231928,False


In [20]:
# Render the model's topic visualisation, limited via top_n_topics=100.
# NOTE(review): presumably this keeps the 100 largest topics — confirm.
topic_model.visualize_topics(top_n_topics=100)

In [21]:
# Look up the topics matching the search term "llm" (top_n=5); the call
# returns two values, presumably parallel topic ids and similarity scores.
similar_topics, similarity = topic_model.find_topics("llm", top_n=5)

In [22]:
# Show the (word, score) pairs for the first topic returned by the search above.
topic_model.get_topic(similar_topics[0])

[('chess', 0.4384939),
 ('intentions', 0.41864693),
 ('llms', 0.38681805),
 ('llm', 0.36714536),
 ('conversation', 0.3384717),
 ('intentionality', 0.30149746),
 ('act', 0.27352434),
 ('speech', 0.25560164),
 ('play', 0.25285593),
 ('interpret', 0.24501204)]