In [1]:
from graph_db import Neo4jService, Neo4jSetting
import chromadb
from lite_llm import LiteLLMEmbeddingInput, LiteLLMService, LiteLLMSetting, LiteLLMInput
from pydantic import HttpUrl, SecretStr

litellm_setting=LiteLLMSetting(
    url=HttpUrl("http://localhost:9510"),
    token=SecretStr("abc123"),
    model="gemini-2.5-flash",
    frequency_penalty=0.0,
    n=1,
    temperature=0.0,
    top_p=1.0,
    max_completion_tokens=10000,
    dimension=1024,
    embedding_model="qwen3-embedding:0.6b"
)

litellm_service = LiteLLMService(litellm_setting=litellm_setting)

neo4j_service = Neo4jService(
    settings=Neo4jSetting(
        uri="bolt://localhost:17687",
        username="neo4j",
        password="4_Kz1pLYqtmVsxFJED_gxTN8rBcu4oQKAEqw9mm6zUHY"
    )
)

neo4j_service

Neo4jService(settings=Neo4jSetting(uri='bolt://localhost:17687', username='neo4j', password='4_Kz1pLYqtmVsxFJED_gxTN8rBcu4oQKAEqw9mm6zUHY'))

In [None]:
# CREATE VECTOR INDEX description_index
# FOR (d:Description)
# ON (d.embedding)
# OPTIONS {
#   indexConfig: {
#     `vector.dimensions`: 1024,
#     `vector.similarity_function`: 'cosine'
#   }
# };


In [None]:
import json 
from tqdm import tqdm
import numpy as np
import ollama 

def embedding_ollama(text): 
    response = ollama.embed("qwen3-embedding:0.6b", text)
    return response.embeddings[0]

course_code = "dsa2025"
course_results = []
topic_descs = []
for week_number in range(1, 9):
    with open(f'/home/lehoangvu/KLTN/outputs/gpt-4o-mini/{course_code}/week{week_number}_pipeline.json', 'r') as f:
        data = json.load(f)
        
    questions = data['questions']
    tmp = [q['topic']['description'] for q in questions]
    topic_descs.extend(tmp)

for query in tqdm(topic_descs):
    embedding = embedding_ollama(query)
    SIMILARITY_GETTING = """CALL db.index.vector.queryNodes($index_name, $query_nodes, $embedding)
    YIELD node, score WHERE node.type = 'ENTITY'
    MATCH (node)<-[:DESCRIBED]-(e:Entity)
    RETURN e.name AS name, e.type AS type, node.uid AS description_id, node.text AS description, node.chunk_uid AS chunk_id, score ORDER BY score DESC limit $k
    """
    results = await neo4j_service.execute_query(
        cypher=SIMILARITY_GETTING,
        parameters={
            'index_name': 'description_index',
            'embedding': embedding,
            'k': 10,
            'query_nodes': 100,
        },
        output_format='pandas',
    )
    
    list_results = results.to_dict(orient='records')
    course_results.extend([tmp['score'] for tmp in list_results])
    
mean_similarity = np.mean(course_results)  
print(f"Mean Quality Metrics over Course: {mean_similarity}")
std_similarity = np.std(course_results)
print(f"STD Quality Metrics over Course: {std_similarity}")
relevance_ratio = sum(1 for s in course_results if s >= 0.80) / len(course_results)
print(f"Relevance Ratio over Course: {relevance_ratio}")

100%|██████████| 63/63 [00:28<00:00,  2.19it/s]

Mean Quality Metrics over Course: 0.7983775346998184
STD Quality Metrics over Course: 0.03625653219966605
Relevance Ratio over Course: 0.47619047619047616





In [None]:
a = results.to_dict(orient='records')

[{'name': 'Gradient Descent',
  'type': 'technique',
  'description_id': 'a64c332d-ba73-4b6a-890a-3d55e3ee9dc7',
  'description': 'Gradient Descent là một thuật toán tối ưu hóa lặp đi lặp lại, cơ bản được sử dụng để tìm cực tiểu (minimize) một hàm mục tiêu (hàm loss hoặc hàm chi phí) bằng cách di chuyển theo hướng ngược lại của gradient của hàm đó. Thuật toán này cập nhật các tham số mô hình (weights $w$, $W^{[l]}$, $b^{[l]}$, hoặc hệ số $\\beta$) theo công thức chung: $w := w - \\alpha \\cdot \\nabla L$ (hoặc $w := w - \\eta \\cdot \\nabla L$, $W^{[l]} := W^{[l]} - \\alpha \\frac{\\partial L}{\\partial W^{[l]}}$, $b^{[l]} := b^{[l]} - \\alpha \\frac{\\partial L}{\\partial b^{[l]}}$, $\\beta_j := \\beta_j - \\alpha \\cdot \\frac{\\partial J(\\beta)}{\\partial \\beta_j}$), trong đó $\\alpha$ (hoặc $\\eta$) là learning rate và $\\nabla L$ (hoặc $\\frac{\\partial L}{\\partial w}$, $\\frac{\\partial J(\\beta)}{\\partial \\beta_j}$) là gradient của hàm loss $L$ (hoặc hàm chi phí $J$) đối vớ