In [1]:
import asyncio
import logging
from typing import Union

from cognee.modules.cognify.config import get_cognify_config
from cognee.shared.data_models import KnowledgeGraph
from cognee.modules.data.models import Dataset, Data
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
from cognee.modules.data.methods import get_datasets, get_datasets_by_name
from cognee.modules.pipelines.tasks.Task import Task
from cognee.modules.pipelines import run_tasks, run_tasks_parallel
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_default_user
from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status
from cognee.tasks import chunk_extract_summary, \
    chunk_naive_llm_classifier, \
    chunk_remove_disconnected, \
    infer_data_ontology, \
    save_chunks_to_store, \
    chunk_update_check, \
    chunks_into_graph, \
    source_documents_to_chunks, \
    check_permissions_on_documents, \
    classify_documents

In [2]:
# Job posting used as demo input; it is ingested together with the sample CVs
# by the `cognee.add` cell. The stray "Candidate CVs" heading that used to
# trail the text was a copy-paste artifact and not part of the posting.
job_position = """Senior Data Scientist (Machine Learning)

Company: TechNova Solutions
Location: San Francisco, CA

Job Description:

TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights.

Responsibilities:

Develop and implement advanced machine learning algorithms and models.
Analyze large, complex datasets to extract meaningful patterns and insights.
Collaborate with cross-functional teams to integrate predictive models into products.
Stay updated with the latest advancements in machine learning and data science.
Mentor junior data scientists and provide technical guidance.
Qualifications:

Master’s or Ph.D. in Data Science, Computer Science, Statistics, or a related field.
5+ years of experience in data science and machine learning.
Proficient in Python, R, and SQL.
Experience with deep learning frameworks (e.g., TensorFlow, PyTorch).
Strong problem-solving skills and attention to detail.
"""


In [3]:
# Sample candidate CV used as demo input for the `cognee.add` cell.
# NOTE(review): despite the `job_` prefix this variable holds a CV, not a job
# posting; the text itself is labelled "CV 1: Relevant".
job_1 = """
CV 1: Relevant
Name: Dr. Emily Carter
Contact Information:

Email: emily.carter@example.com
Phone: (555) 123-4567
Summary:

Senior Data Scientist with over 8 years of experience in machine learning and predictive analytics. Expertise in developing advanced algorithms and deploying scalable models in production environments.

Education:

Ph.D. in Computer Science, Stanford University (2014)
B.S. in Mathematics, University of California, Berkeley (2010)
Experience:

Senior Data Scientist, InnovateAI Labs (2016 – Present)
Led a team in developing machine learning models for natural language processing applications.
Implemented deep learning algorithms that improved prediction accuracy by 25%.
Collaborated with cross-functional teams to integrate models into cloud-based platforms.
Data Scientist, DataWave Analytics (2014 – 2016)
Developed predictive models for customer segmentation and churn analysis.
Analyzed large datasets using Hadoop and Spark frameworks.
Skills:

Programming Languages: Python, R, SQL
Machine Learning: TensorFlow, Keras, Scikit-Learn
Big Data Technologies: Hadoop, Spark
Data Visualization: Tableau, Matplotlib
"""

In [4]:
# Sample candidate CV used as demo input for the `cognee.add` cell.
# NOTE(review): despite the `job_` prefix this variable holds a CV, not a job
# posting; the text itself is labelled "CV 2: Relevant".
job_2 = """
CV 2: Relevant
Name: Michael Rodriguez
Contact Information:

Email: michael.rodriguez@example.com
Phone: (555) 234-5678
Summary:

Data Scientist with a strong background in machine learning and statistical modeling. Skilled in handling large datasets and translating data into actionable business insights.

Education:

M.S. in Data Science, Carnegie Mellon University (2013)
B.S. in Computer Science, University of Michigan (2011)
Experience:

Senior Data Scientist, Alpha Analytics (2017 – Present)
Developed machine learning models to optimize marketing strategies.
Reduced customer acquisition cost by 15% through predictive modeling.
Data Scientist, TechInsights (2013 – 2017)
Analyzed user behavior data to improve product features.
Implemented A/B testing frameworks to evaluate product changes.
Skills:

Programming Languages: Python, Java, SQL
Machine Learning: Scikit-Learn, XGBoost
Data Visualization: Seaborn, Plotly
Databases: MySQL, MongoDB
"""

In [5]:
# Sample candidate CV used as demo input for the `cognee.add` cell.
# NOTE(review): despite the `job_` prefix this variable holds a CV, not a job
# posting; the text itself is labelled "CV 3: Relevant".
job_3 = """
CV 3: Relevant
Name: Sarah Nguyen
Contact Information:

Email: sarah.nguyen@example.com
Phone: (555) 345-6789
Summary:

Data Scientist specializing in machine learning with 6 years of experience. Passionate about leveraging data to drive business solutions and improve product performance.

Education:

M.S. in Statistics, University of Washington (2014)
B.S. in Applied Mathematics, University of Texas at Austin (2012)
Experience:

Data Scientist, QuantumTech (2016 – Present)
Designed and implemented machine learning algorithms for financial forecasting.
Improved model efficiency by 20% through algorithm optimization.
Junior Data Scientist, DataCore Solutions (2014 – 2016)
Assisted in developing predictive models for supply chain optimization.
Conducted data cleaning and preprocessing on large datasets.
Skills:

Programming Languages: Python, R
Machine Learning Frameworks: PyTorch, Scikit-Learn
Statistical Analysis: SAS, SPSS
Cloud Platforms: AWS, Azure
"""

In [6]:
# Sample candidate CV used as demo input for the `cognee.add` cell.
# NOTE(review): despite the `job_` prefix this variable holds a CV, not a job
# posting; the text itself is labelled "CV 4: Not Relevant".
job_4 = """
CV 4: Not Relevant
Name: David Thompson
Contact Information:

Email: david.thompson@example.com
Phone: (555) 456-7890
Summary:

Creative Graphic Designer with over 8 years of experience in visual design and branding. Proficient in Adobe Creative Suite and passionate about creating compelling visuals.

Education:

B.F.A. in Graphic Design, Rhode Island School of Design (2012)
Experience:

Senior Graphic Designer, CreativeWorks Agency (2015 – Present)
Led design projects for clients in various industries.
Created branding materials that increased client engagement by 30%.
Graphic Designer, Visual Innovations (2012 – 2015)
Designed marketing collateral, including brochures, logos, and websites.
Collaborated with the marketing team to develop cohesive brand strategies.
Skills:

Design Software: Adobe Photoshop, Illustrator, InDesign
Web Design: HTML, CSS
Specialties: Branding and Identity, Typography
"""

In [7]:
# Sample candidate CV used as demo input for the `cognee.add` cell.
# NOTE(review): despite the `job_` prefix this variable holds a CV, not a job
# posting; the text itself is labelled "CV 5: Not Relevant".
job_5 = """
CV 5: Not Relevant
Name: Jessica Miller
Contact Information:

Email: jessica.miller@example.com
Phone: (555) 567-8901
Summary:

Experienced Sales Manager with a strong track record in driving sales growth and building high-performing teams. Excellent communication and leadership skills.

Education:

B.A. in Business Administration, University of Southern California (2010)
Experience:

Sales Manager, Global Enterprises (2015 – Present)
Managed a sales team of 15 members, achieving a 20% increase in annual revenue.
Developed sales strategies that expanded customer base by 25%.
Sales Representative, Market Leaders Inc. (2010 – 2015)
Consistently exceeded sales targets and received the 'Top Salesperson' award in 2013.
Skills:

Sales Strategy and Planning
Team Leadership and Development
CRM Software: Salesforce, Zoho
Negotiation and Relationship Building
"""

In [None]:
import cognee
from os import listdir, path

data_path = path.abspath(".data")

results = await cognee.add([job_1, job_2,job_3,job_4,job_5,job_position], "example")

for result in results:
    print(result)

In [9]:
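# NOTE(review): commented-out sketch of Node/Edge/KnowledgeGraph models; it is not executed.
# The pipeline cell below uses the KnowledgeGraph imported from cognee.shared.data_models.
# from enum import Enum, auto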
# from typing import Optional, List, Union, Dict, Any
# from pydantic import BaseModel, Field
# 
# class Node(BaseModel):
#     """Node in a knowledge graph."""
#     id: str
#     name: str
#     type: str
#     description: str
#     properties: Optional[Dict[str, Any]] = Field(None, description = "A dictionary of properties associated with the node.")
# 
# class Edge(BaseModel):
#     """Edge in a knowledge graph."""
#     source_node_id: str
#     target_node_id: str
#     relationship_name: str
#     properties: Optional[Dict[str, Any]] = Field(None, description = "A dictionary of properties associated with the edge.")
# 
# class KnowledgeGraph(BaseModel):
#     """Knowledge graph."""
#     nodes: List[Node] = Field(..., default_factory=list)
#     edges: List[Edge] = Field(..., default_factory=list)

In [10]:
async def run_cognify_pipeline(dataset: Dataset, user: Union[User, None] = None):
    """Run the cognify task pipeline over the documents of ``dataset``.

    Loads the dataset's documents with ``get_dataset_data``, feeds them through
    the task list below via ``run_tasks`` and prints every result the pipeline
    yields.

    Args:
        dataset: Dataset whose ``id`` selects the documents to process.
        user: User handed to the ``check_permissions_on_documents`` task, which
            is configured here with the ``"write"`` permission.

    Returns:
        None. Results are only printed.

    Raises:
        Any exception raised while loading the documents or running the
        pipeline propagates to the caller unchanged (the former
        ``except ... raise error`` wrapper added nothing and was removed).
    """
    data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)

    # No root/parent node is supplied; the tasks that accept one receive None.
    root_node_id = None

    tasks = [
        Task(classify_documents),
        Task(check_permissions_on_documents, user = user, permissions = ["write"]),
        Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),
        # Classify documents and save them as nodes in the graph db, extract text chunks based on the document type.
        Task(source_documents_to_chunks, parent_node_id = root_node_id),
        # Generate knowledge graphs from the document chunks and attach them to chunk nodes.
        Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = "entities", task_config = { "batch_size": 10 }),
        # Find all affected chunks, so we don't process unchanged chunks.
        Task(chunk_update_check, collection_name = "chunks"),
        Task(save_chunks_to_store, collection_name = "chunks"),
        # Remove the obsolete document chunks.
        Task(chunk_remove_disconnected),
    ]

    pipeline = run_tasks(tasks, data_documents)

    async for result in pipeline:
        print(result)

In [None]:
user = await get_default_user()
datasets = await get_datasets_by_name(["example"], user.id)
await run_cognify_pipeline(datasets[0], user)

In [None]:
import os
from cognee.shared.utils import render_graph
from cognee.infrastructure.databases.graph import get_graph_engine
import graphistry

# # Setting an environment variable
# os.environ["GRAPHISTRY_USERNAME"] = placeholder
# os.environ["GRAPHISTRY_PASSWORD"] = placeholder


graphistry.login(username=os.getenv("GRAPHISTRY_USERNAME"), password=os.getenv("GRAPHISTRY_PASSWORD"))

graph_engine = await get_graph_engine()

graph_url = await render_graph(graph_engine.graph)
print(graph_url)

In [None]:
async def search(
    vector_engine,
    collection_name: str,
    query_text: Union[str, None] = None,
    limit: int = 10,
) -> list[dict]:
    """Embed ``query_text`` and run a vector search for it in ``collection_name``.

    The text is embedded with ``vector_engine.embedding_engine.embed_text`` and
    the resulting vector is searched in the table opened through
    ``vector_engine.get_connection()``.

    Args:
        vector_engine: Object exposing ``embedding_engine.embed_text`` and
            ``get_connection`` (both are awaited).
        collection_name: Name of the table to open and search.
        query_text: Text to embed and search for. It is required; the ``None``
            default is kept only so the signature stays backward-compatible.
        limit: Maximum number of rows requested from the search. Defaults to
            10, the value that used to be hard-coded.

    Returns:
        One dict per returned row with the keys ``id`` (the row id converted to
        ``str``), ``payload`` and ``score`` (the row's ``_distance`` value).

    Raises:
        ValueError: If ``query_text`` is ``None``.
    """
    # Previously a missing query was passed on to the embedding engine as [None].
    if query_text is None:
        raise ValueError("query_text must be provided.")

    # embed_text takes a batch of texts; only one is sent, so take the first vector.
    query_vector = (await vector_engine.embedding_engine.embed_text([query_text]))[0]

    connection = await vector_engine.get_connection()
    collection = await connection.open_table(collection_name)

    rows = await collection.vector_search(query_vector).limit(limit).to_pandas()

    return [
        {
            "id": str(row["id"]),
            "payload": row["payload"],
            "score": row["_distance"],
        }
        for row in rows.to_dict("records")
    ]


from cognee.infrastructure.databases.vector import get_vector_engine

vector_engine = get_vector_engine()
results = await search(vector_engine, "entities", "sarah.nguyen@example.com")
for result in results:
    print(result)