## Setup

In [1]:
import csv

In [2]:
with open("../data/review_links_small.csv", newline='') as csv_file:
	reader = csv.DictReader(csv_file)
	content_dict_list = [row_dict for row_dict in reader]

In [3]:
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

load_dotenv()

True

In [4]:
llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")


## Current Content extractor

In [3]:

from dsview.content_loader import UrlLoader
from dsview.content_extraction import ContentExtractor


In [4]:
content_extractor = ContentExtractor(llm)


In [5]:
content_loader = UrlLoader(content_dict_list[0]["content_path"])
_, topics, _ = content_extractor.extract_content(content_loader)

In [10]:
topics.subjects

[DataScienceSubject(type='AI Company', name='Microsoft', description='A leading technology company known for its software products and services.'),
 DataScienceSubject(type='Model', name='GraphRAG', description='A data pipeline and transformation suite designed to extract structured data from unstructured text using LLMs.')]

In [11]:
content_loader = UrlLoader(content_dict_list[1]["content_path"])
_, topics, _ = content_extractor.extract_content(content_loader)

In [12]:
topics.subjects

[DataScienceSubject(type='Model', name='Claude 3.5 Sonnet', description='A model developed by Anthropic, known for its high performance across various context lengths.'),
 DataScienceSubject(type='Model', name='Gemini 1.5 Flash', description='A cost-effective model that balances performance and affordability, excelling in medium context tasks.'),
 DataScienceSubject(type='Model', name='Qwen2-72B-Instruct', description='An open-source model from Alibaba that performs well in short and medium context tasks.')]

In [13]:
content_loader = UrlLoader(content_dict_list[2]["content_path"])
_, topics, _ = content_extractor.extract_content(content_loader)

In [14]:
topics.subjects

[DataScienceSubject(type='Organization', name='GitHub', description='A platform for software development and version control using Git.'),
 DataScienceSubject(type='Product', name='GitHub Models', description='A new feature enabling developers to leverage AI models for software development.'),
 DataScienceSubject(type='Product', name='GitHub Copilot', description='An AI-powered code completion tool that assists developers in writing code.')]

## Separate subject llm

In [6]:
from typing import List

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

from dsview.config import load_content_extraction_config



In [7]:
config = load_content_extraction_config()

In [9]:
class DataScienceTopic(BaseModel):
    type: str = Field(
        default=None,
        description="Type of the described topic in the source.",
        enum=config.topics,
    )
    name: str = Field(
        default=None, description="Name of the described topic in the source."
    )
    description: str = Field(
        default=None,
        description=(
            "General and detailed description of the topic, it should not "
            "be a description of the source or how the source "
            "tackle this topic. But the description must be "
            "created using the provided or any prior knowledge."
        ),
    )
    
class TopicsList(BaseModel):
    topics: List[DataScienceTopic]

AttributeError: 'ContentExtractionConfig' object has no attribute 'topics'

In [133]:
topic_prompt = ChatPromptTemplate.from_messages(
	[
		("system", 
		"""You are an expert extraction algorithm specialized on Data Science related subjects.
		Your role is to extract relevant technical topics mentioned in a text.
		If you do not know the value of an attribute asked to extract,
		return null for the attribute's value."""
    ),
		("human", "Please extract informations from this source : {content}"),
	]
)


In [134]:
topic_extractor = topic_prompt | llm.with_structured_output(schema=TopicsList)

In [135]:
topics_list = topic_extractor.invoke({"content": content_dict_list[0]["content_path"]})

In [136]:
topics_list.topics

[DataScienceTopic(type='AI Company', name='Microsoft', description='Microsoft is a multinational technology company known for its software products, including the Windows operating system and Microsoft Office suite, as well as hardware products and cloud services.'),
 DataScienceTopic(type='Model', name='GraphRAG', description='GraphRAG is a framework developed by Microsoft for integrating graph-based reasoning with retrieval-augmented generation (RAG) models, enabling enhanced performance in tasks that require both knowledge retrieval and generative capabilities.'),
 DataScienceTopic(type='Python library', name='PyTorch', description='PyTorch is an open-source machine learning library based on the Torch library, used for applications such as computer vision and natural language processing, and is known for its flexibility and ease of use.'),
 DataScienceTopic(type='Python library', name='Transformers', description='Transformers is a library developed by Hugging Face that provides pre-

In [130]:
len(topics_list.topics[0].description)

196

In [131]:
len(topics_list.topics[1].description)

169

In [22]:
content_loader = UrlLoader(content_dict_list[0]["content_path"])
_, topics, _ = content_extractor.extract_content(content_loader)

In [24]:
topics.subjects

[DataScienceSubject(type='AI Company', name='Microsoft', description='Microsoft is a leading technology company known for its software products, cloud services, and AI research.'),
 DataScienceSubject(type='Model', name='GraphRAG', description='GraphRAG is a data pipeline and transformation suite designed to extract structured data from unstructured text using LLMs.')]

In [26]:
topics.subjects[0].description

'Microsoft is a leading technology company known for its software products, cloud services, and AI research.'

In [13]:
topics.subjects[1].description

'GraphRAG is a data pipeline and transformation suite designed to extract structured data from unstructured text using LLMs.'

In [25]:
len(topics.subjects[0].description)

107

In [14]:
len(topics.subjects[1].description)

123

## New content extractor

In [7]:

from dsview.content_loader import UrlLoader
from dsview.content_extraction import ContentExtractor


In [9]:
content_extractor = ContentExtractor(llm)


In [16]:
content_loader = UrlLoader(content_dict_list[0]["content_path"])
_, content_description, topics, _ = content_extractor.extract_content(content_loader)

In [17]:
content_description

ContentDescription(title='GraphRAG: A Modular RAG System', content_type='Repository', tags=[DataScienceTag(name='Retrieval Augmented Generation')])

In [18]:
topics

[DataScienceEntity(type='Organization', name='Microsoft', description='A multinational technology company known for its software products, including the Windows operating system and Microsoft Office suite.'),
 DataScienceEntity(type='Library', name='GraphRAG', description='A modular graph-based Retrieval-Augmented Generation (RAG) system designed to extract structured data from unstructured text using large language models (LLMs).')]

In [22]:
content_loader = UrlLoader(content_dict_list[1]["content_path"])
_, content_description, topics, _ = content_extractor.extract_content(content_loader)

In [23]:
content_description

ContentDescription(title='LLM Hallucination Index RAG Special', content_type='Blog post', tags=[DataScienceTag(name='Large Language Model'), DataScienceTag(name='Retrieval Augmented Generation'), DataScienceTag(name='Model evaluation')])

In [24]:
topics

[DataScienceEntity(type='AI Company', name='Anthropic', description='Anthropic is an AI safety and research company focused on developing advanced AI systems, including the Claude series of language models.'),
 DataScienceEntity(type='Model', name='claude-3-5-sonnet', description='Claude-3-5-Sonnet is a language model developed by Anthropic, known for its high performance in various context lengths.'),
 DataScienceEntity(type='Model', name='claude-3-opus', description='Claude-3-Opus is another variant of the Claude series by Anthropic, designed for improved performance in language tasks.'),
 DataScienceEntity(type='AI Company', name='Cohere', description='Cohere is an AI company that provides natural language processing models and tools, including the Command-R series.'),
 DataScienceEntity(type='Model', name='command-r-plus', description='Command-R Plus is a language model developed by Cohere, aimed at enhancing natural language understanding.'),
 DataScienceEntity(type='AI Company', 

## New subjects extraction tuning

In [20]:
from typing import List

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

from dsview.config import load_content_extraction_config

In [50]:
config = load_content_extraction_config()


class DataScienceTopic(BaseModel):
    type: str = Field(
        default=None,
        description="Type of the described topic in the source.",
        enum=config.components,
    )
    name: str = Field(
        default=None, description="Name of the described topic in the source."
    )
    description: str = Field(
        default=None,
        description=(
            "General and detailed description of the topic, it should not "
            "be a description of the source or how the source "
            "tackle this topic. But the description must be "
            "created using the provided information or any prior knowledge."
        ),    
    )
    link: str = Field(
		default=None,
		description="If it exists, a link to a source explaining or introducing the topic"
	)


class TopicList(BaseModel):
    entities: List[DataScienceTopic]

entity_prompt = ChatPromptTemplate.from_messages(
	[
		("system", 
		"""You are part of an knowledge management system, you assist Data Scientists in their technical 
		review process. The broader objective is to make it easier for a Data Scientist to track and 
		understand new trends and tools. To this effect a network graphical interface will be created that 
		will present knowledge and relations between sources.

		You will be provided a text about a Data Science or related subject. Your role is to identify the main technical 
		topics that are introduced, explained or mentioned. The objective is to quickly highlight within a source,
		what matter most and what could be used later by a Data Scientist. You should not extract details or specific
		information discussed in a text, the extracted topics must be as generic as possible or describe a specific practical tool. 
  		If the text is introducing a notion or a product, it must be included in the extracted topics. For example, 
		if the text is extracted for a repository main page, the name of the library should be extracted.

		You should ignore topics that are included in the following tags : {}.

		Reduce the number of topics to a minimum. You must extract at most 5 topics that represents the most
		crucial information within the text.  You may return less or none if 
		not enough information seems relevant.
  
    	""".format(", ".join(config.tags))
    ),
		("human", "Please extract topics from this source : {content}"),
	]
)

entity_extractor = entity_prompt | llm.with_structured_output(schema=TopicList)

You are an expert extraction algorithm specialized on Data Science related subjects.
Your role is to extract technical entities discussed in a text, you must only extract the top 5 or less 
most relevant entities that reflect the essential points of the text. In this context, an entity 
is either an organization (governement entity, company, ...) or a technical tool / product (library, model, ...).
You should not consider as entity the following informations : general technical concepts, technical architectures, model families, ... .
You may return no entities if no information fit.

introduced vs mentionned 

In [13]:
from dsview.content_loader import UrlLoader

In [62]:
content_loader = UrlLoader("https://github.com/pretzelai/pretzelai")
content_loader.load()

In [63]:
entities_list = entity_extractor.invoke({"content": content_loader.content, "links": content_loader.links})

In [64]:
entities_list.entities

[DataScienceTopic(type='Library', name='PretzelAI', description="A modern, open-source alternative to Jupyter Notebooks that enhances Jupyter's capabilities with features like AI code generation, inline tab completion, and a sidebar for AI interaction.", link='https://github.com/pretzelai/pretzelai'),
 DataScienceTopic(type='Platform', name='Pretzel', description='An improved version of Jupyter that allows for easy switching from Jupyter with support for existing configurations and extensions.', link='https://pretzelai.app'),
 DataScienceTopic(type='Concept', name='AI Code Generation', description='A feature that allows users to generate and edit code using AI, enhancing productivity and coding efficiency within the notebook environment.', link=None),
 DataScienceTopic(type='Concept', name='Inline Tab Completion', description='A functionality that provides suggestions for code completion as the user types in a cell, improving coding speed and accuracy.', link=None),
 DataScienceTopic(t

In [59]:
content_loader = UrlLoader("https://github.blog/news-insights/product-news/introducing-github-models/")
content_loader.load()

In [60]:
entities_list = entity_extractor.invoke({"content": content_loader.content, "links": content_loader.links})

In [61]:
entities_list.entities

[DataScienceTopic(type='Library', name='GitHub Models', description='A platform enabling developers to leverage AI models for building applications directly within GitHub.', link=None),
 DataScienceTopic(type='Model', name='Llama 3.1', description="A language model that can be accessed and tested within GitHub's model playground.", link=None),
 DataScienceTopic(type='Model', name='GPT-4o', description='An advanced language model suitable for building multimodal applications.', link=None),
 DataScienceTopic(type='Model', name='Mistral Large 2', description='A language model known for its low latency performance.', link=None),
 DataScienceTopic(type='Platform', name='Codespaces', description='A cloud-based development environment that allows developers to experiment with AI models and integrate them into their projects.', link=None)]

In [51]:
content_loader = UrlLoader("https://github.com/microsoft/graphrag")
content_loader.load()

In [54]:
entities_list = entity_extractor.invoke({"content": content_loader.content, "links": content_loader.links})

In [55]:
entities_list.entities

[DataScienceTopic(type='Library', name='GraphRAG', description='A modular graph-based Retrieval-Augmented Generation (RAG) system designed to enhance the reasoning capabilities of large language models (LLMs) by utilizing knowledge graph memory structures.', link=None),
 DataScienceTopic(type='Concept', name='Retrieval-Augmented Generation (RAG)', description='A framework that combines retrieval of relevant information from a dataset with generative capabilities of language models to produce more accurate and contextually relevant outputs.', link=None),
 DataScienceTopic(type='Concept', name='Knowledge Graph', description='A structured representation of information that captures relationships between entities, which can be used to enhance the performance of language models.', link=None),
 DataScienceTopic(type='Concept', name='Prompt Tuning', description='A technique for optimizing the input prompts given to language models to improve their performance on specific tasks.', link=None),


In [336]:
content_loader = UrlLoader("https://www.rungalileo.io/hallucinationindex")
content_loader.load()

In [337]:
entities_list = entity_extractor.invoke({"content": content_loader.content, "links": content_loader.links})

In [338]:
entities_list.entities

[DataScienceComponent(type='Concept', name='LLM Hallucination', description="The phenomenon where language models generate incorrect or nonsensical information, often referred to as 'hallucinations'. This is a critical area of study in evaluating the reliability of AI models.", link=None),
 DataScienceComponent(type='Methodology', name='Retrieval-Augmented Generation (RAG)', description='A method that enhances the performance of language models by incorporating additional context from external sources, improving the accuracy and relevance of generated responses.', link=None),
 DataScienceComponent(type='Model', name='Claude 3.5 Sonnet', description='A language model developed by Anthropic, noted for its high performance across various context lengths and tasks, particularly in minimizing hallucinations.', link=None),
 DataScienceComponent(type='Model', name='Gemini 1.5 Flash', description='A cost-effective language model that balances performance and affordability, recognized for its s

In [65]:
content_loader = UrlLoader("https://github.com/igrek51/wat")
content_loader.load()

In [66]:
entities_list = entity_extractor.invoke({"content": content_loader.content, "links": content_loader.links})

In [67]:
entities_list.entities

[DataScienceTopic(type='Library', name='wat-inspector', description='A powerful inspection tool for Python that allows users to examine unknown objects at runtime, providing insights into their type, value, methods, and more.', link=None),
 DataScienceTopic(type='Concept', name='Dynamic Typing', description='A feature of programming languages like Python where the type of a variable is determined at runtime, making it often challenging to ascertain the type of an object.', link=None),
 DataScienceTopic(type='Concept', name='Object Inspection', description='The process of examining the properties and methods of an object in programming, which can aid in understanding and debugging code.', link=None),
 DataScienceTopic(type='Concept', name='Python Interpreter', description='An environment where Python code is executed, allowing for interactive coding and debugging.', link=None),
 DataScienceTopic(type='Concept', name='Code Debugging', description='The process of identifying and removing 

In [68]:
content_loader = UrlLoader("https://docs.vllm.ai/en/latest/")
content_loader.load()

In [69]:
entities_list = entity_extractor.invoke({"content": content_loader.content, "links": content_loader.links})

In [70]:
entities_list.entities

[DataScienceTopic(type='Library', name='vLLM', description='vLLM is a fast and easy-to-use library for LLM inference and serving, optimized for high throughput and efficient memory management.', link=None),
 DataScienceTopic(type='Concept', name='PagedAttention', description='A memory management technique used in vLLM to efficiently handle attention key and value memory.', link=None),
 DataScienceTopic(type='Concept', name='Quantization', description='A technique used in vLLM to reduce the model size and improve inference speed, supporting various formats like GPTQ, AWQ, INT4, INT8, and FP8.', link=None),
 DataScienceTopic(type='Concept', name='Continuous Batching', description='A method implemented in vLLM to enhance throughput in LLM inference by batching incoming requests continuously.', link=None),
 DataScienceTopic(type='Platform', name='OpenAI Compatible Server', description="A server setup in vLLM that is compatible with OpenAI's API, allowing for seamless integration and deploy

In [71]:
content_list = content_loader.content.split(" ")

In [209]:
any([word.startswith("http") for word in content_list])

False

In [158]:
import requests

In [161]:
response = requests.get("https://openai.com/index/hello-gpt-4o")

In [162]:
response.status_code

403