In [None]:
from langchain.vectorstores import Qdrant
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

In [None]:
# Initialize the embedding model; the name is kept in a constant so it is easy to swap.
EMBEDDING_MODEL_NAME = "ai-forever/sbert_large_nlu_ru"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Create docs
____

In [None]:
from datasets import load_dataset

# Load the corpus by its dataset identifier.
DATASET_NAME = "RussianNLP/Mixed-Summarization-Dataset"
dataset = load_dataset(DATASET_NAME)
dataset  # last expression: display what was loaded

In [None]:
# Build the list of documents (can be replaced with loading from a file).
def create_docs(
    dataset,
    num_samples=6_000,
    seed=42,
    split="train",
    text_field="text",
    source="wiki_ai",
):
    """Wrap a random sample of one dataset split in ``Document`` objects.

    The split is shuffled with ``seed`` and the first ``num_samples`` rows of
    the shuffled result are converted, one ``Document`` per row.

    Args:
        dataset: Mapping from split name to a dataset object that supports
            ``shuffle(seed=...)``, ``select(indices)``, ``len()`` and
            iteration over dict-like rows (presumably a Hugging Face
            ``DatasetDict`` -- confirm against the loading cell).
        num_samples: Maximum number of rows to convert. Values larger than
            the split are clamped to the split size instead of failing in
            ``select``; ``None`` takes the whole split.
        seed: Seed forwarded to ``shuffle`` so the sample is reproducible.
        split: Name of the split to sample from.
        text_field: Row key whose value becomes ``page_content``.
        source: Value stored under the ``"source"`` metadata key.

    Returns:
        list[Document]: One document per sampled row. ``metadata["page"]`` is
        the row's position within the shuffled sample (0-based), not its
        index in the original dataset.
    """
    shuffled = dataset[split].shuffle(seed=seed)

    # Never ask for more rows than the split holds.
    available = len(shuffled)
    limit = available if num_samples is None else min(num_samples, available)
    sample = shuffled.select(range(limit))

    return [
        Document(
            page_content=row[text_field],
            metadata={"source": source, "page": position},  # position in the sample
        )
        for position, row in enumerate(sample)
    ]

In [None]:
# Number of rows to sample for this run (overrides the create_docs default).
num_samples = 20_000
documents = create_docs(dataset, num_samples=num_samples)
documents[:3]  # preview the first three documents

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split every document into overlapping chunks.
# Separators are tried in order: paragraph break, line break, then a single space.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # upper bound per chunk (characters with the default length function)
    chunk_overlap=200,  # amount shared between neighbouring chunks
    separators=["\n\n", "\n", " "],
)
texts = text_splitter.split_documents(documents)

# Show only the chunk count and a short preview: rendering the full list
# would flood the notebook output for a corpus of this size.
print(f"{len(documents)} documents -> {len(texts)} chunks")
texts[:3]

In [None]:
# Create (or connect to) the Qdrant collection and index the chunks in it.
QDRANT_URL = "http://localhost:6333"  # address of the Qdrant server
COLLECTION_NAME = "ai_documents"

qdrant_db = Qdrant.from_documents(
    documents=texts,
    embedding=embeddings,
    url=QDRANT_URL,
    collection_name=COLLECTION_NAME,
    force_recreate=True,  # recreate the collection if it already exists
)