# Erstellen der Kollektionen in Qdrant mit einem Sparse-Retrieval-Modell

Hier werden die Daten für die Vektordatenbank vorbereitet und in diese geladen. Als Modell wird hier ein Sparse-Encoder-Modell (Bi-Encoder) verwendet.

- Modell: opensearch-project/opensearch-neural-sparse-encoding-multilingual-v1
- Link: https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-multilingual-v1

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# data files referenced below become readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Base directory of the project data (relative to the current working
# directory). The sub-directories are derived from it so the location is
# defined in exactly one place.
DATA_PATH = "drive/MyDrive/Uni/Master/Masterthesis/Data/"
RAW_DATA_PATH = DATA_PATH + "raw_data/"
PROD_DATA_PATH = DATA_PATH + "topics/"

## Laden der Expertenprofile

In [None]:
import json

# Location of the expert profile file below the data directory.
file_path = f"{DATA_PATH}enhanced_expert_profiles.json"

# Parse the complete JSON document in one step.
with open(file_path, mode="r", encoding="utf-8") as profile_file:
    expert_data = json.load(profile_file)

# Quick sanity checks on the loaded data.
print(type(expert_data))          # container type of the parsed JSON
print(len(expert_data))           # number of entries
print(expert_data[0])             # first entry
print(expert_data[0].keys())      # field names of the first entry

<class 'list'>
307
{'branches': ['Technologie', 'Medien & Werbung', 'Unternehmensdienstleistungen'], 'companyLocationCity': 'Lennestadt', 'companyLocationStreet': 'Hagener Straße 64', 'companyLocationZip': 57368.0, 'companyName': 'lenne.Tech GmbH', 'companyTypes': ['Dienstleistung'], 'companyWebsite': 'https://lenne.tech', 'description': 'Seit über 15 Jahren begleite ich digitale Projekte von der Konzeption bis zur "schlüsselfertigen" Anwendung. Als Mitglied im Team von lenne.Tech (https://lenne.tech) entwickle ich Apps & Webanwendungen und unterstütze Unternehmen beim Aufbau sowie bei der Weiterbildung von Entwicklungsteams. Im Rahmen unserer lenne.Learning Akademie (https://lennelearning.de) organisiere ich gemeinsam mit anderen Tutoren & Mentoren das Recruiting sowie das Onboarding neuer Entwickler für Partnerunternehmen.', 'employeeOfInstitutionNames': [], 'firstName': 'Ege', 'gender': 'MALE', 'id': '65acfb5a3897d6f0e6506db1', 'jobTitle': 'Softwareentwickler', 'lastName': 'Siebert'

## Aufsetzen der Kollektionen

Quelle: https://qdrant.tech/documentation/beginner-tutorials/search-beginners/

Cluster: https://cloud.qdrant.io/accounts/35801706-940d-46e2-903a-fc1d0f88cda5/clusters/004ad81f-9442-4601-8579-089a7f905391/overview

In [None]:
# Install the third-party packages this notebook imports:
# sentence-transformers (SparseEncoder), qdrant-client (QdrantClient, models)
# and fastembed (SparseTextEmbedding). The first two are upgraded if present.
!pip install -U sentence-transformers
!pip install -U qdrant-client
!pip install fastembed

Collecting qdrant-client
  Downloading qdrant_client-1.16.2-py3-none-any.whl.metadata (11 kB)
Collecting portalocker<4.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading qdrant_client-1.16.2-py3-none-any.whl (377 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m377.2/377.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, qdrant-client
Successfully installed portalocker-3.2.0 qdrant-client-1.16.2
Collecting fastembed
  Downloading fastembed-0.7.4-py3-none-any.whl.metadata (10 kB)
Collecting loguru<0.8.0,>=0.7.2 (from fastembed)
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting onnxruntime!=1.20.0,>=1.17.0 (from fastembed)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting py-rust-stemmers<0.2.0,>=0.1.0 (from fastembed)
 

In [None]:
from fastembed import SparseTextEmbedding
from sentence_transformers.sparse_encoder import SparseEncoder



### Vorbereiten der Expertenprofile

In [None]:
# Keep only the original expert fields: the helper keys listed here are
# dropped from every profile.
keys_to_remove = [
    "full_text_fields_used",
    "full_text_fields_used_fixed",
    "full_text_word_count",
    "full_text_fields_used_sorted",
    "validation_issues",
]

expert_data = [
    {field: profile[field] for field in profile if field not in keys_to_remove}
    for profile in expert_data
]

# Display the remaining field names of the first profile.
expert_data[0].keys()

dict_keys(['branches', 'companyLocationCity', 'companyLocationStreet', 'companyLocationZip', 'companyName', 'companyTypes', 'companyWebsite', 'description', 'employeeOfInstitutionNames', 'firstName', 'gender', 'id', 'jobTitle', 'lastName', 'projectsDescription', 'skills', 'title', 'full_text'])

### Laden des Qdrant-Clients

In [None]:
import os

from qdrant_client import QdrantClient, models

# Connection settings are read from the environment when available so that
# real credentials never have to be written into the notebook. The fallbacks
# are the original placeholders and must be replaced (or the variables
# QDRANT_URL / QDRANT_API_KEY set) before the client can reach a cluster.
client = QdrantClient(
    url=os.environ.get("QDRANT_URL", "url_here"),
    api_key=os.environ.get("QDRANT_API_KEY", "api_key_here"),
)


  client = QdrantClient(


In [None]:
# List the collections that already exist on the connected Qdrant instance.
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='expert_data_fulltext_sparse'), CollectionDescription(name='expert_data_structured_dense'), CollectionDescription(name='expert_data_structured_and_fulltext_dense'), CollectionDescription(name='expert_data_structured_and_fulltext_late'), CollectionDescription(name='expert_data_fulltext_dense'), CollectionDescription(name='expert_data_structured_and_fulltext_sparse'), CollectionDescription(name='expert_data_fulltext_late'), CollectionDescription(name='expert_data_late_test'), CollectionDescription(name='expert_data_structured_sparse'), CollectionDescription(name='expert_data_structured_late')])

In [None]:
# Show the default field values of a SparseVectorParams object.
print(models.SparseVectorParams())

index=None modifier=None


#### Erstellen der Kollektionen


In [None]:
from qdrant_client.models import VectorParams, Distance

# All three collections share the same layout: no dense vectors and a single
# named sparse vector. A collection that already exists is left untouched.
for collection_name in (
    "expert_data_structured_sparse",
    "expert_data_fulltext_sparse",
    "expert_data_structured_and_fulltext_sparse",
):
    if client.collection_exists(collection_name):
        continue
    client.create_collection(
        collection_name=collection_name,
        vectors_config={},
        sparse_vectors_config={
            # This name is what the uploaded points reference in their `vector` dict.
            "neural_sparse_vector": models.SparseVectorParams(),
        },
    )

### Laden der Expertendaten

In [None]:
# Download from the 🤗 Hub
# The loaded encoder is stored in the global `model`, which
# encode_sparse_for_qdrant uses to embed documents.
model = SparseEncoder("opensearch-project/opensearch-neural-sparse-encoding-multilingual-v1")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/108 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

router_config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

query_0_SparseStaticEmbedding/model.safe(…):   0%|          | 0.00/424k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/872 [00:00<?, ?B/s]

./model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/106 [00:00<?, ?B/s]

In [None]:
import torch
from qdrant_client import models


def encode_sparse_for_qdrant(text: str, *, verbose: bool = False) -> models.SparseVector:
    """Encode *text* with the global sparse encoder and wrap it for Qdrant.

    Args:
        text: Document text to encode.
        verbose: When true, print the extracted indices and values. Off by
            default, because printing two full lists per document floods the
            cell output during bulk encoding.

    Returns:
        A ``models.SparseVector`` holding the indices of the non-zero entries
        and their values.
    """
    # NOTE(review): assumes encode_document returns a 1-D torch tensor for a
    # single input string — confirm against the encoder configuration.
    embedding = model.encode_document(text)

    # indices()/values() are only available on sparse COO tensors, so convert
    # any other layout first.
    if not embedding.is_sparse:
        embedding = embedding.to_sparse()
    embedding = embedding.coalesce()

    # For a 1-D sparse tensor indices() has shape [1, nnz]; row 0 holds the
    # positions of the non-zero entries.
    token_indices = embedding.indices()[0].tolist()
    token_values = embedding.values().tolist()

    if verbose:
        print(token_indices)
        print(token_values)

    return models.SparseVector(
        indices=token_indices,
        values=token_values,
    )

In [None]:
import copy

# Work on an independent deep copy so the loaded profiles are not shared with
# the derived list.
expert_data_copy = copy.deepcopy(expert_data)

# Drop the free-text field; only the structured fields remain.
keys_to_remove = ["full_text"]

expert_data_without_full_text = [
    {field: profile[field] for field in profile if field not in keys_to_remove}
    for profile in expert_data_copy
]

print(expert_data_without_full_text[0].keys())

dict_keys(['branches', 'companyLocationCity', 'companyLocationStreet', 'companyLocationZip', 'companyName', 'companyTypes', 'companyWebsite', 'description', 'employeeOfInstitutionNames', 'firstName', 'gender', 'id', 'jobTitle', 'lastName', 'projectsDescription', 'skills', 'title'])


In [None]:
import json

points_structured = []

for point_id, profile in enumerate(expert_data_without_full_text):
    # The structured profile is serialised to JSON text and that text is encoded.
    serialized = json.dumps(profile, ensure_ascii=False)
    embedding = encode_sparse_for_qdrant(serialized)

    print(point_id)

    point = models.PointStruct(
        id=point_id,
        # The vector name has to match the collection's sparse_vectors_config.
        vector={"neural_sparse_vector": embedding},
        # The payload is the structured profile itself.
        payload=profile,
    )
    points_structured.append(point)

client.upload_points(
    collection_name="expert_data_structured_sparse",
    points=points_structured,
)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
points_fulltext = []

for point_id, profile in enumerate(expert_data):
    # A missing or empty full text is normalised to an empty string.
    full_text = profile.get("full_text") or ""

    print(point_id)

    embedding = encode_sparse_for_qdrant(full_text)

    # The payload lists the full text first, followed by every other profile
    # field, so a search hit carries the complete profile.
    payload = {"full_text": full_text}
    payload.update(
        (field, value) for field, value in profile.items() if field != "full_text"
    )

    points_fulltext.append(
        models.PointStruct(
            id=point_id,
            vector={"neural_sparse_vector": embedding},
            payload=payload,
        )
    )

client.upload_points(
    collection_name="expert_data_fulltext_sparse",
    points=points_fulltext,
)


0
[116, 137, 142, 6813, 10111, 10125, 10162, 10168, 10177, 10197, 10220, 10233, 10241, 10248, 10271, 10299, 10339, 10424, 10478, 10561, 10631, 10635, 10764, 10816, 10858, 10859, 10892, 11064, 11176, 11194, 11386, 11388, 11454, 11459, 11501, 11604, 11640, 11696, 11781, 11852, 11903, 12044, 12077, 12078, 12087, 12250, 12371, 12699, 12827, 13004, 13009, 13080, 13082, 13124, 13230, 13353, 13397, 13696, 13800, 13892, 13946, 14020, 14095, 14537, 14540, 14831, 14964, 14993, 15123, 16196, 16448, 17619, 18057, 18268, 18301, 18491, 18529, 18647, 18931, 19006, 19033, 19146, 19162, 19207, 19245, 19362, 19402, 19512, 19788, 19956, 20657, 20677, 21446, 21504, 22356, 22581, 23342, 23572, 23625, 23653, 24257, 24609, 24802, 25326, 26097, 26766, 26852, 26942, 27088, 27202, 27211, 27508, 28032, 28099, 28278, 28419, 28960, 29077, 29527, 30003, 30464, 30962, 31071, 31577, 31992, 32440, 32497, 33011, 33575, 33579, 34157, 34336, 35584, 35732, 35821, 36041, 36745, 37012, 37350, 38655, 40668, 41812, 42567, 429

In [None]:
# One point per profile: the whole profile (including full_text) is serialised
# to JSON, encoded, and stored with the profile itself as payload.
points_structured_fulltext = [
    models.PointStruct(
        id=point_id,
        vector={
            "neural_sparse_vector": encode_sparse_for_qdrant(
                json.dumps(profile, ensure_ascii=False)
            ),
        },
        payload=profile,
    )
    for point_id, profile in enumerate(expert_data)
]

client.upload_points(
    collection_name="expert_data_structured_and_fulltext_sparse",
    points=points_structured_fulltext,
)



Output hidden; open in https://colab.research.google.com to view.