# Erstellen der Kollektionen in Qdrant mit einem Bi-Encoder-Modell

Hier werden die Daten für die Vektordatenbank vorbereitet und in diese geladen. Als Embedding-Modell wird ein Bi-Encoder-Modell verwendet.

Modell: sentence-transformers/distiluse-base-multilingual-cased-v2

In [None]:
# Mount Google Drive at /content/drive so the data files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Base data directory on the mounted drive; the sub-directories are derived from it.
DATA_PATH = "drive/MyDrive/Uni/Master/Masterthesis/Data/"
RAW_DATA_PATH = DATA_PATH + "raw_data/"
PROD_DATA_PATH = DATA_PATH + "topics/"

## Laden der Expertenprofile

In [None]:
import json

# Path to the JSON file with the enhanced expert profiles
file_path = DATA_PATH + "enhanced_expert_profiles.json"

# Read the whole JSON document into memory
with open(file_path, "r", encoding="utf-8") as f:
    expert_data = json.load(f)

# The following cells index and enumerate the profiles, so fail early with a
# clear message if the file does not contain a non-empty list.
if not isinstance(expert_data, list) or not expert_data:
    raise ValueError(
        f"Expected a non-empty JSON list of expert profiles in {file_path}, "
        f"got {type(expert_data).__name__}"
    )

# Sanity check: container type, number of profiles and available field names.
# The full first record is deliberately not printed: it contains personal data
# (names, addresses) that would end up in the saved notebook output.
print(type(expert_data))
print(len(expert_data))
print(expert_data[0].keys())

<class 'list'>
307
{'branches': ['Technologie', 'Medien & Werbung', 'Unternehmensdienstleistungen'], 'companyLocationCity': 'Lennestadt', 'companyLocationStreet': 'Hagener Straße 64', 'companyLocationZip': 57368.0, 'companyName': 'lenne.Tech GmbH', 'companyTypes': ['Dienstleistung'], 'companyWebsite': 'https://lenne.tech', 'description': 'Seit über 15 Jahren begleite ich digitale Projekte von der Konzeption bis zur "schlüsselfertigen" Anwendung. Als Mitglied im Team von lenne.Tech (https://lenne.tech) entwickle ich Apps & Webanwendungen und unterstütze Unternehmen beim Aufbau sowie bei der Weiterbildung von Entwicklungsteams. Im Rahmen unserer lenne.Learning Akademie (https://lennelearning.de) organisiere ich gemeinsam mit anderen Tutoren & Mentoren das Recruiting sowie das Onboarding neuer Entwickler für Partnerunternehmen.', 'employeeOfInstitutionNames': [], 'firstName': 'Ege', 'gender': 'MALE', 'id': '65acfb5a3897d6f0e6506db1', 'jobTitle': 'Softwareentwickler', 'lastName': 'Siebert'

## Aufsetzen der Kollektionen

Quelle: https://qdrant.tech/documentation/beginner-tutorials/search-beginners/

Cluster: https://cloud.qdrant.io/accounts/35801706-940d-46e2-903a-fc1d0f88cda5/clusters/004ad81f-9442-4601-8579-089a7f905391/overview

In [None]:
!pip install -U sentence-transformers
!pip install -U qdrant-client

In [None]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

In [None]:
# Load the sentence-embedding model by its Hugging Face Hub id.
ENCODER_MODEL_NAME = "sentence-transformers/distiluse-base-multilingual-cased-v2"
encoder = SentenceTransformer(ENCODER_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

In [None]:
import os
from getpass import getpass

from qdrant_client import QdrantClient

# Connection settings are taken from the environment (QDRANT_URL / QDRANT_API_KEY)
# so that no credentials are stored in the notebook. If a variable is not set,
# the value is requested interactively; getpass keeps the key out of the output.
qdrant_url = os.environ.get("QDRANT_URL") or input("Qdrant cluster URL: ").strip()
qdrant_api_key = os.environ.get("QDRANT_API_KEY") or getpass("Qdrant API key: ")

client = QdrantClient(
    url=qdrant_url,
    api_key=qdrant_api_key,
)


In [None]:
# Show which collections the connected Qdrant instance currently reports.
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='expert_data_structured_dense'), CollectionDescription(name='expert_data_fulltext_dense'), CollectionDescription(name='expert_data_structured_and_fulltext_dense')])

#### Erstellen der Kollektionen


In [None]:
from qdrant_client.models import VectorParams, Distance

# One collection per document representation that gets embedded:
# structured fields only, generated full text only, and both combined.
COLLECTION_NAMES = [
    "expert_data_structured_dense",
    "expert_data_fulltext_dense",
    "expert_data_structured_and_fulltext_dense",
]

for collection_name in COLLECTION_NAMES:
    # Only create collections that do not exist yet.
    if client.collection_exists(collection_name):
        continue
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            # Vector size is defined by the embedding model in use
            size=encoder.get_sentence_embedding_dimension(),
            distance=Distance.COSINE,
        ),
    )

In [None]:
# Strip the bookkeeping/validation fields so that only the expert fields
# themselves remain in each profile.
keys_to_remove = [
    "full_text_fields_used",
    "full_text_fields_used_fixed",
    "full_text_word_count",
    "full_text_fields_used_sorted",
    "validation_issues",
]

expert_data = [
    {field: profile[field] for field in profile if field not in keys_to_remove}
    for profile in expert_data
]

expert_data[0].keys()


[{'branches': ['Technologie', 'Medien & Werbung', 'Unternehmensdienstleistungen'], 'companyLocationCity': 'Lennestadt', 'companyLocationStreet': 'Hagener Straße 64', 'companyLocationZip': 57368.0, 'companyName': 'lenne.Tech GmbH', 'companyTypes': ['Dienstleistung'], 'companyWebsite': 'https://lenne.tech', 'description': 'Seit über 15 Jahren begleite ich digitale Projekte von der Konzeption bis zur "schlüsselfertigen" Anwendung. Als Mitglied im Team von lenne.Tech (https://lenne.tech) entwickle ich Apps & Webanwendungen und unterstütze Unternehmen beim Aufbau sowie bei der Weiterbildung von Entwicklungsteams. Im Rahmen unserer lenne.Learning Akademie (https://lennelearning.de) organisiere ich gemeinsam mit anderen Tutoren & Mentoren das Recruiting sowie das Onboarding neuer Entwickler für Partnerunternehmen.', 'employeeOfInstitutionNames': [], 'firstName': 'Ege', 'gender': 'MALE', 'id': '65acfb5a3897d6f0e6506db1', 'jobTitle': 'Softwareentwickler', 'lastName': 'Siebert', 'projectsDescrip

dict_keys(['branches', 'companyLocationCity', 'companyLocationStreet', 'companyLocationZip', 'companyName', 'companyTypes', 'companyWebsite', 'description', 'employeeOfInstitutionNames', 'firstName', 'gender', 'id', 'jobTitle', 'lastName', 'projectsDescription', 'skills', 'title', 'full_text'])

In [None]:
import copy

# Build the variant without full_text from an independent deep copy, so the
# new dicts do not share nested values with expert_data.
expert_data_copy = copy.deepcopy(expert_data)

# Field that is left out of the structured-only variant
keys_to_remove = ["full_text"]

expert_data_without_full_text = [
    dict(item for item in profile.items() if item[0] not in keys_to_remove)
    for profile in expert_data_copy
]

print(expert_data_without_full_text[0].keys())

dict_keys(['branches', 'companyLocationCity', 'companyLocationStreet', 'companyLocationZip', 'companyName', 'companyTypes', 'companyWebsite', 'description', 'employeeOfInstitutionNames', 'firstName', 'gender', 'id', 'jobTitle', 'lastName', 'projectsDescription', 'skills', 'title'])


#### Laden der Expertendaten in die Kollektionen

In [None]:

def upload_expert_points(collection_name, docs, to_text):
    """Embed expert documents and upload them as points to a Qdrant collection.

    Args:
        collection_name: Name of the target collection.
        docs: List of expert dicts. Each dict is stored unchanged as the point
            payload; its position in the list is used as the point id.
        to_text: Callable that maps one expert dict to the string that is
            embedded for it.
    """
    if not docs:
        # Nothing to embed or upload.
        return

    texts = [to_text(doc) for doc in docs]
    # Encode all texts in one batched call instead of one model call per document.
    vectors = encoder.encode(texts)

    client.upload_points(
        collection_name=collection_name,
        points=[
            models.PointStruct(id=idx, vector=vector.tolist(), payload=doc)
            for idx, (doc, vector) in enumerate(zip(docs, vectors))
        ],
    )


def expert_as_json(doc):
    """Serialize an expert dict to a JSON string, keeping non-ASCII characters as-is."""
    return json.dumps(doc, ensure_ascii=False)


# Structured expert data only (full_text removed), embedded as a JSON string
upload_expert_points(
    "expert_data_structured_dense",
    expert_data_without_full_text,
    expert_as_json,
)

# Only the generated full_text is embedded; the payload is the complete expert dict
upload_expert_points(
    "expert_data_fulltext_dense",
    expert_data,
    lambda doc: doc["full_text"],
)

# Structured expert data together with full_text, embedded as one JSON string
upload_expert_points(
    "expert_data_structured_and_fulltext_dense",
    expert_data,
    expert_as_json,
)

In [None]:
# Smoke test: embed a sample query and fetch the three best matches
# from the structured-only collection.
query_vector = encoder.encode("Business Development").tolist()

response = client.query_points(
    collection_name="expert_data_structured_dense",
    query=query_vector,
    limit=3,
)
hits = response.points

# Print the skills first, then the full payload together with its score
for hit in hits:
    print(hit.payload["skills"])
    print(hit.payload, "score:", hit.score)

['Business Development', 'Change Management', 'Coaching', 'Communication', 'Conflict Management', 'Employee Development', 'Geschäftsmodellentwicklung', 'Knowledge Management', 'Performance Management', 'Unternehmensführung']
{'branches': ['Unternehmensdienstleistungen'], 'companyLocationCity': 'Wuppertal', 'companyLocationStreet': 'Kuchhauser Str. 86', 'companyLocationZip': 42349.0, 'companyName': 'AO Consulting Christina Gawlig', 'companyTypes': ['Dienstleistung'], 'companyWebsite': 'consulting-gawlig.de', 'description': 'Unternehmensentwicklung & Change Management\nLeidenschaftlich.\nPersönlich.\nTop Down und Bottom Up.\n\nDie etwas andere Unternehmensberatung.', 'employeeOfInstitutionNames': [], 'firstName': 'Caroline', 'gender': 'FEMALE', 'id': '682adab50b2e403ede03a46a', 'jobTitle': 'Change Managerin', 'lastName': 'Ruth', 'projectsDescription': 'Unternehmensberatung: Change Management', 'skills': ['Business Development', 'Change Management', 'Coaching', 'Communication', 'Conflict 