### Package installations

In [1]:
# Earlier install variants, left disabled
#! pip install --upgrade pip
#! pip3 install -q towhee pymilvus==2.2.11
#! pip3 uninstall pymilvus -y

# Install towhee together with pymilvus pinned to 2.1.1 (-q keeps pip quiet)
! pip3 install -q towhee pymilvus==2.1.1
# The remaining libraries are installed without a version pin
! pip3 install transformers -q
! pip3 install pandas -q
! pip3 install tqdm -q
# Print only the Name/Version lines of `pip3 show` to confirm what got installed
! pip3 show pymilvus | grep -Ei 'Name:|Version:'
! pip3 show towhee | grep -Ei 'Name:|Version:'
#! pip3 show transformers | grep -Ei 'Name:|Version:'

Name: pymilvus
Version: 2.1.1
Name: towhee
Version: 1.1.3


## 1 Clean special characters from the dataset and write it to CSV

In [2]:
import pandas as pd
import re

# Strip quotes, backslashes and embedded links from an 'abstract' cell
def clean_abstract(text):
    """Remove quote characters, backslashes and URLs from an abstract.

    Null values (as judged by ``pd.isnull``) are returned untouched.
    Otherwise, in order:

    1. double quotes and backslashes are dropped;
    2. any parenthesised group, or any segment enclosed by two commas, that
       contains 'http' is removed together with its delimiters;
    3. any remaining bare http(s) URL is removed;
    4. runs of two or more spaces are collapsed into a single space.
    """
    if pd.isnull(text):
        return text

    # Links wrapped in parentheses, or sitting between two commas
    wrapped_link = re.compile(r'\([^)]*http[^)]*\)|,([^,]*http[^,]*),')
    # Stand-alone URLs running up to the next whitespace or the end of the string
    bare_link = re.compile(r'https?://\S+(?=\s|$)')
    # Two or more consecutive spaces
    space_run = re.compile(r' {2,}')

    cleaned = text.replace('"', '').replace('\\', '')
    cleaned = wrapped_link.sub('', cleaned)
    cleaned = bare_link.sub('', cleaned)
    return space_run.sub(' ', cleaned)



# Generic cleanup of special characters shared by the text columns
def clean_special_substrings(text):
    """Replace a fixed set of unwanted substrings in ``text``.

    Applied in order: ' v/ ' becomes a space, a literal backslash followed by
    'n' becomes a space, the '▬' glyph is removed, and a non-breaking space
    becomes a regular space.
    """
    replacements = (
        (' v/ ', ' '),
        ('\\n', ' '),
        ('▬', ''),
        ('\u00A0', ' '),
    )
    for unwanted, substitute in replacements:
        text = text.replace(unwanted, substitute)

    return text


def clean_hash_delimiter(text):
    """Return ``text`` with every '###' delimiter replaced by a comma."""
    return text.replace('###', ',')


def clean_constraints_column(text):
    """Convert '###' delimiters in a Constraints cell to commas.

    A cell that consists of nothing but a single '###' becomes an empty
    string instead of a lone comma.
    """
    return '' if text == '###' else text.replace('###', ',')


def clean_security_constraints(text):
    """Normalise '#' delimiter runs in a SecurityConstraints cell.

    Longest runs are handled first: nine hashes are removed outright, then
    six hashes and finally three hashes are each turned into a comma.
    """
    delimiter_rules = (
        ('#########', ''),
        ('######', ','),
        ('###', ','),
    )
    for delimiter, substitute in delimiter_rules:
        text = text.replace(delimiter, substitute)

    return text


def clean_legal_constraints(text):
    """Turn six-hash and three-hash delimiters in a LegalConstraints cell into commas.

    The six-hash form is replaced first so it yields one comma rather than two.
    """
    without_double_delimiter = text.replace('######', ',')
    return without_double_delimiter.replace('###', ',')


def clean_contact(text):
    """Return a contact cell with every '###' delimiter replaced by a comma."""
    return text.replace('###', ',')


def clean_links(text):
    """Convert a '###'-delimited list of links into a comma-separated one.

    A single leading '###' is dropped first so the result does not start
    with a comma; every remaining '###' becomes a comma.
    """
    without_leading_delimiter = text[3:] if text.startswith('###') else text
    return without_leading_delimiter.replace('###', ',')


def clean_product_info(text):
    """Return a productInformation cell with every '###' delimiter replaced by a comma."""
    return text.replace('###', ',')



dataset_file = 'Metadata_excel.xlsx'
df = pd.read_excel(dataset_file)

# Replace missing values with empty strings before the text cleaners run
df.fillna('', inplace=True)

# Columns that receive the generic special-character cleanup
columns_clean_special_chars = [
    'schema', 'uuid', 'hierarchyLevel', 'title', 'abstract', 'keyword', 'geoBox', 'Constraints',
    'SecurityConstraints', 'LegalConstraints', 'temporalExtent', 'responsibleParty',
    'productInformation', 'parentId'
]

# Column-specific cleaner, applied after the generic cleanup
column_cleaners = {
    'abstract': clean_abstract,
    'keyword': clean_hash_delimiter,
    'geoBox': clean_hash_delimiter,
    'Constraints': clean_constraints_column,
    'SecurityConstraints': clean_security_constraints,
    'LegalConstraints': clean_legal_constraints,
    'responsibleParty': clean_contact,
    'productInformation': clean_product_info,
    'image': clean_links,
    'link': clean_links,
}

for col in df.columns:
    # Only object-dtype columns are cleaned; everything else is left as read
    if df[col].dtype != 'object':
        continue

    if col in columns_clean_special_chars:
        df[col] = df[col].apply(clean_special_substrings)

    specific_cleaner = column_cleaners.get(col)
    if specific_cleaner is not None:
        df[col] = df[col].apply(specific_cleaner)


# Write the cleaned frame as a pipe-separated file and preview the first rows
cleaned_csv_file = 'output_metadata.csv'
df.to_csv(cleaned_csv_file, sep='|', index=False)
df.head()

Unnamed: 0,schema,uuid,id,hierarchyLevel,title,datasetcreationdate,abstract,keyword,geoBox,Constraints,SecurityConstraints,LegalConstraints,temporalExtent,image,responsibleParty,link,metadatacreationdate,productInformation,parentId
0,iso19139,7a62f16f-9aeb-4c39-bf5f-e710232fa366,37228,software,Artsfunn,,Datasettet inneholder stedfestet informasjon a...,"Natur,Norge,Svalbard,lav,karplanter,botanikkda...",2335781,,,Tilgangsrestriksjoner Andre restriksjoner: Lim...,0001-01-01now,https://editor.geonorge.no/thumbnails/7a62f16f...,Frank HansenNorsk institutt for naturforskning...,https://ipt.nina.no/,2021-03-24,"Produktspesifikasjon,Produktark,Produktside,Te...",
1,iso19139,79013154-92ee-4647-b160-925cbc148601,21400,dataset,Hav og is - Iskart (shapefil),,Istjenesten ved Meteorologisk institutt utarbe...,"Oceanographic geographical features,Inspire,No...","2.00,33.00,57.00,72.00",Bruksbegrensninger Ingen begrensninger på bruk...,Sikkerhetsnivå Ugradert: Available for general...,Tilgangsrestriksjoner Andre restriksjoner: Lim...,0001-01-01now,https://www.geonorge.no/geonetwork/srv/nor/res...,"Meteorologisk instituttistjenesten@met.no,Mete...",http://polarview.met.no/,2023-11-15,"Produktspesifikasjon,Produktark,Produktside,Te...",
2,iso19139,f0083871-0d21-44e2-945f-9de9ea94d484,240,dataset,Losbordingsfelt,,Bordingsfelt er angitt som et geografisk punkt...,"Åpne data,Norge digitalt,modellbaserteVegprosj...","2.3987,33.2045,57.5765,71.3531",,Sikkerhetsnivå Ugradert: Available for general...,Tilgangsrestriksjoner Andre restriksjoner: Lim...,0001-01-01now,https://editor.geonorge.no/thumbnails/f0083871...,Stian AamotKystverket37019700Kystveien 30Arend...,javascript:addWMSServerLayers(\https://kystinf...,2023-11-15,https://register.geonorge.no/register/versjone...,
3,iso19139,e379ef5e-8851-4305-b900-44a4587cf14c,21273,dataset,Radnett - doseratemålestasjoner,,Datasettet inneholder strålevernets radnettsta...,"Norge digitalt,Åpne data,modellbaserteVegprosj...","2,33,57.00000000000001,72",Bruksbegrensninger Ingen begrensninger på bruk...,Sikkerhetsnivå Ugradert: Available for general...,Tilgangsrestriksjoner Andre restriksjoner: Lim...,0001-01-01now,https://editor.geonorge.no/thumbnails/e379ef5e...,"Statens strålevernnrpa@nrpa.no,Direktoratet fo...",https://radnett.dsa.no/index.html,2023-11-15,"Produktspesifikasjon,Produktark,Produktside,Te...",
4,iso19139,41ccca92-2ae9-43c9-9a45-b3d6424d1633,37251,dataset,Predikert utbredelse og tetthetsfordeling av s...,,Basert på gamle og nye data for forekomst av s...,"Species distribution,Norge digitalt,modellbase...",2335781,"Bruksbegrensninger Ingen ,IngenNo conditions a...",Sikkerhetsnivå Ugradert: Available for general...,Tilgangsrestriksjoner Andre restriksjoner: Lim...,0001-01-01now,https://www.geonorge.no/geonetwork/srv/nor/res...,Norsk institutt for naturforskningfrank.hansse...,http://www.seapop.no/no/spread/open-sea/specie...,2023-11-15,"Produktspesifikasjon,Produktark,Produktside,Te...",


## 2 Load dataset and vectorise chosen columns

In [None]:
import pandas as pd
from towhee import pipe, ops, DataCollection
from transformers import AutoTokenizer
from tqdm import tqdm

# Function to compute embeddings for a single text
def compute_embeddings(text):
    """Embed one text with the notebook's embedding pipeline.

    The text is cut down to at most ``MAX_TOKENS`` tokens with the
    module-level ``tokenizer``, decoded back into a string, and that string
    is passed through the module-level ``embeddings_pipe``.

    Args:
        text: The string to embed.

    Returns:
        The 'vec' output of the pipeline for this text, converted with
        ``.tolist()``.
    """
    MAX_TOKENS = 510
    inputs = tokenizer(text, return_tensors="pt", max_length=MAX_TOKENS, truncation=True)
    # Decode without the special tokens the tokenizer added. Otherwise their
    # literal markers (e.g. '[CLS]' / '[SEP]') end up inside the text that is
    # handed to the embedding pipeline.
    # NOTE(review): the pipeline presumably tokenizes the text again and adds
    # its own special tokens — confirm against the towhee operator.
    truncated_text = tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)

    return DataCollection(embeddings_pipe(truncated_text)).to_list()[0]['vec'].tolist()


# Read the cleaned, pipe-separated metadata and cast the two date columns to object dtype
df_kartverket = pd.read_csv(cleaned_csv_file, sep='|')
recast_to_string = ['datasetcreationdate', 'metadatacreationdate']
df_kartverket[recast_to_string] = df_kartverket[recast_to_string].astype('object')

# Missing values become empty strings
df_kartverket.fillna('', inplace=True)

# Candidate model names; `chosen_model` selects the one used for both tokenizer and pipeline
facebook_context_model_name = 'facebook/dpr-ctx_encoder-single-nq-base'
#facebook_question_model_name = 'facebook/dpr-question_encoder-single-nq-base'
mbert_model_name = 'bert-base-multilingual-uncased'

chosen_model = facebook_context_model_name
tokenizer = AutoTokenizer.from_pretrained(chosen_model)

# Towhee pipeline mapping the 'text' input to a 'vec' embedding output
embeddings_pipe = (
    pipe.input('text')
        .map('text', 'vec', ops.text_embedding.dpr(model_name=chosen_model))
        .output('vec')
)

# Each listed column gets a companion '<column>_vector' column holding its embeddings
columns_to_vectorise = ['title', 'abstract', 'keyword', 'geoBox', 'Constraints', 'SecurityConstraints', 'LegalConstraints', 'responsibleParty']
total_columns = len(columns_to_vectorise)

for index, column in enumerate(columns_to_vectorise):
    progress_label = f"Creating vector embeddings for '{column}' ({index + 1}/{total_columns})"
    tqdm.pandas(desc=progress_label)
    df_kartverket[f'{column}_vector'] = df_kartverket[column].progress_apply(compute_embeddings)

# The embedding size is read off the first row of the first vector column
first_vector_column = f'{columns_to_vectorise[0]}_vector'
model_embedding_dimension = len(df_kartverket[first_vector_column][0])
print(f"dimensions of vectors: {model_embedding_dimension}")

# Output file is named after the model ('/' replaced by '-') and the embedding size
embeddings_csv_file = f"{chosen_model.replace('/', '-')}_{model_embedding_dimension}.csv"
df_kartverket.to_csv(embeddings_csv_file, index=False, sep='|')