In [None]:
#! pip install --upgrade pip
#! pip3 install -q towhee pymilvus==2.2.11
#! pip3 uninstall pymilvus -y

! pip3 install -q towhee pymilvus==2.1.1
! pip3 install transformers -q
! pip3 install pandas -q
! pip3 install tqdm -q
! pip3 show pymilvus | grep -Ei 'Name:|Version:'
! pip3 show towhee | grep -Ei 'Name:|Version:'

In [1]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility



def kartverket_create_milvus_collection(collection_name, vector_column, dim):
    """(Re)create the Kartverket metadata collection and index its vector field.

    Any existing collection called ``collection_name`` is dropped first, so the
    returned collection always starts out empty.

    Args:
        collection_name: Name of the Milvus collection to (re)create.
        vector_column: Name of the single FLOAT_VECTOR field; the index is built
            on this field. If the name has the form ``'<field>_vector'`` and
            ``<field>`` is one of the scalar fields, the vector field is placed
            directly after that field (``'abstract_vector'`` therefore yields
            the layout this function has always produced). Otherwise the vector
            field is appended as the last field.
        dim: Dimensionality of the vectors stored in ``vector_column``.

    Returns:
        A ``(collection, collection_columns)`` tuple: the new ``Collection`` and
        the list of its field names in schema order.

    Raises:
        ValueError: If ``vector_column`` is the name of one of the scalar fields.
    """
    # (field name, VARCHAR max_length); ``None`` marks the INT64 primary key.
    scalar_layout = [
        ('schema', 100),
        ('id', None),
        ('uuid', 100),
        ('hierarchyLevel', 100),
        ('title', 100),
        ('datasetcreationdate', 500),
        ('abstract', 2000),
        ('keyword', 2000),
        ('geoBox', 100),
        ('Constraints', 1000),
        ('SecurityConstraints', 500),
        ('LegalConstraints', 2000),
        ('temporalExtent', 100),
        ('image', 1000),
        ('responsibleParty', 500),
        ('link', 500),
        # 'metadatacreationdate' is left out on purpose: it was flagged as a
        # suspicious field (possible encoding error).
        ('productInformation', 1000),
        ('parentId', 100),
    ]

    # Validate before touching the server, so a bad argument cannot cause an
    # existing collection to be dropped.
    if any(name == vector_column for name, _ in scalar_layout):
        raise ValueError(
            f"vector_column '{vector_column}' clashes with a scalar field of the schema"
        )

    # The vector field is named after `vector_column`, so the field that gets
    # indexed below is guaranteed to exist in the schema.
    vector_field = FieldSchema(name=vector_column, dtype=DataType.FLOAT_VECTOR, dim=dim)

    fields = []
    vector_placed = False
    for name, max_length in scalar_layout:
        if max_length is None:
            fields.append(FieldSchema(name=name, dtype=DataType.INT64, is_primary=True, auto_id=False))
        else:
            fields.append(FieldSchema(name=name, dtype=DataType.VARCHAR, max_length=max_length))
        if f'{name}_vector' == vector_column:
            fields.append(vector_field)
            vector_placed = True
    if not vector_placed:
        fields.append(vector_field)

    if utility.has_collection(collection_name):
        utility.drop_collection(collection_name)

    schema = CollectionSchema(fields=fields, description='search text')
    collection_columns = [field_schema.name for field_schema in schema.fields]
    collection = Collection(name=collection_name, schema=schema)

    index_params = {
        'metric_type': "L2",
        'index_type': "IVF_FLAT",
        'params': {"nlist": 2048}
    }
    collection.create_index(field_name=vector_column, index_params=index_params)
    return collection, collection_columns

In [37]:
import pandas as pd
from towhee import pipe, ops, DataCollection
from transformers import AutoTokenizer
from tqdm import tqdm
from pymilvus import list_collections, drop_collection, connections, MilvusException

def compute_embeddings(text, tokenizer, embeddings_pipe, max_tokens=512):
    """Embed ``text`` after truncating it to at most ``max_tokens`` tokens.

    Args:
        text: The string to embed.
        tokenizer: Tokenizer used only to truncate ``text``; it is called with
            ``truncation=True`` and must provide ``decode``.
        embeddings_pipe: Callable pipeline that takes the truncated text; its
            result is wrapped in a ``DataCollection``.
        max_tokens: Maximum number of tokens kept from ``text`` (default 512).

    Returns:
        The ``'vec'`` entry of the first row produced by ``embeddings_pipe``.
    """
    encoded = tokenizer(text, return_tensors="pt", max_length=max_tokens, truncation=True)
    # Decode without special tokens: by default `decode` keeps the markers the
    # tokenizer inserted (e.g. "[CLS]"/"[SEP]"), which would end up as literal
    # text in the string handed to the pipeline.
    truncated_text = tokenizer.decode(encoded["input_ids"][0], skip_special_tokens=True)
    return DataCollection(embeddings_pipe(truncated_text)).to_list()[0]['vec']


# Load the cleaned metadata export into a dataframe.
# Either use cleaned_dataset, or cleaned_sentence_expanded datasets
cleaned_dataset = 'output_metadata.csv'
recast_to_string = ['datasetcreationdate', 'metadatacreationdate']
# Read the date columns as text straight away. Casting to 'object' afterwards
# would only box whatever type pandas inferred (e.g. floats) and not produce
# strings.
df_kartverket = pd.read_csv(
    cleaned_dataset,
    sep='|',
    dtype={column: str for column in recast_to_string},
)

# Fill NaN values with an empty string. Reassigning (instead of inplace=True)
# keeps the cell idempotent when it is re-run.
df_kartverket = df_kartverket.fillna('')

# Dictionary containing transformer embeddings, with their embedding dimensions.
# The values are the hidden sizes published in each model's configuration
# (base = 768, large = 1024, ALBERT xlarge = 2048, ALBERT xxlarge = 4096).
# NOTE(review): re-check against `config.hidden_size` whenever a model is added.
embeddings_transformer_dictionary = {
    'albert-base-v1': 768,
    'albert-large-v1': 1024,
    'albert-xlarge-v1': 2048,
    'albert-xxlarge-v1': 4096,
    'albert-base-v2': 768,
    'albert-large-v2': 1024,
    'albert-xlarge-v2': 2048,
    'albert-xxlarge-v2': 4096,

    'facebook/bart-large': 1024,

    'bert-base-cased': 768,
    'bert-base-uncased': 768,
    'bert-large-cased': 1024,
    'bert-large-uncased': 1024,
    'bert-base-multilingual-uncased': 768,
    'bert-base-multilingual-cased': 768,
    'bert-base-chinese': 768,
    'bert-base-german-cased': 768,
    'bert-large-uncased-whole-word-masking': 1024,
    'bert-large-cased-whole-word-masking': 1024,
    'bert-large-uncased-whole-word-masking-finetuned-squad': 1024,
    'bert-large-cased-whole-word-masking-finetuned-squad': 1024,
    'bert-base-cased-finetuned-mrpc': 768,
    'bert-base-german-dbmdz-cased': 768,
    'bert-base-german-dbmdz-uncased': 768,
    'cl-tohoku/bert-base-japanese-whole-word-masking': 768,
    'cl-tohoku/bert-base-japanese-char': 768,
    'cl-tohoku/bert-base-japanese-char-whole-word-masking': 768,
    'TurkuNLP/bert-base-finnish-cased-v1': 768,
    'TurkuNLP/bert-base-finnish-uncased-v1': 768,
    'wietsedv/bert-base-dutch-cased': 768,
}


column_to_vectorise = 'abstract'
vector_column = column_to_vectorise + '_vector'
# Number of leading dataframe rows that are embedded per model.
sample_size = 10

server_host = 'ebjerk.no'
server_port = '19530'


def _milvus_collection_name(model_name):
    """Turn a model identifier into a name Milvus accepts for a collection.

    Milvus collection names may only contain letters, digits and underscores
    and must start with a letter or underscore, while model identifiers such
    as 'facebook/bart-large' contain '/' and '-'. Every other character is
    replaced by '_', and a leading '_' is added when needed.
    """
    cleaned = ''.join(
        ch if (ch.isascii() and ch.isalnum()) else '_' for ch in model_name
    )
    if not cleaned or cleaned[0].isdigit():
        cleaned = '_' + cleaned
    return cleaned


# One connection serves every model, so it is opened once, before any
# (expensive) embedding work starts.
connections.connect(host=server_host, port=server_port)

for embedding_model, embedding_model_dimension in embeddings_transformer_dictionary.items():
    # creates embeddings for truncated dataframe
    df_copy = df_kartverket.iloc[0:sample_size].copy(deep=True)

    tokenizer = AutoTokenizer.from_pretrained(embedding_model)
    embeddings_pipe = (
        pipe.input('text')
            .map('text', 'vec', ops.text_embedding.transformers(model_name=embedding_model))
            .output('vec')
    )
    tqdm.pandas(desc=f"Creating embeddings for: {embedding_model}")
    df_copy[vector_column] = df_copy[column_to_vectorise].progress_apply(
        lambda x: compute_embeddings(x, tokenizer, embeddings_pipe)
    )

    # Create the collection with the dimension of the embedding model.
    # kartverket_create_milvus_collection drops an existing collection of the
    # same name itself, so no separate removal step is needed here (the former
    # one aborted the whole loop with `break` when the database was empty).
    collection_name = _milvus_collection_name(embedding_model)
    embedding_collection, collection_columns = kartverket_create_milvus_collection(
        collection_name, vector_column, embedding_model_dimension
    )

    # TODO: Create insertion pipe, and align dataframe

    # TODO: Insert dataframe into milvus collection

    # TODO: Create pipe for creation of embeddings for querying milvus database

    # TODO: Iterate over benchmark questions with correct answer

        # Convert question into embeddings, query milvus collection, record score and/or ranking of "correct answer"

        # Store the benchmark question score/ranking
    



Creating embeddings for: albert-base-v1: 100%|██████████| 10/10 [00:00<00:00, 10.77it/s]


In [None]:
# DPR context encoders, with their embedding dimensions
embeddings_dpr_dictionary = {
    'facebook/dpr-ctx_encoder-single-nq-base': 768,
    'facebook/dpr-ctx_encoder-multiset-base': 768,
}

# Automated embeddings for dpr models.
# The embedded copy of the dataframe is kept per model, so the result of one
# iteration is not thrown away by the next.
dpr_embedded_frames = {}
for embedding_model in embeddings_dpr_dictionary:
    df_copy = df_kartverket.copy(deep=True)

    tokenizer = AutoTokenizer.from_pretrained(embedding_model)
    embeddings_pipe = (
        pipe.input('text')
            .map('text', 'vec', ops.text_embedding.dpr(model_name=embedding_model))
            .output('vec')
    )
    tqdm.pandas(desc="Converting to vector embeddings")
    # compute_embeddings needs the tokenizer and the pipeline in addition to
    # the text; passing the bare function to progress_apply raises TypeError.
    df_copy[column_to_vectorise + '_vector'] = df_copy[column_to_vectorise].progress_apply(
        lambda text: compute_embeddings(text, tokenizer, embeddings_pipe)
    )
    dpr_embedded_frames[embedding_model] = df_copy

df_kartverket.head()