In [1]:
%load_ext autoreload
%autoreload 2
from llamawrapper import LlamaHelper
import torch
import sys
sys.path.append('echo-embeddings')
from echo_embeddings import EchoEmbeddingsMistral, EchoPooling, EchoParser

In [2]:
runai = False
templates = {
    'query': '<s>Instruct:{!%%prompt%%,}\nQuery:{!%%text%%}\nQuery again:{%%text%%}{</s>}',
    'document': '<s>Document:{!%%text%%}\nDocument again:{%%text%%}{</s>}',
}
# Create the model
path_to_model = 'jspringer/echo-mistral-7b-instruct-lasttoken'
path_to_model = '/dlabscratch1/public/llm_weights/llama2_hf/Llama-2-7b-hf'
if runai:
    path_to_model = '/dlabscratch1' + path_to_model
model = EchoEmbeddingsMistral.from_pretrained(path_to_model, device_map='auto', load_in_8bit=True)
model = model.eval()

# Create the parser
parser = EchoParser(path_to_model, templates, max_length=512)

# Create the pooling: strategy can either be mean or last
pooling = EchoPooling(strategy='mean')

# specify the prompt, queries, and documents
prompt = 'Retrieve passages that answer the question'
queries = [
    'What is the capital of France?',
    'What is the capital of Deutschland?',
]
documents = [
    'Paris is the capital of France.',
    'Berlin is the capital of Germany.',
]

query_variables = [{'prompt': prompt, 'text': q} for q in queries]
document_variables = [{'text': d} for d in documents]

query_tagged = [('query', q) for q in query_variables]
document_tagged = [('document', d) for d in document_variables]

# Get the tokenized embeddings
with torch.no_grad():
    query_embeddings = pooling(model(parser(query_tagged)))['sentence_embedding']
    document_embeddings = pooling(model(parser(document_tagged)))['sentence_embedding']


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
query_embeddings /= query_embeddings.norm(dim=-1, keepdim=True)
document_embeddings /= document_embeddings.norm(dim=-1, keepdim=True)
print(query_embeddings.shape)
print(query_embeddings @ document_embeddings.T)

torch.Size([2, 4096])
tensor([[0.8921, 0.8638],
        [0.7812, 0.8169]], device='cuda:0', dtype=torch.float16)


In [4]:
class MyModel():
    def __init__(self, model, parser, pooling):
        self.model = model
        self.parser = parser
        self.pooling = pooling

    def encode_queries(self, queries, batch_size=32, **kwargs):
        """
        Returns a list of embeddings for the given sentences.
        Args:
            queries (`List[str]`): List of sentences to encode
            batch_size (`int`): Batch size for the encoding

        Returns:
            `List[np.ndarray]` or `List[tensor]`: List of embeddings for the given sentences
        """
        model = self.model
        parser = self.parser
        pooling = self.pooling

        query_variables = [{'prompt': prompt, 'text': q} for q in queries]

        query_tagged = [('query', q) for q in query_variables]

        # Get the tokenized embeddings
        with torch.no_grad():
            query_embeddings = pooling(model(parser(query_tagged)))['sentence_embedding']
        return query_embeddings.detach().cpu()

    def encode_corpus(self, corpus, batch_size=32, **kwargs):
        """
        Returns a list of embeddings for the given sentences.
        Args:
            corpus (`List[str]` or `List[Dict[str, str]]`): List of sentences to encode
                or list of dictionaries with keys "title" and "text"
            batch_size (`int`): Batch size for the encoding

        Returns:
            `List[np.ndarray]` or `List[tensor]`: List of embeddings for the given sentences
        """
        model = self.model
        parser = self.parser
        pooling = self.pooling
        document_variables = [{'text': d} for d in corpus]

        document_tagged = [('document', d) for d in document_variables]

        # Get the tokenized embeddings
        with torch.no_grad():
            document_embeddings = pooling(model(parser(document_tagged)))['sentence_embedding']
        return document_embeddings.detach().cpu()
    
    def encode(self, sentences, batch_size=32, **kwargs):
        """
        Returns a list of embeddings for the given sentences.
        Args:
            sentences (`List[str]`): List of sentences to encode
            batch_size (`int`): Batch size for the encoding

        Returns:
            `List[np.ndarray]` or `List[tensor]`: List of embeddings for the given sentences
        """
        return self.encode_corpus(sentences, batch_size=batch_size, **kwargs)

In [5]:
from mteb import MTEB
mymodel = MyModel(model, parser, pooling)
evaluation = MTEB(tasks=["EmotionClassification"])
evaluation.run(mymodel, batch_size=32)

Error while evaluating EmotionClassification: CUDA out of memory. Tried to allocate 4.49 GiB (GPU 0; 11.93 GiB total capacity; 9.78 GiB already allocated; 1.35 GiB free; 10.00 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.49 GiB (GPU 0; 11.93 GiB total capacity; 9.78 GiB already allocated; 1.35 GiB free; 10.00 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF