Docs:
- https://huggingface.co/onnx-models/sentence-t5-base-onnx
- https://huggingface.co/onnx-models/all-MiniLM-L6-v2-onnx # 0.08GB
- https://huggingface.co/spaces/mteb/leaderboard
- https://huggingface.co/docs/transformers/en/serialization
- https://huggingface.co/docs/optimum-onnx/onnx/usage_guides/contribute

### 0. Import Data

In [None]:
import re
import os

def filter_EN_content(files: [str]):
    """Return the entries of *files* that match none of the exclusion markers.

    The markers are joined into a single regular expression and searched
    anywhere in each name, case-sensitively. Input order is preserved.

    NOTE(review): the markers look like language tags used to drop
    non-English files -- confirm against the data directory. As written,
    ``(\\d)`` is a regex group around ``\\d``, so any name containing a digit
    is excluded, and short markers such as 'ST' or 'ES' match inside longer
    upper-case words as well.
    """
    exclusion_markers = ['FR', 'SP', 'RU', 'JP', 'ES', 'PT', 'Ã©', r'(\d)', 'jpn', 'Portugese', 'MX', 'Spanish', 'JA', 'GHI', 'ST']
    exclusion_regex = re.compile('|'.join(exclusion_markers))

    kept = []
    for name in files:
        # Keep the name only when no marker occurs anywhere in it.
        if exclusion_regex.search(name) is None:
            kept.append(name)
    return kept

class Document:
    """A text file loaded from disk and split into cleaned, non-empty lines.

    Attributes (all set in ``__init__``):
        index: caller-supplied position of the document.
        word_file_path: path of the file the content was read from.
        guided_content_type: caller-supplied category label.
        title: file name without its directory and extension.
        content: cleaned, non-empty lines of the file, in file order.
    """

    # Characters removed from every line before it is kept. Compiled once for
    # the class; a raw string so that ``\-`` is a regex escape rather than an
    # invalid string escape (which is a SyntaxWarning on Python 3.12+).
    _STRIPPED_CHARS = re.compile(r'[!~@#$\-*<>]')

    def __init__(self, i: int, word_file_path: str, guided_content_type: str):
        """Record the metadata and eagerly read the file at *word_file_path*.

        Raises:
            OSError: if the file cannot be opened.
            UnicodeDecodeError: if the file is not valid UTF-8.
        """
        self.index = i
        self.word_file_path = word_file_path
        self.guided_content_type = guided_content_type
        self.title = os.path.splitext(os.path.basename(word_file_path))[0]
        self.content = self._read_word_document()

    def _read_word_document(self) -> list[str]:
        """Return the cleaned lines of the file at ``self.word_file_path``.

        Despite the name, the file is read as plain text. Each line has the
        characters ``! ~ @ # $ - * < >`` removed and surrounding whitespace
        stripped; lines that end up empty are dropped.
        """
        # Explicit encoding so decoding does not depend on the platform
        # locale; where the locale default is already UTF-8 this is identical
        # to the previous behaviour.
        with open(self.word_file_path, 'r', encoding='utf-8') as f:
            # Iterate the file lazily instead of materialising readlines().
            cleaned = (self._STRIPPED_CHARS.sub('', line).strip() for line in f)
            return [line for line in cleaned if line]

In [None]:
# --- Meditations corpus: one Document per file name that passes
# filter_EN_content. Indices come from enumerate() over the filtered
# os.listdir() result, whose order is arbitrary.
meditations_dir = "/Users/emulie/Data/Meditations_CLEAN"
meditations_docs = [
    Document(i, os.path.join(meditations_dir, word_file), 'meditations')
    for i, word_file in enumerate(filter_EN_content(os.listdir(meditations_dir)))
]

# --- Sleeptales corpus: built the same way; its indices restart at 0.
sleeptales_dir = "/Users/emulie/Data/SleepTales_CLEAN"
sleeptales_docs = [
    Document(i, os.path.join(sleeptales_dir, word_file), 'sleeptales')
    for i, word_file in enumerate(filter_EN_content(os.listdir(sleeptales_dir)))
]

# Combined corpus: meditations first, then sleeptales.
docs = meditations_docs + sleeptales_docs

### 1. Vectorize document

The ONNX models used here are sentence vectorizers, not document vectorizers. To vectorize a whole document, we can try different approaches:
- Approach 0: mean of all sentence embeddings (naive)
- Approach 1: weighted average of sentence embeddings, with sentence importance taken from TF-IDF
- Approach 2: average of paragraph embeddings

In [None]:
from typing import Any, Callable




In [None]:
from light_embed import TextEmbedding
# Smoke test: encode two sample sentences with the sentence-t5-base
# checkpoint through light_embed and print the result.
sentences = ["This is an example sentence", "Each sentence is converted"]

model0 = TextEmbedding('sentence-transformers/sentence-t5-base')

embeddings = model0.encode(sentences)
print(embeddings)

#### t2

In [None]:
from light_embed import TextEmbedding
# Encode the same two sample sentences with the all-MiniLM-L6-v2 ONNX
# checkpoint through light_embed.
sentences = ["This is an example sentence", "Each sentence is converted"]

# Alternative checkpoint, currently disabled:
# model = TextEmbedding('onnx-models/sentence-t5-base-onnx')
model = TextEmbedding('onnx-models/all-MiniLM-L6-v2-onnx')

embeddings = model.encode(sentences)

The model is saved in `~/.cache/light_embed/onnx-models/`.

In [None]:
# Encode every cleaned line of the first sleeptales Document with the
# currently bound `model`; this overwrites the earlier `embeddings`.
embeddings = model.encode(sleeptales_docs[0].content)

In [None]:
# Display the shape of the embeddings computed above.
# NOTE(review): presumably (number of lines, embedding dimension) -- confirm.
embeddings.shape

#### t3

In [None]:
from sentence_transformers import SentenceTransformer
# Same two-sentence smoke test, this time through sentence_transformers.
# Rebinds `model` and `embeddings` used by the earlier cells.
sentences = [
    "This is an example sentence",
    "Each sentence is converted",
]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

embeddings = model.encode(sentences)
print(embeddings)


### 2. Visualize Embedding