In [1]:
!pip install -U sentence-transformers datasets torch nltk

Collecting datasets
  Downloading datasets-4.2.0-py3-none-any.whl.metadata (18 kB)
Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading datasets-4.2.0-py3-none-any.whl (506 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.3/506.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (42.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, nltk, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-1

In [3]:
import os
import nltk
from sentence_transformers import InputExample
from torch.utils.data import DataLoader

nltk.download('punkt_tab')  # For sentence tokenization

recipe_folder = '/content/recipes'

# Load all recipe texts
docs = []
for filename in os.listdir(recipe_folder):
    if filename.endswith('.txt'):
        with open(os.path.join(recipe_folder, filename), 'r') as f:
            docs.append(f.read())

# Split into sentences and create positive pairs (consecutive as similar)
train_examples = []
for doc in docs:
    sentences = nltk.sent_tokenize(doc)  # Split into sentences
    for i in range(len(sentences) - 1):
        train_examples.append(InputExample(texts=[sentences[i], sentences[i+1]]))

# DataLoader (batch_size=16, GPU-friendly)
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
print(f"Created {len(train_examples)} positive pairs for fine-tuning.")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Created 100 positive pairs for fine-tuning.


In [5]:
from sentence_transformers import SentenceTransformer, losses

model_id = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(model_id, device='cuda')

train_loss = losses.MultipleNegativesRankingLoss(model=model)

# Fine-tune
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=50)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


In [6]:
save_path = '/content/fine_tuned_embeddings'
model.save(save_path)

# Zip and download
!zip -r fine_tuned_embeddings.zip {save_path}
from google.colab import files
files.download('fine_tuned_embeddings.zip')

  adding: content/fine_tuned_embeddings/ (stored 0%)
  adding: content/fine_tuned_embeddings/sentence_bert_config.json (deflated 9%)
  adding: content/fine_tuned_embeddings/config.json (deflated 47%)
  adding: content/fine_tuned_embeddings/config_sentence_transformers.json (deflated 40%)
  adding: content/fine_tuned_embeddings/special_tokens_map.json (deflated 80%)
  adding: content/fine_tuned_embeddings/README.md (deflated 67%)
  adding: content/fine_tuned_embeddings/modules.json (deflated 62%)
  adding: content/fine_tuned_embeddings/1_Pooling/ (stored 0%)
  adding: content/fine_tuned_embeddings/1_Pooling/config.json (deflated 59%)
  adding: content/fine_tuned_embeddings/2_Normalize/ (stored 0%)
  adding: content/fine_tuned_embeddings/model.safetensors (deflated 8%)
  adding: content/fine_tuned_embeddings/vocab.txt (deflated 53%)
  adding: content/fine_tuned_embeddings/tokenizer.json (deflated 71%)
  adding: content/fine_tuned_embeddings/tokenizer_config.json (deflated 73%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>