<a href="https://colab.research.google.com/github/tanushreevijay/Internship/blob/main/internship.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ Install necessary libraries
!pip install datasets transformers torch accelerate

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding
import torch
from torch.utils.data import DataLoader
from torch.amp import autocast, GradScaler
import torch.nn.functional as F

# ✅ Load Dataset
# The "corpus" configuration of BeIR/fiqa is indexed by a split that is also called "corpus".
fiqa_splits = load_dataset("BeIR/fiqa", "corpus")
dataset = fiqa_splits["corpus"]

# ✅ Load Tokenizer and Model
# Both artefacts come from the same checkpoint name so the vocabulary matches the weights.
model_name = "BAAI/bge-large-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to("cuda")  # move the weights to the GPU before any forward pass

# ✅ Tokenization Function
def tokenize_function(examples):
    """Tokenize a batch of rows; intended for ``dataset.map(..., batched=True)``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the strings to encode.

    Returns:
        The tokenizer output for ``examples["text"]``, truncated to at most 512
        tokens and left unpadded.
    """
    # Padding is deliberately not applied here: padding every row to 512 tokens
    # makes each forward pass run at full length regardless of the real text
    # size. The DataCollatorWithPadding used by the DataLoader pads each batch
    # to the length of its longest member instead.
    return tokenizer(examples["text"], truncation=True, max_length=512)

# ✅ Tokenize Dataset
# Batched mapping hands lists of rows to tokenize_function; the raw "text" column is dropped
# from the result.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# ✅ Ensure dataset format
# Only these two columns are exposed, and they are returned as torch tensors.
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask"],
)

# ✅ Use Data Collator for Proper Batching
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="pt")

# ✅ Create DataLoader
batch_size = 2  # Adjust if needed
train_dataloader = DataLoader(
    tokenized_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

# ✅ Initialize Mixed Precision Training
accumulation_steps = 4  # Accumulate gradients to prevent OOM
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scaler = GradScaler()

# ✅ Training Loop
# One pass over train_dataloader with mixed precision and gradient accumulation.
#
# NOTE(review): the regression target below is an all-zero tensor, so minimising this loss
# only shrinks the first-token embeddings toward zero. It exercises the training plumbing
# but is not a retrieval objective — replace it before relying on the saved weights.
model.train()
optimizer.zero_grad()  # start from clean gradients even if this cell is re-run after an interrupt
micro_batches_pending = 0  # backward passes accumulated since the last optimizer step

for step, batch in enumerate(train_dataloader):
    batch = {k: v.to("cuda") for k, v in batch.items()}  # move every tensor in the batch to the GPU

    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]

    with autocast(device_type="cuda"):
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 for every item in the batch.
        embeddings = outputs.last_hidden_state[:, 0, :]

        # Mean squared error against a zero tensor (see the review note above).
        loss = F.mse_loss(embeddings, torch.zeros_like(embeddings))

    # Divide by accumulation_steps so the accumulated gradient is the average over the
    # window rather than the sum. `loss` itself stays un-normalised for logging.
    scaler.scale(loss / accumulation_steps).backward()
    micro_batches_pending += 1

    if micro_batches_pending == accumulation_steps:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        micro_batches_pending = 0

    if (step + 1) % 100 == 0:
        print(f"Step {step+1}: Loss = {loss.item()}")

# Apply gradients left over when the number of batches is not a multiple of
# accumulation_steps; otherwise the final micro-batches would be silently discarded.
if micro_batches_pending:
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad()

print("✅ Training Complete!")


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

fiqa.py:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/28.4M [00:00<?, ?B/s]

Generating corpus split:   0%|          | 0/57638 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Map:   0%|          | 0/57638 [00:00<?, ? examples/s]

Step 100: Loss = 0.06357988715171814
Step 200: Loss = 0.0527159683406353
Step 300: Loss = 0.04801846295595169
Step 400: Loss = 0.04379504546523094
Step 500: Loss = 0.04294653981924057
Step 600: Loss = 0.04157979041337967
Step 700: Loss = 0.04071632772684097
Step 800: Loss = 0.04068969562649727
Step 900: Loss = 0.039328258484601974
Step 1000: Loss = 0.03942283242940903
Step 1100: Loss = 0.038764555007219315
Step 1200: Loss = 0.038210179656744
Step 1300: Loss = 0.037982210516929626
Step 1400: Loss = 0.03786027431488037
Step 1500: Loss = 0.03764305263757706
Step 1600: Loss = 0.036980681121349335
Step 1700: Loss = 0.03709299862384796
Step 1800: Loss = 0.03667840361595154
Step 1900: Loss = 0.03640896454453468
Step 2000: Loss = 0.03593192249536514
Step 2100: Loss = 0.03569364547729492
Step 2200: Loss = 0.035657357424497604
Step 2300: Loss = 0.035284120589494705
Step 2400: Loss = 0.034908805042505264
Step 2500: Loss = 0.03476135805249214
Step 2600: Loss = 0.03459102660417557
Step 2700: Loss =

In [None]:
# Write the model weights/config and the tokenizer files side by side in one directory so
# both can later be reloaded from the same path.
finetuned_dir = "bge_large_finetuned"
model.save_pretrained(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)


('bge_large_finetuned/tokenizer_config.json',
 'bge_large_finetuned/special_tokens_map.json',
 'bge_large_finetuned/vocab.txt',
 'bge_large_finetuned/added_tokens.json',
 'bge_large_finetuned/tokenizer.json')

In [None]:
from transformers import AutoModel, AutoTokenizer

# Smoke test: reload the artefacts written to "bge_large_finetuned" and embed one query.
tokenizer = AutoTokenizer.from_pretrained("bge_large_finetuned")
model = AutoModel.from_pretrained("bge_large_finetuned")
model = model.to("cuda")

text = "What are the best investment strategies?"
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to("cuda")  # the encoded inputs must be on the same device as the model

# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)
    # Keep the hidden state at sequence position 0 for each input.
    embedding = outputs.last_hidden_state[:, 0, :]

print(embedding.shape)  # Should be (1, hidden_dim)


torch.Size([1, 1024])


In [2]:
# ✅ Reload Libraries
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import torch.nn.functional as F

# ✅ Load Tokenizer & Model Again (If Runtime Disconnected)
# NOTE(review): this loads the hub checkpoint named below, not the "bge_large_finetuned"
# directory saved earlier, so the numbers printed by this cell describe the checkpoint
# loaded here. Confirm that this is the intended model to evaluate.
model_name = "BAAI/bge-large-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to("cuda")
model.eval()  # Set to evaluation mode

# ✅ Function to Get Embeddings
def get_embedding(text):
    """Embed ``text`` with the notebook-level ``tokenizer`` and ``model``.

    The input is tokenized with padding and truncation to at most 512 tokens,
    moved to the GPU, and run through the model without gradient tracking.

    Args:
        text: Input passed straight to the tokenizer.

    Returns:
        A NumPy array (on the CPU) holding the hidden state at sequence
        position 0 for each encoded input.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to("cuda")

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        first_token_state = hidden_states[:, 0, :]

    # NumPy cannot read GPU memory, so copy the result back to the host first.
    return first_token_state.cpu().numpy()

# ✅ Example Test Texts
text1 = "What are the stock market trends for 2024?"
text2 = "Predicting financial trends using AI models."
text3 = "How to bake a chocolate cake?"  # deliberately off-topic

# ✅ Generate Embeddings (one call per text, in order)
embedding1, embedding2, embedding3 = (
    get_embedding(sample) for sample in (text1, text2, text3)
)

# ✅ Compute Cosine Similarity
# Each call returns a 1x1 matrix here; [0][0] extracts the scalar.
related_matrix = cosine_similarity(embedding1, embedding2)
unrelated_matrix = cosine_similarity(embedding1, embedding3)
sim1 = related_matrix[0][0]    # finance vs. finance: expected to be the larger value
sim2 = unrelated_matrix[0][0]  # finance vs. baking: expected to be the smaller value

# What matters is the gap between the two scores, not their absolute values.
print(f"Similarity between related texts: {sim1:.4f}")
print(f"Similarity between unrelated texts: {sim2:.4f}")

# ✅ Compute MSE Loss
# Same quantity the training cell minimises: mean squared value of the embedding measured
# against an all-zero reference. It reflects embedding magnitude, not retrieval quality.
embedding1_tensor = torch.tensor(embedding1)
true_embedding = torch.zeros_like(embedding1_tensor)  # all-zero reference vector
mse_loss = F.mse_loss(embedding1_tensor, true_embedding)
print(f"MSE Loss: {mse_loss.item():.4f}")

print("✅ Evaluation Complete!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Similarity between related texts: 0.7879
Similarity between unrelated texts: 0.5912
MSE Loss: 0.3939
✅ Evaluation Complete!


In [3]:
from sklearn.metrics import precision_recall_fscore_support

# Hand-written illustration data, not model output (1 = relevant, 0 = irrelevant).
y_true = [1, 1, 0, 1, 0, 0, 1]  # reference labels
y_pred = [1, 1, 0, 1, 0, 1, 1]  # example predictions; differs from y_true at index 5 only

# With average="binary" the scores are reported for the positive class; the fourth
# returned element is not needed and is dropped.
binary_scores = precision_recall_fscore_support(
    y_true,
    y_pred,
    average="binary",
)
precision, recall, f1 = binary_scores[:3]

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Precision: 0.8000
Recall: 1.0000
F1 Score: 0.8889
