<a href="https://colab.research.google.com/github/sumanthkumar113/Semantic-Textual-Similarity/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Define the custom dataset class
class ParagraphDataset(Dataset):
    """Map-style dataset of paragraph pairs with a similarity label each.

    The ``text1``, ``text2`` and ``similarity`` columns of ``df`` are copied
    into plain Python lists at construction, so items are addressed by
    position rather than by the frame's index labels.
    """

    def __init__(self, df):
        # Pull the three columns out in a fixed order and keep them as lists.
        self.paragraphs1, self.paragraphs2, self.labels = (
            df[column].tolist() for column in ('text1', 'text2', 'similarity')
        )

    def __len__(self):
        """Return the number of labelled pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(text1, text2, similarity)`` for position ``idx``."""
        first = self.paragraphs1[idx]
        second = self.paragraphs2[idx]
        target = self.labels[idx]
        return first, second, target

# Load the pre-trained sentence transformer model once at module level;
# calculate_similarity() and main() below both use this shared `model` object.
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Load the dataset from the CSV file
def load_data(file_path):
    """Read the CSV at ``file_path`` and return it without incomplete rows.

    Every row that contains at least one NaN value is discarded; the rows
    that remain keep their original index labels.
    """
    return pd.read_csv(file_path).dropna()


# Calculate the similarity score between two paragraphs
def calculate_similarity(paragraph1, paragraph2):
    """Return the cosine similarity between two paragraphs as a Python float.

    Both paragraphs are embedded in a single ``model.encode`` call on the
    module-level ``model``. The result is a plain number that is detached
    from autograd, so it is suitable for inference and evaluation but cannot
    be used to back-propagate into the model.

    Args:
        paragraph1: First text.
        paragraph2: Second text.

    Returns:
        The cosine similarity of the two embeddings (nominally in [-1, 1]).
    """
    # No gradient tracking is needed: the score leaves this function via
    # ``.item()``, which would drop any autograd graph anyway.
    with torch.no_grad():
        embeddings = torch.from_numpy(model.encode([paragraph1, paragraph2]))
        # Keep a leading batch axis of size 1 so the default dim=1 reduction
        # runs over the embedding dimension.
        first, second = embeddings[0:1], embeddings[1:2]
        return torch.nn.functional.cosine_similarity(first, second).item()


# Main function
def _to_device(features, device):
    """Move every tensor value of a tokenizer feature dict onto ``device``."""
    return {
        key: value.to(device) if isinstance(value, torch.Tensor) else value
        for key, value in features.items()
    }


def _batch_cosine_similarity(paragraphs1, paragraphs2, device):
    """Return differentiable cosine similarities for aligned paragraph batches.

    Runs the module-level ``model`` forward (not ``encode``) so the returned
    1-D tensor stays attached to the autograd graph of the model parameters.
    """
    embeddings1 = model(_to_device(model.tokenize(paragraphs1), device))['sentence_embedding']
    embeddings2 = model(_to_device(model.tokenize(paragraphs2), device))['sentence_embedding']
    return torch.nn.functional.cosine_similarity(embeddings1, embeddings2, dim=1)


def main(file_path='/content/Precily_Text_Similarity.csv', num_epochs=3,
         batch_size=8, learning_rate=2e-5):
    """Fine-tune the module-level ``model`` on paragraph pairs and report test MSE.

    Loads the CSV, splits it 80/20 into train and test sets (fixed seed 42),
    trains the model to regress the cosine similarity of each pair onto the
    ``similarity`` column with an MSE loss, then prints the mean squared error
    of the predicted similarities on the test split.

    Args:
        file_path: Path to the input CSV with ``text1``, ``text2`` and
            ``similarity`` columns.
        num_epochs: Number of passes over the training split.
        batch_size: Number of pairs per batch for both loaders.
        learning_rate: Initial learning rate for the optimizer.
    """
    df = load_data(file_path)

    # Split the dataset into train and test sets
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Create data loaders over the train and test datasets
    train_loader = DataLoader(ParagraphDataset(train_df), batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(ParagraphDataset(test_df), batch_size=batch_size, shuffle=False)

    # Move the model to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Define optimizer and learning rate scheduler (linear decay, no warmup).
    # NOTE(review): transformers.AdamW is deprecated in favour of
    # torch.optim.AdamW; it is kept here because it is what this file imports.
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # Model training
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)
        for batches_seen, (paragraphs1, paragraphs2, labels) in enumerate(progress_bar, start=1):
            # Default collation yields float64 labels; match the float32 embeddings.
            labels = labels.to(device=device, dtype=torch.float32)

            # The similarities must come from a real forward pass: scores
            # rebuilt from Python floats carry no graph, so backward() would
            # never reach the model parameters and no weight would change.
            similarities = _batch_cosine_similarity(list(paragraphs1), list(paragraphs2), device)

            # Calculate loss and perform backpropagation
            loss = torch.nn.functional.mse_loss(similarities, labels)
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

            # Running mean over the batches processed so far in this epoch.
            progress_bar.set_postfix({'loss': total_loss / batches_seen})

    # Model evaluation on test set
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for paragraphs1, paragraphs2, labels in test_loader:
            true_labels.extend(labels.tolist())
            predictions.extend(
                calculate_similarity(p1, p2)
                for p1, p2 in zip(paragraphs1, paragraphs2)
            )

    # Calculate evaluation metrics
    mse = mean_squared_error(true_labels, predictions)
    print(f"Mean Squared Error: {mse}")




In [None]:
import tensorflow as tf


In [None]:
# NOTE(review): `train()` here only switches the module to training mode and
# returns it; it does not run any training (that is what main() is for).
model = model.train()

# Save the model to 'my_model.keras'.
# NOTE(review): if `model` is still the SentenceTransformer created above, this
# writes a SentenceTransformer model directory with that name, not a Keras
# `.keras` zip archive — confirm the intended format.
model.save('my_model.keras')

In [None]:
# The model is a torch module and has no Keras-style `summary()` method (the
# original call raised AttributeError); printing the module lists its layers.
print(model)

AttributeError: ignored