In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn
import torch.optim as optim
import spacy

In [5]:
# Load and preprocess your CSV data
df = pd.read_csv("training_data.csv")
texts1 = df['Text1'].tolist()
texts2 = df['Text2'].tolist()
similarities = df['Similarity'].tolist()

In [10]:
# Load Sentence Transformer model
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Load SpaCy dependency parser
nlp = spacy.load("en_core_web_sm")

In [14]:
class TextSimilarityModel(nn.Module):
    def __init__(self, input_size):
        super(TextSimilarityModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 1)
        
    def forward(self, x):
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [23]:
def combine_features(embeddings, dep_features):
    combined = torch.cat((embeddings, dep_features), dim=0)  # Concatenate along rows (dimension 0)
    return combined

In [24]:
# Create your model instance
input_size = model.get_sentence_embedding_dimension() + 21  # Add size of dependency features
similarity_model = TextSimilarityModel(input_size)

# Define loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(similarity_model.parameters(), lr=0.001)

In [25]:
num_epochs = 10  # You can adjust this
for epoch in range(num_epochs):
    for text1, text2, similarity in zip(texts1, texts2, similarities):
        # Preprocess text and perform dependency parsing with SpaCy
        doc1 = nlp(text1)
        doc2 = nlp(text2)
        
        num_unique_relations = 21

        # Create a dictionary to map dependency relations to unique indices
        dependency_index_map = {'nsubj': 0, 'dobj': 1, 'prep': 2}  # Add more if needed

        # Extract relevant features from doc1's dependency parse
        dep_features1 = torch.zeros(num_unique_relations)  # Initialize with zeros
        for token in doc1:
            if token.dep_ in dependency_index_map:  # Check if the dependency relation is relevant
                index = dependency_index_map[token.dep_]  # Get the unique index for the relation
                dep_features1[index] = 1  # Set the corresponding index to 1

        # Extract relevant features from doc2's dependency parse
        dep_features2 = torch.zeros(num_unique_relations)  # Initialize with zeros
        for token in doc2:
            if token.dep_ in dependency_index_map:  # Check if the dependency relation is relevant
                index = dependency_index_map[token.dep_]  # Get the unique index for the relation
                dep_features2[index] = 1  # Set the corresponding index to 1


        # Convert NumPy arrays to PyTorch tensors
        dep_features1 = torch.tensor(dep_features1, dtype=torch.float32)
        dep_features2 = torch.tensor(dep_features2, dtype=torch.float32)

        # Convert sentence embeddings to PyTorch tensors
        embeddings1 = torch.tensor(embeddings1, dtype=torch.float32)
        embeddings2 = torch.tensor(embeddings2, dtype=torch.float32)

        # Combine features
        combined_features1 = combine_features(embeddings1, dep_features1)
        combined_features2 = combine_features(embeddings2, dep_features2)


        # Forward pass
        output = similarity_model(combined_features1 - combined_features2)

        # Calculate loss
        loss = criterion(output, torch.tensor(similarity, dtype=torch.float32))

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Save the trained model
torch.save(similarity_model.state_dict(), "./third_party/fine_tuned_model_2")

  dep_features1 = torch.tensor(dep_features1, dtype=torch.float32)
  dep_features2 = torch.tensor(dep_features2, dtype=torch.float32)
  embeddings1 = torch.tensor(embeddings1, dtype=torch.float32)
  embeddings2 = torch.tensor(embeddings2, dtype=torch.float32)
  return F.mse_loss(input, target, reduction=self.reduction)


In [27]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn
import torch.optim as optim
import spacy

# Load and preprocess your CSV data
df = pd.read_csv("training_data.csv")
texts1 = df['Text1'].tolist()
texts2 = df['Text2'].tolist()
similarities = df['Similarity'].tolist()

# Load Sentence Transformer model
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Load SpaCy dependency parser
nlp = spacy.load("en_core_web_sm")

class TextSimilarityModel(nn.Module):
    def __init__(self, input_size):
        super(TextSimilarityModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

def combine_features(embeddings, dep_features):
    combined = torch.cat((embeddings, dep_features), dim=0)  # Concatenate along columns (dimension 1)
    return combined

# Create your model instance
input_size = model.get_sentence_embedding_dimension() + 21  # Add size of dependency features
similarity_model = TextSimilarityModel(input_size)

# Define loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(similarity_model.parameters(), lr=0.001)

num_epochs = 10  # You can adjust this
for epoch in range(num_epochs):
    for text1, text2, similarity in zip(texts1, texts2, similarities):
        # Preprocess text and perform dependency parsing with SpaCy
        doc1 = nlp(text1)
        doc2 = nlp(text2)
        
        num_unique_relations = 21

        # Create a dictionary to map dependency relations to unique indices
        dependency_index_map = {'nsubj': 0, 'dobj': 1, 'prep': 2}  # Add more if needed

        # Extract relevant features from doc1's dependency parse
        dep_features1 = torch.zeros(num_unique_relations)  # Initialize with zeros
        for token in doc1:
            if token.dep_ in dependency_index_map:  # Check if the dependency relation is relevant
                index = dependency_index_map[token.dep_]  # Get the unique index for the relation
                dep_features1[index] = 1  # Set the corresponding index to 1

        # Extract relevant features from doc2's dependency parse
        dep_features2 = torch.zeros(num_unique_relations)  # Initialize with zeros
        for token in doc2:
            if token.dep_ in dependency_index_map:  # Check if the dependency relation is relevant
                index = dependency_index_map[token.dep_]  # Get the unique index for the relation
                dep_features2[index] = 1  # Set the corresponding index to 1


        # Convert NumPy arrays to PyTorch tensors
        dep_features1 = torch.tensor(dep_features1, dtype=torch.float32)
        dep_features2 = torch.tensor(dep_features2, dtype=torch.float32)

        # Convert sentence embeddings to PyTorch tensors
        embeddings1 = torch.tensor(embeddings1, dtype=torch.float32)
        embeddings2 = torch.tensor(embeddings2, dtype=torch.float32)

        # Combine features
        combined_features1 = combine_features(embeddings1, dep_features1)
        combined_features2 = combine_features(embeddings2, dep_features2)


        # Forward pass
        output = similarity_model(combined_features1 - combined_features2)

        # Calculate loss
        loss = criterion(output, torch.tensor(similarity, dtype=torch.float32))

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Save the trained model
torch.save(similarity_model.state_dict(), "./third_party/fine_tuned_model_2")


  dep_features1 = torch.tensor(dep_features1, dtype=torch.float32)
  dep_features2 = torch.tensor(dep_features2, dtype=torch.float32)
  embeddings1 = torch.tensor(embeddings1, dtype=torch.float32)
  embeddings2 = torch.tensor(embeddings2, dtype=torch.float32)
  return F.mse_loss(input, target, reduction=self.reduction)


In [30]:
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
import spacy

# Load Sentence Transformer model
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Load SpaCy dependency parser
nlp = spacy.load("en_core_web_sm")

class TextSimilarityModel(nn.Module):
    def __init__(self, input_size):
        super(TextSimilarityModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

def combine_features(embeddings, dep_features):
    embeddings = torch.tensor(embeddings, dtype=torch.float32)
    dep_features = torch.tensor(dep_features, dtype=torch.float32)
    combined = torch.cat((embeddings, dep_features), dim=0)  # Concatenate along columns (dimension 1)
    return combined

# Create your model instance
input_size = model.get_sentence_embedding_dimension() + 21  # Add size of dependency features
similarity_model = TextSimilarityModel(input_size)

# Load the saved model's state dictionary
saved_model_path = "./third_party/fine_tuned_model_2"
similarity_model.load_state_dict(torch.load(saved_model_path))

# Set the model to evaluation mode
similarity_model.eval()

# Example text pairs for testing
text1 = "Recurrent Neural Networks (RNNs) are designed to process sequential data."
text2 = "RNNs are used for processing sequential data."

# Preprocess text and perform dependency parsing with SpaCy
doc1 = nlp(text1)
doc2 = nlp(text2)

# Create a dictionary to map dependency relations to unique indices
dependency_index_map = {'nsubj': 0, 'dobj': 1, 'prep': 2}  # Add more if needed

        # Extract relevant features from doc1's dependency parse
dep_features1 = torch.zeros(num_unique_relations)  # Initialize with zeros
for token in doc1:
    if token.dep_ in dependency_index_map:  # Check if the dependency relation is relevant
        index = dependency_index_map[token.dep_]  # Get the unique index for the relation
        dep_features1[index] = 1  # Set the corresponding index to 1

# Extract relevant features from doc2's dependency parse
dep_features2 = torch.zeros(num_unique_relations)  # Initialize with zeros
for token in doc2:
    if token.dep_ in dependency_index_map:  # Check if the dependency relation is relevant
        index = dependency_index_map[token.dep_]  # Get the unique index for the relation
        dep_features2[index] = 1  # Set the corresponding index to 1


# Encode sentence embeddings
embeddings1 = model.encode([text1])[0]
embeddings2 = model.encode([text2])[0]

# Combine features
combined_features1 = combine_features(embeddings1, dep_features1)
combined_features2 = combine_features(embeddings2, dep_features2)

# Forward pass
output = similarity_model(combined_features1 - combined_features2)

# Print the predicted similarity score
print("Predicted Similarity:", output.item())


Predicted Similarity: 4.39482307434082


  dep_features = torch.tensor(dep_features, dtype=torch.float32)
