# Python Code Description: `save_similar_sentences` Function

The updated Python code defines a function `save_similar_sentences` that processes a DataFrame to find semantically similar sentences and saves these groups into separate CSV files within a specifically named folder.

## Function: `save_similar_sentences`

### Parameters
- **dataframe_path**: A string representing the path to the CSV file that contains the DataFrame.
- **model_name** (optional): A string specifying the pre-trained Sentence Transformer model to use, defaulting to `'all-MiniLM-L6-v2'`.
- **threshold** (optional): A float representing the similarity threshold for considering sentences as similar, defaulting to `0.60`.

### Functionality
1. **Model Loading**:
   - Loads the Sentence Transformer model specified by `model_name`.

2. **DataFrame Loading and Validation**:
   - Reads the CSV file from `dataframe_path` into a DataFrame.
   - Validates that the DataFrame is not empty, raising a `ValueError` if it is.

3. **DataFrame Column Renaming**:
   - Renames the column 'ratingName' in the DataFrame to 'rating_value'.

4. **Sentence Embeddings**:
   - Computes embeddings for all sentences in the DataFrame using the loaded model.

5. **Nested Function: Finding Similar Sentences**:
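   - A nested function computes cosine similarity scores between a target embedding and all embeddings, and returns the indices (most similar first) of sentences whose score is at least the similarity threshold and below 0.99. The 0.99 upper cutoff leaves out the target sentence itself as well as near-verbatim duplicates.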

6. **Folder Creation**:
   - Extracts the base name of the DataFrame file, removes its file extension, and creates a folder with this name if it does not already exist.

7. **Sentence Processing**:
   - Iterates over each sentence in the DataFrame.
   - For each sentence, finds similar sentences based on the computed embeddings and the similarity threshold.
   - Groups the original sentence and its similar sentences, and saves this group into a CSV file inside the created folder.
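   - Skips sentences that were already placed in an earlier group when choosing the next target sentence, so no sentence starts more than one group. (A sentence that was already grouped can still appear as a similar sentence in a later group.)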

### Output
- The function outputs CSV files in a folder named after the DataFrame. Each file contains a group of semantically similar sentences.

### Example Usage
```python
save_similar_sentences('14_mask_masks_wear_wearing.csv')
```


Version 1: 

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
import os

def save_similar_sentences(dataframe_path, model_name='all-MiniLM-L6-v2', threshold=0.60):
    """Group semantically similar sentences from a CSV file and save each group.

    The CSV is read into a DataFrame, its ``text`` column is embedded with a
    Sentence Transformer model, and for each not-yet-grouped row the rows
    whose cosine similarity to it lies in ``[threshold, 0.99)`` are collected.
    Every group is written to ``narrative_<n>.csv`` inside a folder named
    after the input file (without extension), relative to the current
    working directory.

    Args:
        dataframe_path: Path to the input CSV file. It must contain a
            ``text`` column; a ``ratingName`` column, if present, is renamed
            to ``rating_value``.
        model_name: Name of the pre-trained Sentence Transformer model.
        threshold: Minimum cosine similarity for a sentence to join a group.

    Raises:
        ValueError: If the CSV contains no rows.
        KeyError: If the CSV has no ``text`` column.
    """
    # Load and validate the data first so that an empty file fails before
    # the model is loaded.
    df = pd.read_csv(dataframe_path)
    if df.empty:
        raise ValueError("The DataFrame is empty.")

    # Rename 'ratingName' column to 'rating_value'
    df.rename(columns={'ratingName': 'rating_value'}, inplace=True)

    # Pre-compute embeddings for all sentences
    model = SentenceTransformer(model_name)
    all_sentences = df['text'].tolist()
    all_embeddings = model.encode(all_sentences, convert_to_tensor=True)

    # Scores at or above this value are dropped. NOTE(review): presumably
    # meant to exclude the target sentence itself (self-similarity ~1.0);
    # it also excludes near-verbatim duplicates - confirm that is intended.
    upper_cutoff = 0.99

    def find_similar_sentences(target_embedding, min_score):
        """Return row positions with min_score <= similarity < upper_cutoff, best first."""
        cos_scores = util.cos_sim(target_embedding, all_embeddings)[0]
        ranked = cos_scores.argsort(descending=True)
        ranked_scores = cos_scores[ranked]
        keep = (ranked_scores >= min_score) & (ranked_scores < upper_cutoff)
        return ranked[keep].tolist()

    # Create a folder named after the DataFrame file
    folder_name = os.path.splitext(os.path.basename(dataframe_path))[0]
    os.makedirs(folder_name, exist_ok=True)

    # Row positions already written to some group; they are not used as
    # targets again (they may still show up as neighbours of a later target).
    processed_sentences = set()

    # File counter for naming
    file_counter = 1

    # Iterate by row position: embeddings and .iloc are both positional.
    for i in range(len(df)):
        if i in processed_sentences:
            continue

        # Forward the caller's threshold (previously the default 0.60 was
        # always used, ignoring the argument).
        similar_indices = find_similar_sentences(all_embeddings[i], threshold)

        # Target row first, followed by its neighbours in descending similarity
        group_positions = [i] + similar_indices
        similar_rows = df.iloc[group_positions]

        # Save to CSV in the created folder
        filename = os.path.join(folder_name, f'narrative_{file_counter}.csv')
        similar_rows.to_csv(filename, index=False)
        file_counter += 1

        # Mark as processed
        processed_sentences.update(group_positions)

# Example usage
# save_similar_sentences('14_mask_masks_wear_wearing.csv')


In [2]:
# Run Version 1 on the example topic file. Each group is written as
# narrative_<n>.csv into a folder named '14_mask_masks_wear_wearing'.
save_similar_sentences('14_mask_masks_wear_wearing.csv')


Version 2: 

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
import os
import glob

def save_similar_sentences(dataframe_path, narratives_root_folder, model_name='all-MiniLM-L6-v2', threshold=0.60):
    """Group semantically similar sentences from a CSV file and save each group.

    The CSV is read into a DataFrame, its ``text`` column is embedded with a
    Sentence Transformer model, and for each not-yet-grouped row the rows
    whose cosine similarity to it lies in ``[threshold, 0.99)`` are collected.
    Every group is written to
    ``<narratives_root_folder>/Narratives_<base>/Narratives_<base>_<n>.csv``,
    where ``<base>`` is the input file name without its extension.

    Args:
        dataframe_path: Path to the input CSV file. It must contain a
            ``text`` column; a ``ratingName`` column, if present, is renamed
            to ``rating_value``.
        narratives_root_folder: Folder under which the per-file output
            folder is created.
        model_name: Name of the pre-trained Sentence Transformer model.
        threshold: Minimum cosine similarity for a sentence to join a group.

    Raises:
        ValueError: If the CSV contains no rows.
        KeyError: If the CSV has no ``text`` column.
    """
    # Load and validate the data first so that an empty file fails before
    # the model is loaded and before any output folder is created.
    df = pd.read_csv(dataframe_path)
    if df.empty:
        raise ValueError("The DataFrame is empty.")

    # Rename 'ratingName' column to 'rating_value'
    df.rename(columns={'ratingName': 'rating_value'}, inplace=True)

    # Pre-compute embeddings for all sentences
    model = SentenceTransformer(model_name)
    all_sentences = df['text'].tolist()
    all_embeddings = model.encode(all_sentences, convert_to_tensor=True)

    # Determine the specific output folder for this CSV file, placing "Narratives" at the start
    base_filename = os.path.splitext(os.path.basename(dataframe_path))[0]
    output_folder = os.path.join(narratives_root_folder, f'Narratives_{base_filename}')
    os.makedirs(output_folder, exist_ok=True)

    # Scores at or above this value are dropped. NOTE(review): presumably
    # meant to exclude the target sentence itself (self-similarity ~1.0);
    # it also excludes near-verbatim duplicates - confirm that is intended.
    upper_cutoff = 0.99

    def find_similar_sentences(target_embedding, min_score):
        """Return row positions with min_score <= similarity < upper_cutoff, best first."""
        cos_scores = util.cos_sim(target_embedding, all_embeddings)[0]
        ranked = cos_scores.argsort(descending=True)
        ranked_scores = cos_scores[ranked]
        keep = (ranked_scores >= min_score) & (ranked_scores < upper_cutoff)
        return ranked[keep].tolist()

    # Row positions already written to some group; they are not used as
    # targets again (they may still show up as neighbours of a later target).
    processed_sentences = set()

    # File counter for naming
    file_counter = 1

    # Iterate by row position: embeddings and .iloc are both positional.
    for i in range(len(df)):
        if i in processed_sentences:
            continue

        # Forward the caller's threshold (previously the default 0.60 was
        # always used, ignoring the argument).
        similar_indices = find_similar_sentences(all_embeddings[i], threshold)

        # Target row first, followed by its neighbours in descending similarity
        group_positions = [i] + similar_indices
        similar_rows = df.iloc[group_positions]

        # Save to CSV in the specific output folder, with "Narratives" at the start
        filename = os.path.join(output_folder, f'Narratives_{base_filename}_{file_counter}.csv')
        similar_rows.to_csv(filename, index=False)
        file_counter += 1

        # Mark as processed
        processed_sentences.update(group_positions)

def process_folders(start=1, end=2):
    """Run ``save_similar_sentences`` on every topic CSV of a range of runs.

    For each run number from ``start`` to ``end`` (both inclusive) the folder
    ``BERTopic_run_<n>`` is searched for files matching
    ``BERTopic_run_<n>_Topic_*.csv``; each match is processed with its
    results placed under ``BERTopic_run_<n>/Narratives Results``, which is
    created when it does not exist yet.

    Args:
        start: First run number to process.
        end: Last run number to process (inclusive).
    """
    run_number = start
    while run_number <= end:
        run_dir = f'BERTopic_run_{run_number}'
        results_dir = os.path.join(run_dir, 'Narratives Results')

        # Make sure the results folder is there before any file is processed
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)

        topic_pattern = os.path.join(run_dir, f'BERTopic_run_{run_number}_Topic_*.csv')
        for topic_csv in glob.glob(topic_pattern):
            save_similar_sentences(topic_csv, results_dir)

        run_number += 1

# Example usage: process only folder BERTopic_run_1 ('end' is inclusive)
process_folders(start=1, end=1)


In [3]:
# Example usage: process folders BERTopic_run_<start> to BERTopic_run_<end>
# (both bounds inclusive). Adjust 'start' and 'end' according to your needs.
process_folders(start=1, end=1)
