In [6]:
"""Multi-Source Embedding Analysis

This code supports the Katrina case analysis provided in Arbel & Hoffman, Generative Interpretation

Note, the model is best run in a high-RAM environment, possibly using a GPU. We used 250 GB of RAM, thanks to the University of Alabama High Power Compute Center

The script creates embeddings for a context sentence and a number of reference sentences, measures relative distances, normalizes them to the 0-1 range, and presents the results.
We are using the top 10 embedding models on the HuggingFace MTEB Leaderboard for STS tasks as they were on 10.1.23
(The leading model--Sionic v2--doesn't currently have an API)

"""

"Multi-Source Embedding Analysis\n\nThis code supports the Katrina case analysis provided in Arbel & Hoffman, Generative Interpretation\n\nNote, the model is best run in a high-ram enviornment, possibly using a GPU. We used 250 GB of RAM, thanks to the University of Alabama High Power Compute Center\n\nThe scripts creates embeddings for a context sentence and a number of reference sentences, measure relative distances, normalizes them between 0-1, and presents results.\nWe are using the top 10 embedding models on the HuggingFace MTEB Leaderboard for STS tasks as they were on 10.1.23\n(The leading model--Sionic v2--doesn't currently have an API)\n\n"

In [9]:

import os

# The cache location must be exported BEFORE anything that pulls in
# `transformers` is imported (sentence_transformers and InstructorEmbedding
# both do): the library resolves its cache directory while it is being
# imported, so assigning the variable afterwards is too late to take effect.
os.environ['TRANSFORMERS_CACHE'] = '/scratch/yaarbel'

import gc
import pickle

import numpy as np
import pandas as pd
import requests
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from InstructorEmbedding import INSTRUCTOR



1
1
1


In [10]:
def cos_sim(a, b):
    """Return the cosine similarity of ``a`` and ``b``.

    The value is ``np.dot(a, b)`` divided by the product of
    ``np.linalg.norm(a)`` and ``np.linalg.norm(b)``.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

def fetch_embeddings(inputs, timeout=60):
    """Request embeddings for ``inputs`` from the Sionic HTTP endpoint.

    Args:
        inputs: JSON-serialisable payload sent under the ``"inputs"`` key.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        A ``np.ndarray`` built from the ``"embedding"`` field of the JSON
        reply, or ``None`` when the request could not be completed or the
        server answered with a non-200 status (the error is printed).
    """
    try:
        response = requests.post(
            'https://api.sionic.ai/v1/embedding',
            headers={"Content-Type": "application/json"},
            json={"inputs": inputs},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Connection failures and timeouts follow the same "print and return
        # None" contract as an HTTP error status.
        print("Error:", exc)
        return None

    if response.status_code != 200:
        print("Error:", response.content)
        return None

    return np.array(response.json()['embedding'])

def calculate_similarities(all_embeddings, sentences, context):
    """Rank ``sentences`` by cosine similarity to the context embedding.

    Args:
        all_embeddings: Sequence of embeddings whose first entry belongs to
            the context and whose remaining entries line up, in order, with
            ``sentences``.
        sentences: The reference sentences that were embedded.
        context: The context string; any sentence equal to it is dropped
            from the result.

    Returns:
        A ``pandas.DataFrame`` with columns ``Term``, ``Normalized Similarity``
        (min-max scaled over the kept sentences), ``Raw Similarity`` and
        ``Rank`` (1 = most similar), sorted by descending similarity. When
        every kept score is identical the normalised value is 0.0 instead of
        NaN; when no sentence is kept the frame is empty.

    Raises:
        ValueError: If ``all_embeddings`` is ``None`` or does not contain
            exactly one embedding per sentence plus one for the context.
    """
    if all_embeddings is None:
        raise ValueError("No embeddings were provided")
    if len(all_embeddings) != len(sentences) + 1:
        # A mismatch would silently pair sentences with the wrong scores.
        raise ValueError(
            f"Expected {len(sentences) + 1} embeddings (context + sentences), "
            f"got {len(all_embeddings)}"
        )

    context_embedding = all_embeddings[0]
    sentence_embeddings = all_embeddings[1:]
    if len(sentences) > 0:
        context_sim_scores = cosine_similarity([context_embedding], sentence_embeddings)[0]
    else:
        context_sim_scores = np.array([])

    # Dynamically set unwanted terms to the current context
    unwanted_terms = {context}
    print(f"Unwanted terms: {unwanted_terms}")  # Debugging

    # One mask drives both the sentence filter and the score filter so the
    # two can never drift out of alignment.
    keep_mask = np.array([s not in unwanted_terms for s in sentences], dtype=bool)
    filtered_sentences = [s for s, keep in zip(sentences, keep_mask) if keep]
    print(f"Filtered sentences: {filtered_sentences}")  # Debugging

    filtered_scores = np.asarray(context_sim_scores)[keep_mask]

    # Min-max normalise, guarding the empty and zero-range cases that would
    # otherwise raise or divide by zero.
    if filtered_scores.size == 0:
        normalized_scores = filtered_scores.copy()
    else:
        min_val, max_val = np.min(filtered_scores), np.max(filtered_scores)
        score_range = max_val - min_val
        if score_range == 0:
            normalized_scores = np.zeros_like(filtered_scores)
        else:
            normalized_scores = (filtered_scores - min_val) / score_range

    # Create DataFrame with both Raw and Normalized Similarity
    df = pd.DataFrame({
        'Term': filtered_sentences,
        'Normalized Similarity': normalized_scores,
        'Raw Similarity': filtered_scores
    })

    # Sort by Normalized Similarity
    df = df.sort_values(by='Normalized Similarity', ascending=False).reset_index(drop=True)

    # Add a Rank column
    df['Rank'] = df.index + 1

    return df



def save_embeddings(model_name, embeddings):
    """Pickle ``embeddings`` into a per-model directory.

    The target is ``<output_directory_base>/<model_name>_Embeddings/
    <embedding_output_file>``, with both settings read from ``config``, which
    is looked up as a global at call time and must therefore exist at module
    scope before this function is called.

    Args:
        model_name: Used to build the directory name.
        embeddings: Any picklable object.
    """
    directory = os.path.join(config['output_directory_base'], f"{model_name}_Embeddings")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)
    filepath = os.path.join(directory, config['embedding_output_file'])
    with open(filepath, 'wb') as f:
        pickle.dump(embeddings, f)
    # Report only after the file has been closed (and thus flushed).
    print(f"Saved embeddings to {filepath}")

def save_to_csv(model_name, df):
    """Append ``df`` to the shared results CSV, tagged with ``model_name``.

    The file path is ``<output_directory_base>/<csv_output_file>``; both
    settings are read from ``config``, which is looked up as a global at call
    time and must therefore exist at module scope before this function is
    called. The header row is written only when the file is still empty.

    Args:
        model_name: Value written to the added ``Model`` column.
        df: Results frame. It is not modified; the ``Model`` column is added
            to a copy.
    """
    output_dir = config['output_directory_base']
    if output_dir:
        # Appending fails if the directory is missing, so create it up front.
        os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, config['csv_output_file'])

    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    labelled = df.assign(Model=model_name)

    # newline='' lets the CSV writer control line endings itself.
    with open(filepath, 'a', newline='') as f:
        labelled.to_csv(f, header=f.tell() == 0, index=False)  # Write header only if file is empty


def save_to_excel(writer, model_name, df):
    """Write ``df`` to its own worksheet of ``writer``, named after the model.

    Excel worksheet names may not contain any of ``[ ] : * ? / \\`` and are
    limited to 31 characters, while the model ids used in this notebook
    contain ``/`` and can be longer. The sheet name is therefore the model
    name with those characters replaced by ``_`` and cut to 31 characters;
    names that are already valid are used unchanged.

    NOTE(review): two long model names sharing the same first 31 characters
    would map to the same sheet.

    Args:
        writer: Passed through to ``df.to_excel`` as the target.
        model_name: Source of the worksheet name.
        df: Frame to write (without its index).
    """
    forbidden = str.maketrans({ch: '_' for ch in '[]:*?/\\'})
    sheet_name = model_name.translate(forbidden)[:31]
    df.to_excel(writer, sheet_name=sheet_name, index=False)

def process_model(model_name, model, sentences, context, is_instructor=False):
    """Embed the context and sentences with one model and save the ranking.

    Any failure is printed and swallowed on purpose so that one broken model
    does not stop the remaining models from being processed. Memory is
    released (garbage collection plus CUDA cache) before returning.

    Args:
        model_name: Identifier used in logs and in the saved results. The
            special value ``'sionic-v1'`` routes through ``fetch_embeddings``
            instead of calling ``model.encode``.
        model: Object exposing ``encode(inputs)``; unused for ``'sionic-v1'``.
        sentences: Iterable of reference sentences.
        context: The context sentence embedded first.
        is_instructor: When true, every input is wrapped as
            ``[instruction, text]`` before encoding.

    Returns:
        None.
    """
    print(f"Processing {model_name}...")
    df = None
    all_embeddings = None
    try:
        if is_instructor:
            instruction = "Represent the sentence; Input: "
            all_inputs = [[instruction, text] for text in (context, *sentences)]
            all_embeddings = model.encode(all_inputs)
        else:
            # Unpacking (rather than list + list) also accepts tuples.
            all_inputs = [context, *sentences]
            if model_name == 'sionic-v1':
                all_embeddings = fetch_embeddings(all_inputs)
            else:
                all_embeddings = model.encode(all_inputs)

        if all_embeddings is None:
            # fetch_embeddings signals failure with None; report that here
            # instead of letting it surface as a TypeError further down.
            raise RuntimeError("no embeddings were returned")

        df = calculate_similarities(all_embeddings, sentences, context)
        if df is not None:  # Only attempt to save if df is not None
            save_to_csv(model_name, df)
        print(f"Successfully processed {model_name}")
    except Exception as e:
        print(f"Error while processing {model_name}: {e}")
    finally:
        # Drop the references before collecting so the memory can be reclaimed.
        df = None
        all_embeddings = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

   
def process_context(context, sentences, models, output_directory_base='/bighome/yaarbel/temp/'):
    """Run every model against one context and log completion.

    Args:
        context: The context sentence. Its first 20 characters are embedded
            in the output file names to keep them manageable.
        sentences: Reference sentences forwarded to ``process_model``.
        models: Iterable of ``(model_name, model, is_instructor)`` tuples.
        output_directory_base: Directory that receives the output files.
            Defaults to the location originally hard-coded here.
    """
    # save_to_csv / save_embeddings read `config` as a module-level name, so
    # it has to be published globally; as a plain local it was invisible to
    # them and every save failed with a NameError.
    global config
    config = {
        'output_directory_base': output_directory_base,
        'csv_output_file': f'All_Model_Results_{context[:20]}.csv',  # Using the first 20 characters of context to keep filename manageable
        'embedding_output_file': f'embeddings_{context[:20]}.pkl',   # Similarly, for the embeddings filename
    }
    for model_name, model, is_instructor in models:
        process_model(model_name, model, sentences, context, is_instructor)
    print(f"Processed for context: {context[:50]}...")  # Printing the first 50 characters of the context for a brief log

# Reference terms that are scored against each context
sentences = [
    "flood",
    "broken water main",
    "heavy rainfall",
    "severe storm",
    "dam failure",
    "tears of joy",
    "construction near a water body",
    "irrigation canals overflow",
    "improper drainage",
    "broken levee",
    "burst pipe",
    "monsoon rains",
    "tsunami",
    "wind",
    "police",
    "fire",
]

# Each entry becomes (model id, loaded model, is_instructor). The loader class
# is called with the model id, in the order listed; a loader of None leaves the
# model slot empty (process_model fetches 'sionic-v1' embeddings over HTTP).
models = [
    (model_id, loader(model_id) if loader is not None else None, is_instructor)
    for model_id, loader, is_instructor in [
        ('sionic-v1', None, False),
        ('sentence-transformers/sentence-t5-xxl', SentenceTransformer, False),
        ('thenlper/gte-large', SentenceTransformer, False),
        ('thenlper/gte-base', SentenceTransformer, False),
        ('thenlper/gte-small', SentenceTransformer, False),
        ('hkunlp/instructor-large', INSTRUCTOR, True),
        ('hkunlp/instructor-xl', INSTRUCTOR, True),
        ('hkunlp/instructor-base', INSTRUCTOR, True),
        ('BAAI/bge-large-en', SentenceTransformer, False),
        ('BAAI/bge-base-en-v1.5', SentenceTransformer, False),
    ]
]


In [None]:
# Contexts to analyse: three short exception descriptions and one longer
# exclusion clause (written as adjacent literals that join into one string).
contexts = [
    "A flood exception in an insurance policy",
    "A natural disaster exception in an insurance policy",
    "A asdf123 exception in an insurance policy",
    (
        "We do not insure for loss caused directly or indirectly by any of the following. "
        "Such loss is excluded regardless of any other cause or event contributing "
        "concurrently or in any sequence to the loss. "
        "Water Damage, meaning:  Flood, surface water, waves, tidal water, "
        "overflow of a body of water, or spray from any of these, "
        "whether or not driven by wind."
    ),
]


# Score every reference sentence against each context in turn
for context in contexts:
    process_context(context=context, sentences=sentences, models=models)