<a href="https://colab.research.google.com/github/tubagokhan/RedScore/blob/main/CosineLexixalNLIV2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import nltk
# Fetch the "punkt_tab" NLTK package before any tokenization is attempted.
# NOTE(review): presumably this is the data sent_tokenize/word_tokenize load
# at runtime on recent NLTK releases — confirm against the installed version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel, pipeline
import torch
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import nltk
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize

# Download NLTK tokenizer models (only needed once)
nltk.download("punkt")

# Load model and tokenizer for embeddings
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
print(f"Loading embedding model: {embedding_model_name}")
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)

# Load NLI model.
# top_k=None asks the pipeline for the score of every label; it replaces the
# deprecated return_all_scores=True flag. The scoring functions below look
# labels up by name, so the order in which scores come back does not matter.
nli_model_name = "roberta-large-mnli"
print(f"Loading NLI model: {nli_model_name}")
nli_pipeline = pipeline("text-classification", model=nli_model_name, top_k=None)

# Function to split text into sentences using NLTK
def split_into_sentences(passage):
    """Segment ``passage`` into sentences via NLTK's ``sent_tokenize``.

    Args:
        passage: The text to segment.

    Returns:
        The result of ``sent_tokenize(passage)``; callers in this file treat
        it as a sequence of sentence strings.
    """
    return sent_tokenize(passage)

# Function to calculate embeddings
def calculate_embeddings(sentences):
    """Embed sentences using attention-mask-aware mean pooling.

    The sentences are tokenized as one padded batch. Averaging the raw
    ``last_hidden_state`` over every position would let padding tokens leak
    into the vectors of the shorter sentences, so each token embedding is
    weighted by its attention-mask value before averaging.

    Args:
        sentences: The sentences to embed, passed to the module-level
            ``tokenizer`` as a single batch.

    Returns:
        A NumPy array holding one pooled vector per input sentence.
    """
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        token_embeddings = embedding_model(**inputs).last_hidden_state
        # Broadcast the (batch, tokens) mask across the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        # Clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        embeddings = summed / token_counts
    return embeddings.numpy()

# Function to calculate cosine similarity for a passage
def calculate_cosine_similarity(passage):
    """Average pairwise cosine similarity between a passage's sentences.

    Args:
        passage: Text that is split into sentences before scoring.

    Returns:
        ``0.0`` when the passage yields fewer than two sentences; otherwise
        the mean cosine similarity over every unordered sentence pair,
        rounded to four decimal places.
    """
    parts = split_into_sentences(passage)
    count = len(parts)
    if count < 2:
        return 0.0

    vectors = calculate_embeddings(parts)
    # Strict upper triangle: each unordered pair once, no self-similarity.
    pair_rows, pair_cols = np.triu_indices(count, k=1)
    upper_values = cosine_similarity(vectors)[pair_rows, pair_cols]
    return round(np.mean(upper_values), 4)

# Function to calculate lexical overlap for a passage
def calculate_lexical_overlap(passage):
    """Fraction of tokens in a passage that repeat an earlier token.

    Args:
        passage: Text that is split into sentences, re-joined with single
            spaces, lower-cased and word-tokenized.

    Returns:
        ``0.0`` when the passage yields fewer than two sentences; otherwise
        the number of repeated token occurrences divided by the total token
        count, rounded to four decimal places (``0`` if there are no tokens).
    """
    parts = split_into_sentences(passage)
    if len(parts) < 2:
        return 0.0

    tokens = word_tokenize(" ".join(parts).lower())
    if not tokens:
        return 0

    # Every occurrence beyond the first of a token is a repeat, so the number
    # of repeats equals total tokens minus distinct tokens.
    distinct = len(Counter(tokens))
    repeats = len(tokens) - distinct
    return round(repeats / len(tokens), 4)

# Function to calculate NLI score matrices for a passage
def calculate_nli_matrices(passage):
    """Average one-directional NLI scores over the sentence pairs of a passage.

    For every pair of sentences ``(i, j)`` with ``i < j``, sentence ``i`` is
    the premise and sentence ``j`` the hypothesis. The per-label scores are
    averaged over all such pairs.

    Args:
        passage: Text that is split into sentences before scoring.

    Returns:
        A tuple ``(entailment, neutral, contradiction)`` of mean scores, each
        rounded to four decimal places. ``(0.0, 0.0, 0.0)`` when the passage
        yields fewer than two sentences.
    """
    sentences = split_into_sentences(passage)
    num_sentences = len(sentences)

    if num_sentences < 2:
        return 0.0, 0.0, 0.0  # Return zeros if fewer than 2 sentences

    # Only pairs with i < j feed the final average, so only those are scored.
    # Premise and hypothesis are passed as a text/text_pair dict so that the
    # tokenizer inserts the model's own separator tokens; the literal string
    # "[SEP]" is ordinary text to the RoBERTa tokenizer, not a separator.
    pair_inputs = [
        {"text": sentences[i], "text_pair": sentences[j]}
        for i in range(num_sentences)
        for j in range(i + 1, num_sentences)
    ]

    scores_by_label = {"ENTAILMENT": [], "NEUTRAL": [], "CONTRADICTION": []}
    for pair_scores in nli_pipeline(pair_inputs):
        found = {entry["label"]: entry["score"] for entry in pair_scores}
        for label, values in scores_by_label.items():
            # A label the pipeline did not report counts as 0.
            values.append(found.get(label, 0.0))

    entailment_score = np.mean(scores_by_label["ENTAILMENT"])
    neutral_score = np.mean(scores_by_label["NEUTRAL"])
    contradiction_score = np.mean(scores_by_label["CONTRADICTION"])

    return round(entailment_score, 4), round(neutral_score, 4), round(contradiction_score, 4)

# Function to calculate bidirectional NLI score matrices for a passage
def calculate_nli_matrices_bidirectional(passage):
    """Average direction-symmetric NLI scores over a passage's sentence pairs.

    For every unordered pair of sentences the NLI model is run in both
    directions (``i -> j`` and ``j -> i``). The two per-label scores are
    averaged, and those pair averages are then averaged over all pairs.

    Args:
        passage: Text that is split into sentences before scoring.

    Returns:
        A tuple ``(entailment, neutral, contradiction)`` of mean scores, each
        rounded to four decimal places. ``(0.0, 0.0, 0.0)`` when the passage
        yields fewer than two sentences.
    """
    sentences = split_into_sentences(passage)
    num_sentences = len(sentences)

    if num_sentences < 2:
        return 0.0, 0.0, 0.0  # Return zeros if fewer than 2 sentences

    # Each unordered pair (i < j) is scored exactly once per direction; the
    # averaged value is symmetric, so scoring (j, i) again would add nothing.
    index_pairs = [
        (i, j)
        for i in range(num_sentences)
        for j in range(i + 1, num_sentences)
    ]
    # Premise and hypothesis go in as text/text_pair so the tokenizer inserts
    # the model's own separator tokens; the literal string "[SEP]" is ordinary
    # text to the RoBERTa tokenizer, not a separator.
    forward_inputs = [
        {"text": sentences[i], "text_pair": sentences[j]} for i, j in index_pairs
    ]
    backward_inputs = [
        {"text": sentences[j], "text_pair": sentences[i]} for i, j in index_pairs
    ]

    results = list(nli_pipeline(forward_inputs + backward_inputs))
    num_pairs = len(index_pairs)
    forward_results = results[:num_pairs]
    backward_results = results[num_pairs:]

    scores_by_label = {"ENTAILMENT": [], "NEUTRAL": [], "CONTRADICTION": []}
    for forward, backward in zip(forward_results, backward_results):
        forward_scores = {entry["label"]: entry["score"] for entry in forward}
        backward_scores = {entry["label"]: entry["score"] for entry in backward}
        for label, values in scores_by_label.items():
            # Average the two directions; an unreported label counts as 0.
            combined = forward_scores.get(label, 0.0) + backward_scores.get(label, 0.0)
            values.append(combined / 2)

    entailment_score = np.mean(scores_by_label["ENTAILMENT"])
    neutral_score = np.mean(scores_by_label["NEUTRAL"])
    contradiction_score = np.mean(scores_by_label["CONTRADICTION"])

    return round(entailment_score, 4), round(neutral_score, 4), round(contradiction_score, 4)

# Load the Excel file
input_file = "/content/SampleRedundancyCases.xlsx"  # Change to your file path
output_file = "/content/SampleRedundancyCases_Updated.xlsx"  # Output file path

print(f"Reading input file: {input_file}")
df = pd.read_excel(input_file)

# Ensure result columns exist (created in this order when absent, so the
# output workbook keeps a stable column layout).
result_columns = [
    "Cosine Similarity",
    "Lexical Overlap",
    "Entailment Score",
    "Neutral Score",
    "Contradiction Score",
    "Dominant Score",
    "Entailment Score (Bi)",
    "Neutral Score (Bi)",
    "Contradiction Score (Bi)",
    "Dominant Score (Bi)",
]
for column in result_columns:
    if column not in df.columns:
        df[column] = ""

# Process each row
print("Starting processing with both NLI methods...")
for index, row in df.iterrows():
    # Default to None rather than "": if an input column is absent the row is
    # then skipped as "missing" instead of being scored as an empty passage
    # (which would record all-zero scores and a bogus dominant label).
    sentence1 = row.get("Sentence 1")
    sentence2 = row.get("Sentence 2")

    if pd.isna(sentence1) or pd.isna(sentence2):
        print(f"Skipping row {index + 1}: Missing sentence(s)")
        continue

    # NOTE(review): the two cells are joined with a space and re-split by the
    # scoring functions; if "Sentence 1" has no terminal punctuation the
    # splitter may merge both into one sentence — confirm on real data.
    passage = f"{sentence1} {sentence2}"
    print(f"\nProcessing row {index + 1}...")

    # Calculate similarities and scores
    cosine_sim = calculate_cosine_similarity(passage)
    lexical_overlap = calculate_lexical_overlap(passage)
    entailment_score, neutral_score, contradiction_score = calculate_nli_matrices(passage)
    entailment_score_bi, neutral_score_bi, contradiction_score_bi = calculate_nli_matrices_bidirectional(passage)

    # Determine the dominant score for original NLI
    scores = {
        "Entailment": entailment_score,
        "Neutral": neutral_score,
        "Contradiction": contradiction_score,
    }
    dominant_score = max(scores, key=scores.get)

    # Determine the dominant score for bidirectional NLI
    scores_bi = {
        "Entailment (Bi)": entailment_score_bi,
        "Neutral (Bi)": neutral_score_bi,
        "Contradiction (Bi)": contradiction_score_bi,
    }
    dominant_score_bi = max(scores_bi, key=scores_bi.get)

    # Save results
    row_results = {
        "Cosine Similarity": cosine_sim,
        "Lexical Overlap": lexical_overlap,
        "Entailment Score": entailment_score,
        "Neutral Score": neutral_score,
        "Contradiction Score": contradiction_score,
        "Dominant Score": dominant_score,
        "Entailment Score (Bi)": entailment_score_bi,
        "Neutral Score (Bi)": neutral_score_bi,
        "Contradiction Score (Bi)": contradiction_score_bi,
        "Dominant Score (Bi)": dominant_score_bi,
    }
    for column, value in row_results.items():
        df.at[index, column] = value

# Save the results back to Excel
print(f"Saving results to {output_file}")
df.to_excel(output_file, index=False)
print("Processing complete!")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
Loading NLI model: roberta-large-mnli


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


Reading input file: /content/SampleRedundancyCases.xlsx
Starting processing with both NLI methods...

Processing row 1...

Processing row 2...

Processing row 3...

Processing row 4...

Processing row 5...

Processing row 6...

Processing row 7...

Processing row 8...

Processing row 9...

Processing row 10...

Processing row 11...

Processing row 12...

Processing row 13...

Processing row 14...

Processing row 15...

Processing row 16...

Processing row 17...

Processing row 18...

Processing row 19...

Processing row 20...

Processing row 21...

Processing row 22...

Processing row 23...

Processing row 24...

Processing row 25...

Processing row 26...

Processing row 27...

Processing row 28...

Processing row 29...

Processing row 30...

Processing row 31...

Processing row 32...

Processing row 33...

Processing row 34...

Processing row 35...

Processing row 36...

Processing row 37...

Processing row 38...

Processing row 39...

Processing row 40...

Processing row 41...

Proce