In [None]:
!pip install bert-score rouge sentence-transformers scikit-learn textstat textblob gensim transformers nltk torch

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.0-py3-none-any.whl.metadata (3.2 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rouge-1.0.1-py3-none-any.whl (13 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rouge, pyphen, textstat, bert-score
Succe

In [None]:
# Make the user's Google Drive visible to this Colab runtime; the zip archives
# read later in the notebook live under the mounted MyDrive folder.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import zipfile
import os
import pandas as pd
from bert_score import score
from rouge import Rouge
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import textstat
from textblob import TextBlob
from gensim import corpora, models
from transformers import pipeline

# ---------------------------------------------------------------------------
# Batch evaluation: for every company, compare the single-agent ESG summary
# with the multi-agent one and tabulate similarity (BERTScore, ROUGE-L,
# embedding cosine), readability, sentiment and length.
# ---------------------------------------------------------------------------

# Define paths for the zip files
single_agent_zip_path = '/content/drive/MyDrive/SingleVsMultiAgent/single-agent.zip'
multi_agent_zip_path = '/content/drive/MyDrive/SingleVsMultiAgent/multi-agent.zip'

# Run parameters (previously hard-coded inline in the loop body)
SINGLE_EXTRACT_DIR = 'single_agent_extracted'
MULTI_EXTRACT_DIR = 'multi_agent_extracted'
RUN_DATE = '2024-11-28'   # date folder / file-name suffix inside the archives
NUM_COMPANIES = 12        # summaries are named company_1 .. company_12


def _summary_path(extract_dir, company_name, run_date=RUN_DATE):
    """Return the path of one company's ESG summary under an extraction directory."""
    return f'{extract_dir}/{run_date}/{company_name}/{company_name}_esg_summary_{run_date}.txt'


def _read_text(path):
    """Read a whole text file as UTF-8 (explicit, so results do not depend on the locale)."""
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()


# Extract the files from the zip archives
for archive_path, extract_dir in (
    (single_agent_zip_path, SINGLE_EXTRACT_DIR),
    (multi_agent_zip_path, MULTI_EXTRACT_DIR),
):
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(extract_dir)

# Initialize metrics collectors (key order defines the DataFrame column order)
metrics = {
    "Company": [],
    "BERTScore_P": [],
    "BERTScore_R": [],
    "BERTScore_F1": [],
    "ROUGE_L": [],
    "Cosine_Similarity": [],
    "Flesch_Readability_Single": [],
    "Flesch_Readability_Multi": [],
    "Sentiment_Single": [],
    "Sentiment_Multi": [],
    "Length_Single": [],
    "Length_Multi": []
}

# Initialize models and scorers.
# NOTE: the zero-shot NLI pipeline is intentionally NOT created here: this cell
# never used it, and loading facebook/bart-large-mnli costs a ~1.6 GB download.
# The single-pair cell below builds its own `nli_model` before using it.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
rouge = Rouge()

# Summaries are collected so BERTScore can be computed in ONE batched call
# after the loop; calling `score` per pair re-initialises roberta-large on
# every iteration.
single_summaries = []
multi_summaries = []

# Iterate over each pair of files
for i in range(1, NUM_COMPANIES + 1):
    company_name = f"company_{i}"

    # Read single-agent and multi-agent summaries
    summary_single_agent = _read_text(_summary_path(SINGLE_EXTRACT_DIR, company_name))
    summary_multi_agent = _read_text(_summary_path(MULTI_EXTRACT_DIR, company_name))
    single_summaries.append(summary_single_agent)
    multi_summaries.append(summary_multi_agent)

    # ROUGE-L F-measure (single-agent text as hypothesis, multi-agent as reference)
    rouge_scores = rouge.get_scores(summary_single_agent, summary_multi_agent)[0]['rouge-l']['f']

    # Embedding Cosine Similarity
    embeddings = bert_model.encode([summary_single_agent, summary_multi_agent])
    similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

    # Readability Metrics
    readability_single = textstat.flesch_reading_ease(summary_single_agent)
    readability_multi = textstat.flesch_reading_ease(summary_multi_agent)

    # Sentiment Analysis
    sentiment_single = TextBlob(summary_single_agent).sentiment.polarity
    sentiment_multi = TextBlob(summary_multi_agent).sentiment.polarity

    # Length (whitespace-delimited word count)
    len_single = len(summary_single_agent.split())
    len_multi = len(summary_multi_agent.split())

    # Append results to the metrics
    metrics["Company"].append(company_name)
    metrics["ROUGE_L"].append(rouge_scores)
    metrics["Cosine_Similarity"].append(similarity)
    metrics["Flesch_Readability_Single"].append(readability_single)
    metrics["Flesch_Readability_Multi"].append(readability_multi)
    metrics["Sentiment_Single"].append(sentiment_single)
    metrics["Sentiment_Multi"].append(sentiment_multi)
    metrics["Length_Single"].append(len_single)
    metrics["Length_Multi"].append(len_multi)

# BERTScore for all pairs at once: one model load, one score per pair, in the
# same candidate (single-agent) / reference (multi-agent) orientation as before.
P, R, F1 = score(single_summaries, multi_summaries, lang='en', verbose=False)
metrics["BERTScore_P"] = P.tolist()
metrics["BERTScore_R"] = R.tolist()
metrics["BERTScore_F1"] = F1.tolist()

# Convert results to a DataFrame and calculate averages
df_metrics = pd.DataFrame(metrics)
assert len(df_metrics) == NUM_COMPANIES, "expected one row per company"
average_metrics = df_metrics.mean(numeric_only=True)

# Display the results in a table
print("\nIndividual Metrics for Each Company:\n")
print(df_metrics)

print("\nAverage Metrics Across All Companies:\n")
print(average_metrics)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro


Individual Metrics for Each Company:

       Company  BERTScore_P  BERTScore_R  BERTScore_F1   ROUGE_L  \
0    company_1     0.892775     0.878686      0.885674  0.388889   
1    company_2     0.859409     0.871029      0.865180  0.290727   
2    company_3     0.876884     0.860378      0.868553  0.390244   
3    company_4     0.876477     0.890759      0.883561  0.456000   
4    company_5     0.867480     0.880815      0.874097  0.299270   
5    company_6     0.869389     0.846328      0.857704  0.236686   
6    company_7     0.820952     0.854793      0.837531  0.160000   
7    company_8     0.871076     0.880017      0.875524  0.264706   
8    company_9     0.848584     0.880598      0.864295  0.314917   
9   company_10     0.849356     0.827314      0.838190  0.187192   
10  company_11     0.855263     0.883265      0.869039  0.272425   
11  company_12     0.872394     0.873836      0.873114  0.294872   

    Cosine_Similarity  Flesch_Readability_Single  Flesch_Readability_Multi  

In [None]:
# ---------------------------------------------------------------------------
# Single-pair evaluation: compare one single-agent summary (single.txt) with
# one multi-agent summary (multi.txt) and print every metric in detail.
# ---------------------------------------------------------------------------

# Import necessary libraries (grouped at the top so the cell's dependencies
# are visible at a glance instead of being scattered between the steps)
import os
from pprint import pprint

import textstat
from bert_score import score
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from rouge import Rouge
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
from transformers import pipeline

# Read the summaries from the text files (explicit UTF-8 so the result does
# not depend on the runtime's default locale)
with open('single.txt', 'r', encoding='utf-8') as file:
    summary_single_agent = file.read()

with open('multi.txt', 'r', encoding='utf-8') as file:
    summary_multi_agent = file.read()

# BERTScore (single-agent text is the candidate, multi-agent the reference)
candidate_summaries = [summary_single_agent]
reference_summaries = [summary_multi_agent]
P, R, F1 = score(candidate_summaries, reference_summaries, lang='en', verbose=True)
print("\nBERTScore:")
print(f"Precision: {P.mean().item():.4f}")
print(f"Recall: {R.mean().item():.4f}")
print(f"F1 Score: {F1.mean().item():.4f}")

# ROUGE Score
rouge = Rouge()
scores = rouge.get_scores(summary_single_agent, summary_multi_agent)
print("\nROUGE Scores:")
print(scores)

# Embedding Cosine Similarity
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode([summary_single_agent, summary_multi_agent])
similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
print(f"\nEmbedding Cosine Similarity: {similarity:.4f}")

# Readability Metrics
readability_single = textstat.flesch_reading_ease(summary_single_agent)
readability_multi = textstat.flesch_reading_ease(summary_multi_agent)
print("\nReadability Scores (Flesch Reading Ease):")
print(f"Single Agent Readability: {readability_single:.2f}")
print(f"Multi-Agent Readability: {readability_multi:.2f}")

# Sentiment Analysis
sentiment_single = TextBlob(summary_single_agent).sentiment.polarity
sentiment_multi = TextBlob(summary_multi_agent).sentiment.polarity
print("\nSentiment Polarity Scores:")
print(f"Single Agent Sentiment Polarity: {sentiment_single:.2f}")
print(f"Multi-Agent Sentiment Polarity: {sentiment_multi:.2f}")

# Length (whitespace-delimited word count)
len_single = len(summary_single_agent.split())
len_multi = len(summary_multi_agent.split())
print("\nSummary Lengths (number of words):")
print(f"Single Agent Length: {len_single}")
print(f"Multi-Agent Length: {len_multi}")

# Topic Modeling
# Tokenise with gensim's simple_preprocess (lower-cases, drops punctuation and
# markdown symbols such as '*') and remove stopwords; a plain .split() lets
# 'and', 'the', 'of', '*' dominate every topic.
texts = [
    [token for token in simple_preprocess(summary) if token not in STOPWORDS]
    for summary in (summary_single_agent, summary_multi_agent)
]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Build the LDA model (one model fitted on both documents)
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15, random_state=42)

# The topic/word table belongs to the model, not to either summary, so it is
# printed once ...
print("\nLDA Topics (shared model, top 5 words each):")
pprint(lda_model.show_topics(formatted=False, num_words=5))

# ... and each summary then gets its OWN distribution over those topics.
# (Previously the identical topic table was printed under both headings.)
print("\nTopics in Single Agent Summary:")
topics_single = lda_model.get_document_topics(corpus[0], minimum_probability=0.0)
pprint(topics_single)

print("\nTopics in Multi-Agent Summary:")
topics_multi = lda_model.get_document_topics(corpus[1], minimum_probability=0.0)
pprint(topics_multi)

# NLI-based Evaluation
nli_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define labels for NLI
labels = ["paraphrase", "not paraphrase"]

# NOTE(review): these two calls classify each summary in isolation against the
# hypotheses "This text is paraphrase." / "This text is not paraphrase."; they
# do not compare the summaries with each other. Kept as-is; confirm whether
# this measurement is still wanted.
print("\nNLI-based Evaluation:")
result_single_hypo = nli_model(summary_single_agent, candidate_labels=labels, hypothesis_template="This text is {}.")
result_multi_hypo = nli_model(summary_multi_agent, candidate_labels=labels, hypothesis_template="This text is {}.")

print("Single Agent Summary NLI Result:")
pprint(result_single_hypo)

print("\nMulti-Agent Summary NLI Result:")
pprint(result_multi_hypo)

# Comparing if one summary entails the other.
# The premise is one summary and the *hypothesis is the other summary* (passed
# as the only candidate label with a pass-through template). With
# multi_label=True the pipeline reports, per label, the softmax of the
# entailment logit against the contradiction logit, i.e. an entailment
# probability. The earlier version used the literal words "entailment",
# "contradiction" and "neutral" as hypotheses and never looked at the second
# summary at all.
# Assumes both summaries together fit the model's input window - TODO confirm
# for long summaries.
nli_result = nli_model(
    summary_single_agent,
    candidate_labels=[summary_multi_agent],
    hypothesis_template="{}",
    multi_label=True,
)
nli_result_reverse = nli_model(
    summary_multi_agent,
    candidate_labels=[summary_single_agent],
    hypothesis_template="{}",
    multi_label=True,
)

print("\nNLI Comparison Result between Summaries:")
print(f"Single Agent entails Multi-Agent (probability): {nli_result['scores'][0]:.4f}")
print(f"Multi-Agent entails Single Agent (probability): {nli_result_reverse['scores'][0]:.4f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 6.00 seconds, 0.17 sentences/sec

BERTScore:
Precision: 0.8691
Recall: 0.8725
F1 Score: 0.8708

ROUGE Scores:
[{'rouge-1': {'r': 0.3533834586466165, 'p': 0.3092105263157895, 'f': 0.329824556425731}, 'rouge-2': {'r': 0.11351351351351352, 'p': 0.0963302752293578, 'f': 0.10421835731640512}, 'rouge-l': {'r': 0.3383458646616541, 'p': 0.29605263157894735, 'f': 0.31578946870643276}}]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Embedding Cosine Similarity: 0.8207

Readability Scores (Flesch Reading Ease):
Single Agent Readability: 10.60
Multi-Agent Readability: 4.71

Sentiment Polarity Scores:
Single Agent Sentiment Polarity: 0.15
Multi-Agent Sentiment Polarity: 0.13

Summary Lengths (number of words):
Single Agent Length: 230
Multi-Agent Length: 196

Topics in Single Agent Summary:
[(0,
  [('and', 0.04619716),
   ('esg', 0.029696476),
   ('the', 0.021447547),
   ('social', 0.02144724),
   ('of', 0.017323896)]),
 (1,
  [('and', 0.004375748),
   ('a', 0.0043379357),
   ('esg', 0.0043374486),
   ('the', 0.004333412),
   ('on', 0.0043311715)]),
 (2,
  [('and', 0.004485421),
   ('esg', 0.0043908753),
   ('of', 0.004374836),
   ('to', 0.0043740594),
   ('social', 0.0043723695)]),
 (3,
  [('and', 0.0043913336),
   ('esg', 0.004354241),
   ('the', 0.0043522986),
   ('on', 0.0043438068),
   ('to', 0.0043359664)]),
 (4,
  [('and', 0.040518545),
   ('esg', 0.026056642),
   ('a', 0.022435158),
   ('*', 0.022423128),
  

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]


NLI-based Evaluation:
Single Agent Summary NLI Result:
{'labels': ['paraphrase', 'not paraphrase'],
 'scores': [0.6559714674949646, 0.3440285325050354],
 'sequence': '## Salesforce ESG Summary & Score\n'
             '\n'
             '**Summary:**\n'
             '\n'
             'Salesforce presents a strong commitment to ESG principles, '
             'demonstrating this through robust governance structures and the '
             'integration of ESG goals into executive compensation. They '
             'highlight diversity and inclusion efforts, ethical business '
             'practices, and a focus on employee well-being. However, the '
             'report lacks specific quantitative data on environmental impact '
             'and social initiatives, limiting a full assessment.\n'
             '\n'
             '**ESG Score:**\n'
             '\n'
             '* **Environmental: 6/10** - Limited data available but '
             'demonstrates commitment to diversity and incl