### Init vLLM

In [None]:
import os
import json
import torch
import time
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# Checkpoint to load and the number of GPUs handed to vLLM as tensor_parallel_size
model_id = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
number_gpus = 2

# Restrict this process to GPUs 5 and 7 (must happen before the engine is built)
os.environ["CUDA_VISIBLE_DEVICES"] = "5,7"

# Build the vLLM engine for the chosen checkpoint
model = LLM(model=model_id, tensor_parallel_size=number_gpus)

# Matching tokenizer; the EOS token id doubles as the padding token id
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token_id = tokenizer.eos_token_id

# Decoding settings shared by every evaluation cell:
# n is the number of generations sampled per prompt
sampling_params = SamplingParams(temperature=0.2, max_tokens=6500, n=100)


# Mistral 3 summaries

## Aspect Coverage

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding (the result files in this notebook are
# written as UTF-8 as well).
with open("m3_50.json", "r", encoding="utf-8") as file:
    products = json.load(file)
    
# Prompt for the Aspect Coverage metric: one user turn wrapped in Llama 3 style
# header tokens, ending with an opened assistant header so the model continues
# with its explanation and score.
# The {placeholders} are filled with str.format: product_title, description,
# key_features, specifications, reviews, average_rating, Product_Opinion_Summary.
# NOTE(review): the literal <|begin_of_text|> may end up duplicated if the
# tokenizer used by model.generate also prepends a BOS token — confirm.
aspect_coverage_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating. You will then be given one summary written for the set of information. Your task is to rate the summary on one metric. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Aspect Coverage - Aspect Coverage measures how completely a summary captures the major features, characteristics, or attributes of a product that are prominently discussed in the original product information. Summaries should be penalized for missing any major aspects and rewarded for covering all important aspects thoroughly.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - Summary does not cover any important aspects present in the set of information.
<score>2</score> - Summary does not cover most of the important aspects present in the set of information.
<score>3</score> - Summary covers around half of the important aspects present in the set of information.
<score>4</score> - Summary covers most of the important aspects present in the set of information.
<score>5</score> - Summary covers all the important aspects discussed in the set of information.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the important aspects present in the set of information and list them with numbering.
2. Identify the important aspects present in the summary and list them with numbering.
3. Identify the important aspects covered by the summary that are present in the set of information and list them with numbering.
4. Calculate the total number of important aspects covered by the summary that are present in the set of information.
5. Calculate the total number of important aspects present in the set of information.
6. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation of how much is the coverage and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''
# Render the nested specification records as plain text
def format_specifications(specifications):
    """Format specification groups as text.

    Each group becomes a "<key>:" header line followed by one
    "- <key>: <value>" line per entry of its "values" list. Groups are joined
    with a newline, which leaves a blank line between consecutive groups.
    """
    return "\n".join(
        "".join(
            [f"{group['key']}:\n"]
            + [f"- {entry['key']}: {entry['value']}\n" for entry in group['values']]
        )
        for group in specifications
    )

# Build the Aspect Coverage prompt for one product record
def evaluate_opinion_summary(product):
    """Return the aspect-coverage prompt filled in from *product*.

    Missing or empty description, key features, specifications and reviews are
    rendered as "N/A"; key features and reviews are joined one item per line.
    A missing average rating becomes "N/A"; a missing title or summary becomes
    an empty string.
    """
    key_features = product.get("key_features")
    specifications = product.get("specifications")
    reviews = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(key_features) if key_features else "N/A",
        "specifications": format_specifications(specifications) if specifications else "N/A",
        "reviews": "\n".join(reviews) if reviews else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return aspect_coverage_prompt_template.format(**fields)

# Sampling parameters are not redefined here; generation reuses the
# `sampling_params` object created in the vLLM initialisation cell.

# Evaluate the products in fixed-size batches
batch_size = 5
evaluation_results = []

for i in range(0, len(products), batch_size):
    # Products handled in this round
    batch_products = products[i:i + batch_size]

    # One prompt per product, in batch order
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # A single vLLM call produces every sampled response for the whole batch
    outputs = model.generate(prompts, sampling_params)

    # Label each sampled response with its product title and 1-based sample number
    for product, output_group in zip(batch_products, outputs):
        product_title = product.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {k}**\n\nResponse: {output.text}\n"
            for k, output in enumerate(output_group.outputs, start=1)
        )

    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Write all labelled responses to a text file, separated by newlines
with open("m3_ac_tj_dep.txt", "w", encoding="utf-8") as results_file:
    results_file.write("\n".join(evaluation_results))



# Fluency 

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding (the result files in this notebook are
# written as UTF-8 as well).
with open("m3_50.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt for the Fluency metric: one user turn wrapped in Llama 3 style header
# tokens, ending with an opened assistant header so the model continues with
# its explanation and score.
# The {placeholders} are filled with str.format: product_title, description,
# key_features, specifications, reviews, average_rating, Product_Opinion_Summary.
# NOTE(review): the literal <|begin_of_text|> may end up duplicated if the
# tokenizer used by model.generate also prepends a BOS token — confirm.
fluency_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Fluency : The quality of summary in terms of grammar, spelling, punctuation, capitalization, word choice, and sentence structure and should contain no errors. The summary should be easy to read, follow, comprehend and should contain no errors.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary is all garbled and does not make any sense.
<score>2</score> - The summary has grammatical errors that make it hard to understand or sound unnatural.
<score>3</score> - The summary has errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
<score>4</score> - The summary has very few errors, but it is easy to read, follow and comprehend.
<score>5</score> - The summary is extremely fluent and is easy to read, follow and comprehend.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the sentences presented in the summary and list them with numbering.
2. Go through each sentence and list down if there are any fluency problems.
3. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on fluency of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''

# Render the nested specification records as plain text
def format_specifications(specifications):
    """Format specification groups as text.

    Each group becomes a "<key>:" header line followed by one
    "- <key>: <value>" line per entry of its "values" list. Groups are joined
    with a newline, which leaves a blank line between consecutive groups.
    """
    return "\n".join(
        "".join(
            [f"{group['key']}:\n"]
            + [f"- {entry['key']}: {entry['value']}\n" for entry in group['values']]
        )
        for group in specifications
    )

# Build the Fluency prompt for one product record
def evaluate_opinion_summary(product):
    """Return the fluency prompt filled in from *product*.

    Missing or empty description, key features, specifications and reviews are
    rendered as "N/A"; key features and reviews are joined one item per line.
    A missing average rating becomes "N/A"; a missing title or summary becomes
    an empty string.
    """
    key_features = product.get("key_features")
    specifications = product.get("specifications")
    reviews = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(key_features) if key_features else "N/A",
        "specifications": format_specifications(specifications) if specifications else "N/A",
        "reviews": "\n".join(reviews) if reviews else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return fluency_prompt_template.format(**fields)

# Sampling parameters are not redefined here; generation reuses the
# `sampling_params` object created in the vLLM initialisation cell.

# Evaluate the products in fixed-size batches
batch_size = 5
evaluation_results = []

for i in range(0, len(products), batch_size):
    # Products handled in this round
    batch_products = products[i:i + batch_size]

    # One prompt per product, in batch order
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # A single vLLM call produces every sampled response for the whole batch
    outputs = model.generate(prompts, sampling_params)

    # Label each sampled response with its product title and 1-based sample number
    for product, output_group in zip(batch_products, outputs):
        product_title = product.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {k}**\n\nResponse: {output.text}\n"
            for k, output in enumerate(output_group.outputs, start=1)
        )

    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Write all labelled responses to a text file, separated by newlines
with open("m3_fl_tj_dep.txt", "w", encoding="utf-8") as results_file:
    results_file.write("\n".join(evaluation_results))



# Coherence

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding (the result files in this notebook are
# written as UTF-8 as well).
with open("m3_50.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt for the Coherence metric: one user turn wrapped in Llama 3 style
# header tokens, ending with an opened assistant header so the model continues
# with its explanation and score.
# The {placeholders} are filled with str.format: product_title, description,
# key_features, specifications, reviews, average_rating, Product_Opinion_Summary.
# NOTE(review): the literal <|begin_of_text|> may end up duplicated if the
# tokenizer used by model.generate also prepends a BOS token — confirm.
coherence_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Coherence - The collective quality of all sentences. The summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to a coherent body of information.
Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary lacks structure and logical flow, resulting in disjointed ideas and significant inconsistencies, making it confusing and challenging to follow.
<score>2</score> - The summary attempts coherence but struggles with occasional lapses in logic, clarity issues, and insufficiently connected ideas, leading to a somewhat disjointed presentation.
<score>3</score> - The summary displays a reasonable level of coherence with a logical sequence, yet occasional disruptions in flow and clarity, requiring some improvements for a smoother transition between ideas.
<score>4</score> - The summary demonstrates strong coherence, maintaining a clear and organized flow with effective transitions and minimal inconsistencies, effectively conveying main points with clarity and precision.
<score>5</score> - The summary showcases exceptional coherence with a flawless logical flow, impeccable transitions, and consistent clarity, presenting information in an impeccably organized and easily comprehensible manner.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all pieces of information in the summary.
2. Check if everything is presented in a clear and logical order. Give a clear step-by-step explanation of what you found and what is lacking.
3. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation on coherence of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''

# Render the nested specification records as plain text
def format_specifications(specifications):
    """Format specification groups as text.

    Each group becomes a "<key>:" header line followed by one
    "- <key>: <value>" line per entry of its "values" list. Groups are joined
    with a newline, which leaves a blank line between consecutive groups.
    """
    return "\n".join(
        "".join(
            [f"{group['key']}:\n"]
            + [f"- {entry['key']}: {entry['value']}\n" for entry in group['values']]
        )
        for group in specifications
    )

# Build the Coherence prompt for one product record
def evaluate_opinion_summary(product):
    """Return the coherence prompt filled in from *product*.

    Missing or empty description, key features, specifications and reviews are
    rendered as "N/A"; key features and reviews are joined one item per line.
    A missing average rating becomes "N/A"; a missing title or summary becomes
    an empty string.
    """
    key_features = product.get("key_features")
    specifications = product.get("specifications")
    reviews = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(key_features) if key_features else "N/A",
        "specifications": format_specifications(specifications) if specifications else "N/A",
        "reviews": "\n".join(reviews) if reviews else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return coherence_prompt_template.format(**fields)

# Sampling parameters are not redefined here; generation reuses the
# `sampling_params` object created in the vLLM initialisation cell.

# Evaluate the products in fixed-size batches
batch_size = 5
evaluation_results = []

for i in range(0, len(products), batch_size):
    # Products handled in this round
    batch_products = products[i:i + batch_size]

    # One prompt per product, in batch order
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # A single vLLM call produces every sampled response for the whole batch
    outputs = model.generate(prompts, sampling_params)

    # Label each sampled response with its product title and 1-based sample number
    for product, output_group in zip(batch_products, outputs):
        product_title = product.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {k}**\n\nResponse: {output.text}\n"
            for k, output in enumerate(output_group.outputs, start=1)
        )

    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Write all labelled responses to a text file, separated by newlines
with open("m3_co_tj_dep.txt", "w", encoding="utf-8") as results_file:
    results_file.write("\n".join(evaluation_results))




# Faithfulness

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding (the result files in this notebook are
# written as UTF-8 as well).
with open("m3_50.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt for the Faithfulness metric: one user turn wrapped in Llama 3 style
# header tokens, ending with an opened assistant header so the model continues
# with its explanation and score.
# The {placeholders} are filled with str.format: product_title, description,
# key_features, specifications, reviews, average_rating, Product_Opinion_Summary.
# NOTE(review): the literal <|begin_of_text|> may end up duplicated if the
# tokenizer used by model.generate also prepends a BOS token — confirm.
faithfulness_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Faithfulness - Faithfulness measures the extent to which every piece of information mentioned in the summary is verifiable, supported, present, or can be reasonably inferred from the input. The input includes product title, description, key features, specifications, reviews, and average rating. Summaries should be penalized if they contain information that cannot be verified from the provided input or if they make broad generalizations that are not supported by the input data.

Following are the scores and the evaluation criteria according to which scores must be assigned.

<score>1</score> - The summary is for a different product and is irrelevant/unrelated to the given set of information.
<score>2</score> - The summary contains very few facts actually verifiable/supported/present/inferred from the set of information and contains a lot of hallucinated facts.
<score>3</score> - The summary contains more than one piece of information that is not verifiable/present/inferred from the set of information.
<score>4</score> - The summary contains only one piece of information that is not verifiable/supported/present/inferred from the the set of information.
<score>5</score> - Every piece of information present in the summary is verifiable/supported/present/inferred from the set of information.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all pieces of information in the summary that is not verifiable/supported/present/inferred from the set of information. Give a clear step-by-step explanation of what you found.
2. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on faithfulness and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''

# Render the nested specification records as plain text
def format_specifications(specifications):
    """Format specification groups as text.

    Each group becomes a "<key>:" header line followed by one
    "- <key>: <value>" line per entry of its "values" list. Groups are joined
    with a newline, which leaves a blank line between consecutive groups.
    """
    return "\n".join(
        "".join(
            [f"{group['key']}:\n"]
            + [f"- {entry['key']}: {entry['value']}\n" for entry in group['values']]
        )
        for group in specifications
    )

# Build the Faithfulness prompt for one product record
def evaluate_opinion_summary(product):
    """Return the faithfulness prompt filled in from *product*.

    Missing or empty description, key features, specifications and reviews are
    rendered as "N/A"; key features and reviews are joined one item per line.
    A missing average rating becomes "N/A"; a missing title or summary becomes
    an empty string.
    """
    key_features = product.get("key_features")
    specifications = product.get("specifications")
    reviews = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(key_features) if key_features else "N/A",
        "specifications": format_specifications(specifications) if specifications else "N/A",
        "reviews": "\n".join(reviews) if reviews else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return faithfulness_prompt_template.format(**fields)

# Sampling parameters are not redefined here; generation reuses the
# `sampling_params` object created in the vLLM initialisation cell.

# Evaluate the products in fixed-size batches
batch_size = 5
evaluation_results = []

for i in range(0, len(products), batch_size):
    # Products handled in this round
    batch_products = products[i:i + batch_size]

    # One prompt per product, in batch order
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # A single vLLM call produces every sampled response for the whole batch
    outputs = model.generate(prompts, sampling_params)

    # Label each sampled response with its product title and 1-based sample number
    for product, output_group in zip(batch_products, outputs):
        product_title = product.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {k}**\n\nResponse: {output.text}\n"
            for k, output in enumerate(output_group.outputs, start=1)
        )

    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Write all labelled responses to a text file, separated by newlines
with open("m3_fa_tj_dep.txt", "w", encoding="utf-8") as results_file:
    results_file.write("\n".join(evaluation_results))


    

# Relevance

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding (the result files in this notebook are
# written as UTF-8 as well).
with open("m3_50.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt for the Relevance metric: one user turn wrapped in Llama 3 style
# header tokens, ending with an opened assistant header so the model continues
# with its explanation and score.
# The {placeholders} are filled with str.format: product_title, description,
# key_features, specifications, reviews, average_rating, Product_Opinion_Summary.
# The template ends with exactly one blank line ("\n\n") after the assistant
# header, matching the other metric templates in this notebook; a stray third
# newline was removed.
# NOTE(review): the literal <|begin_of_text|> may end up duplicated if the
# tokenizer used by model.generate also prepends a BOS token — confirm.
relevance_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Relevance - Relevance measures the selection of important information from the input, including product title, description, key features, specifications, reviews, and average rating. The summary should include only important and relevant information from the input. Summaries should not contain redundancies or excess information.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary misses all the important opinions majorly discussed in the set of information.
<score>2</score> - The summary misses most of the important opinions majorly discussed in the set of information or mostly has redundant/excess/unimportant details
<score>3</score> - The summary covers around half of the important opinions majorly discussed in the set of information. or contains redundant/excess/unimportant details.
<score>4</score> - The summary covers most of the important opinions majorly discussed in the set of information and has very less amount of redundant/excess/unimportant details.
<score>5</score> - The summary covers all the important opinions majorly discussed in the set of information and has no redundant/excess/unimportant details.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify all the important opinions majorly discussed in the set of information and list them with numbering.
2. Identify the important opinions present in the summary and list them with numbering.
3. Next identify how many important opinions are present in both summary and the set of information and list them with numbering
4. Next idenify the how many redundant/excess/unimportant details does the summary have and list them with numbering.
5. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on relevance and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''

# Render the nested specification records as plain text
def format_specifications(specifications):
    """Format specification groups as text.

    Each group becomes a "<key>:" header line followed by one
    "- <key>: <value>" line per entry of its "values" list. Groups are joined
    with a newline, which leaves a blank line between consecutive groups.
    """
    return "\n".join(
        "".join(
            [f"{group['key']}:\n"]
            + [f"- {entry['key']}: {entry['value']}\n" for entry in group['values']]
        )
        for group in specifications
    )

# Build the Relevance prompt for one product record
def evaluate_opinion_summary(product):
    """Return the relevance prompt filled in from *product*.

    Missing or empty description, key features, specifications and reviews are
    rendered as "N/A"; key features and reviews are joined one item per line.
    A missing average rating becomes "N/A"; a missing title or summary becomes
    an empty string.
    """
    key_features = product.get("key_features")
    specifications = product.get("specifications")
    reviews = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(key_features) if key_features else "N/A",
        "specifications": format_specifications(specifications) if specifications else "N/A",
        "reviews": "\n".join(reviews) if reviews else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return relevance_prompt_template.format(**fields)

# Sampling parameters are not redefined here; generation reuses the
# `sampling_params` object created in the vLLM initialisation cell.

# Evaluate the products in fixed-size batches
batch_size = 5
evaluation_results = []

for i in range(0, len(products), batch_size):
    # Products handled in this round
    batch_products = products[i:i + batch_size]

    # One prompt per product, in batch order
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # A single vLLM call produces every sampled response for the whole batch
    outputs = model.generate(prompts, sampling_params)

    # Label each sampled response with its product title and 1-based sample number
    for product, output_group in zip(batch_products, outputs):
        product_title = product.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {k}**\n\nResponse: {output.text}\n"
            for k, output in enumerate(output_group.outputs, start=1)
        )

    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Write all labelled responses to a text file, separated by newlines
with open("m3_re_tj_dep.txt", "w", encoding="utf-8") as results_file:
    results_file.write("\n".join(evaluation_results))




# Sentiment Consistency

In [None]:
import json

# Load the JSON file containing the product information and opinion summaries.
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding (the result files in this notebook are
# written as UTF-8 as well).
with open("m3_50.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt for the Sentiment Consistency metric: one user turn wrapped in Llama 3
# style header tokens, ending with an opened assistant header so the model
# continues with its explanation and score.
# Only reviews, average_rating and Product_Opinion_Summary are filled in (via
# str.format); this metric does not show the title, description, key features
# or specifications to the judge.
# The template ends with one blank line ("\n\n") after the assistant header,
# matching the other metric templates in this notebook; previously it ended
# with a single newline.
# NOTE(review): the literal <|begin_of_text|> may end up duplicated if the
# tokenizer used by model.generate also prepends a BOS token — confirm.
sentiment_consistency_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description: 
You will be given a set of information such as reviews, and average rating and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Sentiment Consistency - Sentiment Consistency measures how accurately the summary reflects the consensus sentiment of users for each aspect of the product as expressed in the reviews. The consensus sentiment (or majority sentiment) for an aspect is determined by the most common sentiment expressed by users, categorized as very positive, positive, neutral, negative, or very negative.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - None of the aspects present in summary have the same majority sentiment as in reviews.
<score>2</score> - Very few of the aspects present in summary have the same majority sentiment as in reviews.
<score>3</score> - Only around half of the aspects present in summary have the same majority sentiment as in reviews.
<score>4</score> - Most of the aspects present in summary have the same majority sentiment as in reviews.
<score>5</score> - All aspects present in summary have the same majority sentiment as in reviews.


Product Reviews: {reviews}

Average Rating: {average_rating}

Summary to Evaluate: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the aspects and their sentiment present in the summary and list them with numbering.
2. For the list of aspects identified, identify the majority sentiment from the reviews and list them with numbering.
3. Next identify how many aspect and sentiment match between reviews and summary from above and list them with numbering.
4. Finally use the previous information to output only a single score within <score></score> tags only using the evaluation criteria.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on sentiment preservation of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''

# Build the Sentiment Consistency prompt for one product record
def evaluate_opinion_summary(product):
    """Return the sentiment-consistency prompt filled in from *product*.

    Reviews are joined one per line, or rendered as "N/A" when missing or
    empty. A missing average rating becomes "N/A"; a missing summary becomes
    an empty string.
    """
    reviews = product.get("reviews")
    return sentiment_consistency_prompt_template.format(
        reviews="\n".join(reviews) if reviews else "N/A",
        average_rating=product.get("averageRating", "N/A"),
        Product_Opinion_Summary=product.get("product_opinion_summary", ""),
    )


# Sampling parameters are not redefined here; generation reuses the
# `sampling_params` object created in the vLLM initialisation cell.

# Evaluate the products in fixed-size batches
batch_size = 5
evaluation_results = []

for i in range(0, len(products), batch_size):
    # Products handled in this round
    batch_products = products[i:i + batch_size]

    # One prompt per product, in batch order
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # A single vLLM call produces every sampled response for the whole batch
    outputs = model.generate(prompts, sampling_params)

    # Label each sampled response with the product's 1-based position in the
    # full list and its 1-based sample number
    for product_number, output_group in enumerate(outputs, start=i + 1):
        evaluation_results.extend(
            f"**Product {product_number} - Evaluation {k}**\n\nResponse: {output.text}\n"
            for k, output in enumerate(output_group.outputs, start=1)
        )

    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Write all labelled responses to a text file, separated by newlines
with open("m3_sc_tj_dep.txt", "w", encoding="utf-8") as results_file:
    results_file.write("\n".join(evaluation_results))

# Specificity

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding (the result files in this notebook are
# written as UTF-8 as well).
with open("m3_50.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt for the Specificity metric: one user turn wrapped in Llama 3 style
# header tokens, ending with an opened assistant header so the model continues
# with its explanation and score.
# The {placeholders} are filled with str.format: product_title, description,
# key_features, specifications, reviews, average_rating, Product_Opinion_Summary.
# NOTE(review): the literal <|begin_of_text|> may end up duplicated if the
# tokenizer used by model.generate also prepends a BOS token — confirm.
specificity_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Generic Opinion example: The battery is good.
Specific Opinion example: The battery lasts for more than 12 hours on a single charge.

Specificity - Specificity measures the level of detail and precision in the information and opinions presented in the summary. A specific summary provides concrete facts, measurements, or detailed descriptions about the product's features, performance, and user experiences. It avoids vague or general statements and instead offers precise information that gives readers a clear and thorough understanding of the product's characteristics and performance. 

Summaries should be penalized for including vague or generic statements and rewarded for providing detailed, precise information about the product and user experiences.

<score>1</score> - All the opinions presented in the summary are generic.
<score>2</score> - Most of the opinions presented are generic.
<score>3</score> - Only around half of the opinions presented are specific.
<score>4</score> - Most of the opinions presented in the summary are specific. Very few opinions are generic.
<score>5</score> - All the opinions presented in the summary are specific 


Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all the opinions presented.
2. Check if details are presented for the opinions. Classify each opinion as specific or generic.
3. Count the number of generic and specific occurrences.
4. Finally use the previous information to output only a single score within <score></score> tags only using the evaluation criteria.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on specificity of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''

# Function to format specifications
def format_specifications(specifications):
    """Render grouped specifications as plain text for insertion into a prompt.

    Each item of ``specifications`` is indexed with ``'key'`` (the group
    heading) and ``'values'`` (an iterable of items indexed with ``'key'``
    and ``'value'``). Every group becomes a ``heading:`` line followed by one
    ``- name: value`` line per entry; groups are separated by a blank line.

    Raises:
        KeyError: if a group or entry lacks one of the keys above.
    """
    formatted_specs = []
    for spec in specifications:
        parts = [f"{spec['key']}:\n"]
        parts.extend(f"- {value['key']}: {value['value']}\n" for value in spec['values'])
        formatted_specs.append("".join(parts))
    return "\n".join(formatted_specs)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the specificity-evaluation prompt for one product record.

    Only the prompt string is produced here; no model call is made.

    Args:
        product: mapping read with ``.get`` for ``product_title``,
            ``description``, ``key_features``, ``specifications``,
            ``reviews``, ``averageRating`` and ``product_opinion_summary``.

    Returns:
        ``specificity_prompt_template`` with every placeholder filled in.
        Missing or empty description, key features, specifications and
        reviews are rendered as "N/A"; an absent ``averageRating`` key is
        rendered as "N/A"; an absent title or summary becomes "".
    """
    features = product.get("key_features")
    specs = product.get("specifications")
    review_texts = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(features) if features else "N/A",
        "specifications": format_specifications(specs) if specs else "N/A",
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return specificity_prompt_template.format(**fields)

# Sampling parameters: reuses the `sampling_params` object created in the "Init vLLM" cell


# Score the summaries in fixed-size batches of products
batch_size = 5
evaluation_results = []

for i in range(0, len(products), batch_size):
    batch_products = products[i:i+batch_size]

    # One prompt per product in the current batch
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # Assumes vLLM returns one result group per prompt, in prompt order
    outputs = model.generate(prompts, sampling_params)

    # Record every sampled completion under the title of its product
    for product, output_group in zip(batch_products, outputs):
        product_title = product.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {k + 1}**\n\nResponse: {generated_text}\n"
            for k, generated_text in enumerate(
                completion.text for completion in output_group.outputs
            )
        )

    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Write all collected responses to a single text file once every batch is done
with open("m3_sp_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))
    

    

# Mistral 2 summaries

## Aspect Coverage

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# Decode explicitly as UTF-8 (the encoding the results file is written with)
# instead of relying on the platform's locale default.
with open("m2.json", "r", encoding="utf-8") as file:
    products = json.load(file)
    
# Judge prompt for the Aspect Coverage metric. The {placeholders} are filled by
# evaluate_opinion_summary through str.format, so any literal brace added to
# this text would have to be doubled. The <|...|> markers are written out
# literally as chat-format special tokens.
# NOTE(review): vLLM may prepend its own BOS token when it tokenizes a string
# prompt, which would duplicate <|begin_of_text|> - verify.
aspect_coverage_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating. You will then be given one summary written for the set of information. Your task is to rate the summary on one metric. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Aspect Coverage - Aspect Coverage measures how completely a summary captures the major features, characteristics, or attributes of a product that are prominently discussed in the original product information. Summaries should be penalized for missing any major aspects and rewarded for covering all important aspects thoroughly.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - Summary does not cover any important aspects present in the set of information.
<score>2</score> - Summary does not cover most of the important aspects present in the set of information.
<score>3</score> - Summary covers around half of the important aspects present in the set of information.
<score>4</score> - Summary covers most of the important aspects present in the set of information.
<score>5</score> - Summary covers all the important aspects discussed in the set of information.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the important aspects present in the set of information and list them with numbering.
2. Identify the important aspects present in the summary and list them with numbering.
3. Identify the important aspects covered by the summary that are present in the set of information and list them with numbering.
4. Calculate the total number of important aspects covered by the summary that are present in the set of information.
5. Calculate the total number of important aspects present in the set of information.
6. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation of how much is the coverage and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''
# Function to format specifications
def format_specifications(specifications):
    """Turn grouped specifications into a text block for the prompt.

    Each group is indexed with ``'key'`` (heading) and ``'values'`` (entries
    indexed with ``'key'`` and ``'value'``). A group renders as ``heading:``
    followed by one ``- name: value`` line per entry, and consecutive groups
    are separated by a blank line.

    Raises:
        KeyError: if a group or entry lacks one of the keys above.
    """
    def render(spec):
        header = f"{spec['key']}:\n"
        bullets = "".join(f"- {value['key']}: {value['value']}\n" for value in spec['values'])
        return header + bullets

    return "\n".join(render(spec) for spec in specifications)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the aspect-coverage evaluation prompt for one product record.

    Only the prompt string is produced here; no model call is made.

    Args:
        product: mapping read with ``.get`` for ``product_title``,
            ``description``, ``key_features``, ``specifications``,
            ``reviews``, ``averageRating`` and ``product_opinion_summary``.

    Returns:
        ``aspect_coverage_prompt_template`` with every placeholder filled in.
        Missing or empty description, key features, specifications and
        reviews are rendered as "N/A"; an absent ``averageRating`` key is
        rendered as "N/A"; an absent title or summary becomes "".
    """
    features = product.get("key_features")
    specs = product.get("specifications")
    review_texts = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(features) if features else "N/A",
        "specifications": format_specifications(specs) if specs else "N/A",
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return aspect_coverage_prompt_template.format(**fields)

# Sampling parameters: reuses the `sampling_params` object created in the "Init vLLM" cell


# Score the summaries in fixed-size batches of products
batch_size = 5
evaluation_results = []

for i in range(0, len(products), batch_size):
    batch_products = products[i:i+batch_size]

    # One prompt per product in the current batch
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # Assumes vLLM returns one result group per prompt, in prompt order
    outputs = model.generate(prompts, sampling_params)

    # Record every sampled completion under the title of its product
    for product, output_group in zip(batch_products, outputs):
        product_title = product.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {k + 1}**\n\nResponse: {generated_text}\n"
            for k, generated_text in enumerate(
                completion.text for completion in output_group.outputs
            )
        )

    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Write all collected responses to a single text file once every batch is done
with open("m2_ac_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))



# Fluency 

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# Decode explicitly as UTF-8 (the encoding the results file is written with)
# instead of relying on the platform's locale default.
with open("m2.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the Fluency metric. The {placeholders} are filled by
# evaluate_opinion_summary through str.format, so any literal brace added to
# this text would have to be doubled.
# NOTE(review): the task description contains a stray "rating.  and a
# corresponding summary" fragment; it is left untouched here because editing
# the wording changes what the judge model sees.
# NOTE(review): vLLM may prepend its own BOS token when it tokenizes a string
# prompt, which would duplicate <|begin_of_text|> - verify.
fluency_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Fluency : The quality of summary in terms of grammar, spelling, punctuation, capitalization, word choice, and sentence structure and should contain no errors. The summary should be easy to read, follow, comprehend and should contain no errors.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary is all garbled and does not make any sense.
<score>2</score> - The summary has grammatical errors that make it hard to understand or sound unnatural.
<score>3</score> - The summary has errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
<score>4</score> - The summary has very few errors, but it is easy to read, follow and comprehend.
<score>5</score> - The summary is extremely fluent and is easy to read, follow and comprehend.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the sentences presented in the summary and list them with numbering.
2. Go through each sentence and list down if there are any fluency problems.
3. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on fluency of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''

# Function to format specifications
def format_specifications(specifications):
    """Render grouped specifications as plain text for insertion into a prompt.

    Each item of ``specifications`` is indexed with ``'key'`` (the group
    heading) and ``'values'`` (an iterable of items indexed with ``'key'``
    and ``'value'``). Every group becomes a ``heading:`` line followed by one
    ``- name: value`` line per entry; groups are separated by a blank line.

    Raises:
        KeyError: if a group or entry lacks one of the keys above.
    """
    formatted_specs = []
    for spec in specifications:
        parts = [f"{spec['key']}:\n"]
        parts.extend(f"- {value['key']}: {value['value']}\n" for value in spec['values'])
        formatted_specs.append("".join(parts))
    return "\n".join(formatted_specs)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the fluency-evaluation prompt for one product record.

    Only the prompt string is produced here; no model call is made.

    Args:
        product: mapping read with ``.get`` for ``product_title``,
            ``description``, ``key_features``, ``specifications``,
            ``reviews``, ``averageRating`` and ``product_opinion_summary``.

    Returns:
        ``fluency_prompt_template`` with every placeholder filled in.
        Missing or empty description, key features, specifications and
        reviews are rendered as "N/A"; an absent ``averageRating`` key is
        rendered as "N/A"; an absent title or summary becomes "".
    """
    features = product.get("key_features")
    specs = product.get("specifications")
    review_texts = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(features) if features else "N/A",
        "specifications": format_specifications(specs) if specs else "N/A",
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return fluency_prompt_template.format(**fields)

# Sampling parameters: reuses the `sampling_params` object created in the "Init vLLM" cell


# Score the summaries in fixed-size batches of products
batch_size = 5
evaluation_results = []

for i in range(0, len(products), batch_size):
    batch_products = products[i:i+batch_size]

    # One prompt per product in the current batch
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # Assumes vLLM returns one result group per prompt, in prompt order
    outputs = model.generate(prompts, sampling_params)

    # Record every sampled completion under the title of its product
    for product, output_group in zip(batch_products, outputs):
        product_title = product.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {k + 1}**\n\nResponse: {generated_text}\n"
            for k, generated_text in enumerate(
                completion.text for completion in output_group.outputs
            )
        )

    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Write all collected responses to a single text file once every batch is done
with open("m2_fl_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))



# Coherence

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# Decode explicitly as UTF-8 (the encoding the results file is written with)
# instead of relying on the platform's locale default.
with open("m2.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the Coherence metric. The {placeholders} are filled by
# evaluate_opinion_summary through str.format, so any literal brace added to
# this text would have to be doubled.
# NOTE(review): the task description contains a stray "rating.  and a
# corresponding summary" fragment; it is left untouched here because editing
# the wording changes what the judge model sees.
# NOTE(review): vLLM may prepend its own BOS token when it tokenizes a string
# prompt, which would duplicate <|begin_of_text|> - verify.
coherence_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Coherence - The collective quality of all sentences. The summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to a coherent body of information.
Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary lacks structure and logical flow, resulting in disjointed ideas and significant inconsistencies, making it confusing and challenging to follow.
<score>2</score> - The summary attempts coherence but struggles with occasional lapses in logic, clarity issues, and insufficiently connected ideas, leading to a somewhat disjointed presentation.
<score>3</score> - The summary displays a reasonable level of coherence with a logical sequence, yet occasional disruptions in flow and clarity, requiring some improvements for a smoother transition between ideas.
<score>4</score> - The summary demonstrates strong coherence, maintaining a clear and organized flow with effective transitions and minimal inconsistencies, effectively conveying main points with clarity and precision.
<score>5</score> - The summary showcases exceptional coherence with a flawless logical flow, impeccable transitions, and consistent clarity, presenting information in an impeccably organized and easily comprehensible manner.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all pieces of information in the summary.
2. Check if everything is presented in a clear and logical order. Give a clear step-by-step explanation of what you found and what is lacking.
3. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation on coherence of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Turn grouped specifications into a text block for the prompt.

    Each group is indexed with ``'key'`` (heading) and ``'values'`` (entries
    indexed with ``'key'`` and ``'value'``). A group renders as ``heading:``
    followed by one ``- name: value`` line per entry, and consecutive groups
    are separated by a blank line.

    Raises:
        KeyError: if a group or entry lacks one of the keys above.
    """
    def render(spec):
        header = f"{spec['key']}:\n"
        bullets = "".join(f"- {value['key']}: {value['value']}\n" for value in spec['values'])
        return header + bullets

    return "\n".join(render(spec) for spec in specifications)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the coherence-evaluation prompt for one product record.

    Only the prompt string is produced here; no model call is made.

    Args:
        product: mapping read with ``.get`` for ``product_title``,
            ``description``, ``key_features``, ``specifications``,
            ``reviews``, ``averageRating`` and ``product_opinion_summary``.

    Returns:
        ``coherence_prompt_template`` with every placeholder filled in.
        Missing or empty description, key features, specifications and
        reviews are rendered as "N/A"; an absent ``averageRating`` key is
        rendered as "N/A"; an absent title or summary becomes "".
    """
    features = product.get("key_features")
    specs = product.get("specifications")
    review_texts = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(features) if features else "N/A",
        "specifications": format_specifications(specs) if specs else "N/A",
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return coherence_prompt_template.format(**fields)

# Sampling parameters: reuses the `sampling_params` object created in the "Init vLLM" cell


# Score the summaries in fixed-size batches of products
batch_size = 5
evaluation_results = []

for i in range(0, len(products), batch_size):
    batch_products = products[i:i+batch_size]

    # One prompt per product in the current batch
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # Assumes vLLM returns one result group per prompt, in prompt order
    outputs = model.generate(prompts, sampling_params)

    # Record every sampled completion under the title of its product
    for product, output_group in zip(batch_products, outputs):
        product_title = product.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {k + 1}**\n\nResponse: {generated_text}\n"
            for k, generated_text in enumerate(
                completion.text for completion in output_group.outputs
            )
        )

    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Write all collected responses to a single text file once every batch is done
with open("m2_co_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))




# Faithfulness

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# Decode explicitly as UTF-8 (the encoding the results file is written with)
# instead of relying on the platform's locale default.
with open("m2.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the Faithfulness metric. The {placeholders} are filled by
# evaluate_opinion_summary through str.format, so any literal brace added to
# this text would have to be doubled.
# NOTE(review): the text contains small wording slips ("rating.  and a
# corresponding summary", "from the the set"); they are left untouched here
# because editing the wording changes what the judge model sees.
# NOTE(review): vLLM may prepend its own BOS token when it tokenizes a string
# prompt, which would duplicate <|begin_of_text|> - verify.
faithfulness_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Faithfulness - Faithfulness measures the extent to which every piece of information mentioned in the summary is verifiable, supported, present, or can be reasonably inferred from the input. The input includes product title, description, key features, specifications, reviews, and average rating. Summaries should be penalized if they contain information that cannot be verified from the provided input or if they make broad generalizations that are not supported by the input data.

Following are the scores and the evaluation criteria according to which scores must be assigned.

<score>1</score> - The summary is for a different product and is irrelevant/unrelated to the given set of information.
<score>2</score> - The summary contains very few facts actually verifiable/supported/present/inferred from the set of information and contains a lot of hallucinated facts.
<score>3</score> - The summary contains more than one piece of information that is not verifiable/present/inferred from the set of information.
<score>4</score> - The summary contains only one piece of information that is not verifiable/supported/present/inferred from the the set of information.
<score>5</score> - Every piece of information present in the summary is verifiable/supported/present/inferred from the set of information.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all pieces of information in the summary that is not verifiable/supported/present/inferred from the set of information. Give a clear step-by-step explanation of what you found.
2. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on faithfulness and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''

# Function to format specifications
def format_specifications(specifications):
    """Render grouped specifications as plain text for insertion into a prompt.

    Each item of ``specifications`` is indexed with ``'key'`` (the group
    heading) and ``'values'`` (an iterable of items indexed with ``'key'``
    and ``'value'``). Every group becomes a ``heading:`` line followed by one
    ``- name: value`` line per entry; groups are separated by a blank line.

    Raises:
        KeyError: if a group or entry lacks one of the keys above.
    """
    formatted_specs = []
    for spec in specifications:
        parts = [f"{spec['key']}:\n"]
        parts.extend(f"- {value['key']}: {value['value']}\n" for value in spec['values'])
        formatted_specs.append("".join(parts))
    return "\n".join(formatted_specs)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the faithfulness-evaluation prompt for one product record.

    Only the prompt string is produced here; no model call is made.

    Args:
        product: mapping read with ``.get`` for ``product_title``,
            ``description``, ``key_features``, ``specifications``,
            ``reviews``, ``averageRating`` and ``product_opinion_summary``.

    Returns:
        ``faithfulness_prompt_template`` with every placeholder filled in.
        Missing or empty description, key features, specifications and
        reviews are rendered as "N/A"; an absent ``averageRating`` key is
        rendered as "N/A"; an absent title or summary becomes "".
    """
    features = product.get("key_features")
    specs = product.get("specifications")
    review_texts = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(features) if features else "N/A",
        "specifications": format_specifications(specs) if specs else "N/A",
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return faithfulness_prompt_template.format(**fields)

# Sampling parameters: reuses the `sampling_params` object created in the "Init vLLM" cell


# Score the summaries in fixed-size batches of products
batch_size = 5
evaluation_results = []

for i in range(0, len(products), batch_size):
    batch_products = products[i:i+batch_size]

    # One prompt per product in the current batch
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # Assumes vLLM returns one result group per prompt, in prompt order
    outputs = model.generate(prompts, sampling_params)

    # Record every sampled completion under the title of its product
    for product, output_group in zip(batch_products, outputs):
        product_title = product.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {k + 1}**\n\nResponse: {generated_text}\n"
            for k, generated_text in enumerate(
                completion.text for completion in output_group.outputs
            )
        )

    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Write all collected responses to a single text file once every batch is done
with open("m2_fa_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))


    

# Relevance

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# Decode explicitly as UTF-8 (the encoding the results file is written with)
# instead of relying on the platform's locale default.
with open("m2.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the Relevance metric. The {placeholders} are filled by
# evaluate_opinion_summary through str.format, so any literal brace added to
# this text would have to be doubled.
# NOTE(review): the text contains small wording slips ("rating.  and a
# corresponding summary", "information. or contains", "idenify the how many");
# they are left untouched here because editing the wording changes what the
# judge model sees.
# NOTE(review): vLLM may prepend its own BOS token when it tokenizes a string
# prompt, which would duplicate <|begin_of_text|> - verify.
relevance_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Relevance - Relevance measures the selection of important information from the input, including product title, description, key features, specifications, reviews, and average rating. The summary should include only important and relevant information from the input. Summaries should not contain redundancies or excess information.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary misses all the important opinions majorly discussed in the set of information.
<score>2</score> - The summary misses most of the important opinions majorly discussed in the set of information or mostly has redundant/excess/unimportant details
<score>3</score> - The summary covers around half of the important opinions majorly discussed in the set of information. or contains redundant/excess/unimportant details.
<score>4</score> - The summary covers most of the important opinions majorly discussed in the set of information and has very less amount of redundant/excess/unimportant details.
<score>5</score> - The summary covers all the important opinions majorly discussed in the set of information and has no redundant/excess/unimportant details.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify all the important opinions majorly discussed in the set of information and list them with numbering.
2. Identify the important opinions present in the summary and list them with numbering.
3. Next identify how many important opinions are present in both summary and the set of information and list them with numbering
4. Next idenify the how many redundant/excess/unimportant details does the summary have and list them with numbering.
5. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on relevance and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Turn grouped specifications into a text block for the prompt.

    Each group is indexed with ``'key'`` (heading) and ``'values'`` (entries
    indexed with ``'key'`` and ``'value'``). A group renders as ``heading:``
    followed by one ``- name: value`` line per entry, and consecutive groups
    are separated by a blank line.

    Raises:
        KeyError: if a group or entry lacks one of the keys above.
    """
    def render(spec):
        header = f"{spec['key']}:\n"
        bullets = "".join(f"- {value['key']}: {value['value']}\n" for value in spec['values'])
        return header + bullets

    return "\n".join(render(spec) for spec in specifications)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the relevance-evaluation prompt for one product record.

    Only the prompt string is produced here; no model call is made.

    Args:
        product: mapping read with ``.get`` for ``product_title``,
            ``description``, ``key_features``, ``specifications``,
            ``reviews``, ``averageRating`` and ``product_opinion_summary``.

    Returns:
        ``relevance_prompt_template`` with every placeholder filled in.
        Missing or empty description, key features, specifications and
        reviews are rendered as "N/A"; an absent ``averageRating`` key is
        rendered as "N/A"; an absent title or summary becomes "".
    """
    features = product.get("key_features")
    specs = product.get("specifications")
    review_texts = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(features) if features else "N/A",
        "specifications": format_specifications(specs) if specs else "N/A",
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return relevance_prompt_template.format(**fields)

# Sampling parameters: reuses the `sampling_params` object created in the "Init vLLM" cell


# Score the summaries in fixed-size batches of products
batch_size = 5
evaluation_results = []

for i in range(0, len(products), batch_size):
    batch_products = products[i:i+batch_size]

    # One prompt per product in the current batch
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # Assumes vLLM returns one result group per prompt, in prompt order
    outputs = model.generate(prompts, sampling_params)

    # Record every sampled completion under the title of its product
    for product, output_group in zip(batch_products, outputs):
        product_title = product.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {k + 1}**\n\nResponse: {generated_text}\n"
            for k, generated_text in enumerate(
                completion.text for completion in output_group.outputs
            )
        )

    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Write all collected responses to a single text file once every batch is done
with open("m2_re_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))




# Sentiment Consistency

In [None]:
import json

# Load the JSON file containing the product information and opinion summaries.
# Decode explicitly as UTF-8 (the encoding the results file is written with)
# instead of relying on the platform's locale default.
with open("m2.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the Sentiment Consistency metric. Unlike the other metric
# prompts in this notebook it only has {reviews}, {average_rating} and
# {Product_Opinion_Summary} placeholders, filled by evaluate_opinion_summary
# through str.format; any literal brace added to this text would have to be
# doubled.
# NOTE(review): vLLM may prepend its own BOS token when it tokenizes a string
# prompt, which would duplicate <|begin_of_text|> - verify.
sentiment_consistency_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description: 
You will be given a set of information such as reviews, and average rating and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Sentiment Consistency - Sentiment Consistency measures how accurately the summary reflects the consensus sentiment of users for each aspect of the product as expressed in the reviews. The consensus sentiment (or majority sentiment) for an aspect is determined by the most common sentiment expressed by users, categorized as very positive, positive, neutral, negative, or very negative.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - None of the aspects present in summary have the same majority sentiment as in reviews.
<score>2</score> - Very few of the aspects present in summary have the same majority sentiment as in reviews.
<score>3</score> - Only around half of the aspects present in summary have the same majority sentiment as in reviews.
<score>4</score> - Most of the aspects present in summary have the same majority sentiment as in reviews.
<score>5</score> - All aspects present in summary have the same majority sentiment as in reviews.


Product Reviews: {reviews}

Average Rating: {average_rating}

Summary to Evaluate: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the aspects and their sentiment present in the summary and list them with numbering.
2. For the list of aspects identified, identify the majority sentiment from the reviews and list them with numbering.
3. Next identify how many aspect and sentiment match between reviews and summary from above and list them with numbering.
4. Finally use the previous information to output only a single score within <score></score> tags only using the evaluation criteria.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on sentiment preservation of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>
'''

# Function to evaluate a single opinion summary

def evaluate_opinion_summary(product):
    """Build the sentiment-consistency prompt for one product record.

    Reads ``reviews``, ``averageRating`` and ``product_opinion_summary``
    from ``product``. Missing or empty reviews and a missing rating are
    rendered as "N/A"; a missing summary is rendered as an empty string.

    Returns the filled-in prompt string.
    """
    review_list = product.get("reviews")
    fields = {
        "reviews": "\n".join(review_list) if review_list else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return sentiment_consistency_prompt_template.format(**fields)


# Sampling parameters: `sampling_params` is not created in this cell; it must
# already be defined when the cell runs.

# Score the products in fixed-size batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_start in range(0, len(products), batch_size):
    # Slice out the next group of products and build one prompt per product
    batch_products = products[batch_start:batch_start + batch_size]
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # One generation call per batch
    outputs = model.generate(prompts, sampling_params)

    # Label every sampled completion with the 1-based position of its product
    for offset, request_output in enumerate(outputs):
        product_number = batch_start + offset + 1
        evaluation_results.extend(
            f"**Product {product_number} - Evaluation {n + 1}**\n\nResponse: {completion.text}\n"
            for n, completion in enumerate(request_output.outputs)
        )

    print(f"Processed batch {batch_start // batch_size + 1} of {total_batches}")

# Write all collected responses to a single text file
with open("m2_sc_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))

# Specificity

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The file is read explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding (the result files are written as UTF-8 too).
with open("m2.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt template for the Specificity metric. It is filled with str.format by
# evaluate_opinion_summary using the placeholders {product_title},
# {description}, {key_features}, {specifications}, {reviews},
# {average_rating} and {Product_Opinion_Summary}. The text is a single user
# turn followed by an opening assistant header, so the model continues with
# its answer; the score is requested inside <score></score> tags.
# NOTE(review): the literal starts with <|begin_of_text|>; if the tokenizer
# used by model.generate also prepends a BOS token, the prompt would carry it
# twice — TODO confirm.
specificity_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Generic Opinion example: The battery is good.
Specific Opinion example: The battery lasts for more than 12 hours on a single charge.

Specificity - Specificity measures the level of detail and precision in the information and opinions presented in the summary. A specific summary provides concrete facts, measurements, or detailed descriptions about the product's features, performance, and user experiences. It avoids vague or general statements and instead offers precise information that gives readers a clear and thorough understanding of the product's characteristics and performance. 

Summaries should be penalized for including vague or generic statements and rewarded for providing detailed, precise information about the product and user experiences.

<score>1</score> - All the opinions presented in the summary are generic.
<score>2</score> - Most of the opinions presented are generic.
<score>3</score> - Only around half of the opinions presented are specific.
<score>4</score> - Most of the opinions presented in the summary are specific. Very few opinions are generic.
<score>5</score> - All the opinions presented in the summary are specific 


Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all the opinions presented.
2. Check if details are presented for the opinions. Classify each opinion as specific or generic.
3. Count the number of generic and specific occurrences.
4. Finally use the previous information to output only a single score within <score></score> tags only using the evaluation criteria.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on specificity of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Render specification groups as plain text for the prompt.

    Each group contributes a ``<key>:`` header line followed by one
    ``- <key>: <value>`` line per entry in its ``values`` list. Groups are
    separated by a newline; an empty input yields an empty string.
    """
    rendered_groups = []
    for group in specifications:
        lines = [f"{group['key']}:\n"]
        lines.extend(f"- {entry['key']}: {entry['value']}\n" for entry in group['values'])
        rendered_groups.append("".join(lines))
    return "\n".join(rendered_groups)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the specificity prompt for one product record.

    Empty or missing description, key features, specifications and reviews
    are rendered as "N/A", as is a missing average rating. A missing title
    or summary is rendered as an empty string.

    Returns the filled-in prompt string.
    """
    key_features = product.get("key_features")
    specifications = product.get("specifications")
    reviews = product.get("reviews")

    return specificity_prompt_template.format(
        product_title=product.get("product_title", ""),
        description=product.get("description") or "N/A",
        key_features="\n".join(key_features) if key_features else "N/A",
        specifications=format_specifications(specifications) if specifications else "N/A",
        reviews="\n".join(reviews) if reviews else "N/A",
        average_rating=product.get("averageRating", "N/A"),
        Product_Opinion_Summary=product.get("product_opinion_summary", ""),
    )

# Sampling parameters: `sampling_params` is not created in this cell; it must
# already be defined when the cell runs.

# Score the products in fixed-size batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_start in range(0, len(products), batch_size):
    # Slice out the next group of products and build one prompt per product
    batch_products = products[batch_start:batch_start + batch_size]
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # One generation call per batch
    outputs = model.generate(prompts, sampling_params)

    # Label every sampled completion with the title of the product it belongs to
    for entry, request_output in zip(batch_products, outputs):
        product_title = entry.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {n + 1}**\n\nResponse: {completion.text}\n"
            for n, completion in enumerate(request_output.outputs)
        )

    print(f"Processed batch {batch_start // batch_size + 1} of {total_batches}")

# Write all collected responses to a single text file
with open("m2_sp_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))
    

    

# Llama 3 summaries

## Aspect Coverage

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The file is read explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding (the result files are written as UTF-8 too).
with open("l.json", "r", encoding="utf-8") as file:
    products = json.load(file)
    
# Prompt template for the Aspect Coverage metric. It is filled with str.format
# by evaluate_opinion_summary using the placeholders {product_title},
# {description}, {key_features}, {specifications}, {reviews},
# {average_rating} and {Product_Opinion_Summary}. The text is a single user
# turn followed by an opening assistant header, so the model continues with
# its answer; the score is requested inside <score></score> tags.
# NOTE(review): the literal starts with <|begin_of_text|>; if the tokenizer
# used by model.generate also prepends a BOS token, the prompt would carry it
# twice — TODO confirm.
aspect_coverage_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating. You will then be given one summary written for the set of information. Your task is to rate the summary on one metric. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Aspect Coverage - Aspect Coverage measures how completely a summary captures the major features, characteristics, or attributes of a product that are prominently discussed in the original product information. Summaries should be penalized for missing any major aspects and rewarded for covering all important aspects thoroughly.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - Summary does not cover any important aspects present in the set of information.
<score>2</score> - Summary does not cover most of the important aspects present in the set of information.
<score>3</score> - Summary covers around half of the important aspects present in the set of information.
<score>4</score> - Summary covers most of the important aspects present in the set of information.
<score>5</score> - Summary covers all the important aspects discussed in the set of information.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the important aspects present in the set of information and list them with numbering.
2. Identify the important aspects present in the summary and list them with numbering.
3. Identify the important aspects covered by the summary that are present in the set of information and list them with numbering.
4. Calculate the total number of important aspects covered by the summary that are present in the set of information.
5. Calculate the total number of important aspects present in the set of information.
6. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation of how much is the coverage and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''
# Function to format specifications
def format_specifications(specifications):
    """Turn the list of specification groups into prompt-ready text.

    A group is rendered as its ``key`` followed by a colon, then one
    ``- <key>: <value>`` line for every item of its ``values`` list. The
    rendered groups are joined with a newline.
    """
    def render_group(group):
        header = f"{group['key']}:\n"
        body = "".join(f"- {item['key']}: {item['value']}\n" for item in group['values'])
        return header + body

    return "\n".join(render_group(group) for group in specifications)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the aspect-coverage prompt for one product record.

    Empty or missing description, key features, specifications and reviews
    are rendered as "N/A", as is a missing average rating. A missing title
    or summary is rendered as an empty string.

    Returns the filled-in prompt string.
    """
    key_features = product.get("key_features")
    specifications = product.get("specifications")
    reviews = product.get("reviews")

    return aspect_coverage_prompt_template.format(
        product_title=product.get("product_title", ""),
        description=product.get("description") or "N/A",
        key_features="\n".join(key_features) if key_features else "N/A",
        specifications=format_specifications(specifications) if specifications else "N/A",
        reviews="\n".join(reviews) if reviews else "N/A",
        average_rating=product.get("averageRating", "N/A"),
        Product_Opinion_Summary=product.get("product_opinion_summary", ""),
    )

# Sampling parameters: `sampling_params` is not created in this cell; it must
# already be defined when the cell runs.

# Score the products in fixed-size batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_start in range(0, len(products), batch_size):
    # Slice out the next group of products and build one prompt per product
    batch_products = products[batch_start:batch_start + batch_size]
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # One generation call per batch
    outputs = model.generate(prompts, sampling_params)

    # Label every sampled completion with the title of the product it belongs to
    for entry, request_output in zip(batch_products, outputs):
        product_title = entry.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {n + 1}**\n\nResponse: {completion.text}\n"
            for n, completion in enumerate(request_output.outputs)
        )

    print(f"Processed batch {batch_start // batch_size + 1} of {total_batches}")

# Write all collected responses to a single text file
with open("l_ac_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))



# Fluency 

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The file is read explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding (the result files are written as UTF-8 too).
with open("l.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt template for the Fluency metric. It is filled with str.format by
# evaluate_opinion_summary using the placeholders {product_title},
# {description}, {key_features}, {specifications}, {reviews},
# {average_rating} and {Product_Opinion_Summary}. The text is a single user
# turn followed by an opening assistant header, so the model continues with
# its answer; the score is requested inside <score></score> tags.
# NOTE(review): the literal starts with <|begin_of_text|>; if the tokenizer
# used by model.generate also prepends a BOS token, the prompt would carry it
# twice — TODO confirm.
fluency_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Fluency : The quality of summary in terms of grammar, spelling, punctuation, capitalization, word choice, and sentence structure and should contain no errors. The summary should be easy to read, follow, comprehend and should contain no errors.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary is all garbled and does not make any sense.
<score>2</score> - The summary has grammatical errors that make it hard to understand or sound unnatural.
<score>3</score> - The summary has errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
<score>4</score> - The summary has very few errors, but it is easy to read, follow and comprehend.
<score>5</score> - The summary is extremely fluent and is easy to read, follow and comprehend.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the sentences presented in the summary and list them with numbering.
2. Go through each sentence and list down if there are any fluency problems.
3. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on fluency of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Flatten specification sections into text for the prompt.

    For every section a ``<key>:`` line is written, followed by a
    ``- <key>: <value>`` line for each element of the section's ``values``.
    Sections are joined by a newline; no sections gives an empty string.
    """
    blocks = []
    for section in specifications:
        pieces = [f"{section['key']}:\n"]
        for attribute in section['values']:
            pieces.append(f"- {attribute['key']}: {attribute['value']}\n")
        blocks.append("".join(pieces))
    return "\n".join(blocks)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the fluency prompt for one product record.

    Empty or missing description, key features, specifications and reviews
    are rendered as "N/A", as is a missing average rating. A missing title
    or summary is rendered as an empty string.

    Returns the filled-in prompt string.
    """
    key_features = product.get("key_features")
    specifications = product.get("specifications")
    reviews = product.get("reviews")

    return fluency_prompt_template.format(
        product_title=product.get("product_title", ""),
        description=product.get("description") or "N/A",
        key_features="\n".join(key_features) if key_features else "N/A",
        specifications=format_specifications(specifications) if specifications else "N/A",
        reviews="\n".join(reviews) if reviews else "N/A",
        average_rating=product.get("averageRating", "N/A"),
        Product_Opinion_Summary=product.get("product_opinion_summary", ""),
    )

# Sampling parameters: `sampling_params` is not created in this cell; it must
# already be defined when the cell runs.

# Score the products in fixed-size batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_start in range(0, len(products), batch_size):
    # Slice out the next group of products and build one prompt per product
    batch_products = products[batch_start:batch_start + batch_size]
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # One generation call per batch
    outputs = model.generate(prompts, sampling_params)

    # Label every sampled completion with the title of the product it belongs to
    for entry, request_output in zip(batch_products, outputs):
        product_title = entry.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {n + 1}**\n\nResponse: {completion.text}\n"
            for n, completion in enumerate(request_output.outputs)
        )

    print(f"Processed batch {batch_start // batch_size + 1} of {total_batches}")

# Write all collected responses to a single text file
with open("l_fl_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))



# Coherence

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The file is read explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding (the result files are written as UTF-8 too).
with open("l.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt template for the Coherence metric. It is filled with str.format by
# evaluate_opinion_summary using the placeholders {product_title},
# {description}, {key_features}, {specifications}, {reviews},
# {average_rating} and {Product_Opinion_Summary}. The text is a single user
# turn followed by an opening assistant header, so the model continues with
# its answer; the score is requested inside <score></score> tags.
# NOTE(review): the literal starts with <|begin_of_text|>; if the tokenizer
# used by model.generate also prepends a BOS token, the prompt would carry it
# twice — TODO confirm.
coherence_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Coherence - The collective quality of all sentences. The summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to a coherent body of information.
Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary lacks structure and logical flow, resulting in disjointed ideas and significant inconsistencies, making it confusing and challenging to follow.
<score>2</score> - The summary attempts coherence but struggles with occasional lapses in logic, clarity issues, and insufficiently connected ideas, leading to a somewhat disjointed presentation.
<score>3</score> - The summary displays a reasonable level of coherence with a logical sequence, yet occasional disruptions in flow and clarity, requiring some improvements for a smoother transition between ideas.
<score>4</score> - The summary demonstrates strong coherence, maintaining a clear and organized flow with effective transitions and minimal inconsistencies, effectively conveying main points with clarity and precision.
<score>5</score> - The summary showcases exceptional coherence with a flawless logical flow, impeccable transitions, and consistent clarity, presenting information in an impeccably organized and easily comprehensible manner.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all pieces of information in the summary.
2. Check if everything is presented in a clear and logical order. Give a clear step-by-step explanation of what you found and what is lacking.
3. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation on coherence of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''

# Function to format specifications
def format_specifications(specifications):
    """Render specification groups as plain text for the prompt.

    Each group contributes a ``<key>:`` header line followed by one
    ``- <key>: <value>`` line per entry in its ``values`` list. Groups are
    separated by a newline; an empty input yields an empty string.
    """
    rendered_groups = []
    for group in specifications:
        lines = [f"{group['key']}:\n"]
        lines.extend(f"- {entry['key']}: {entry['value']}\n" for entry in group['values'])
        rendered_groups.append("".join(lines))
    return "\n".join(rendered_groups)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the coherence prompt for one product record.

    Empty or missing description, key features, specifications and reviews
    are rendered as "N/A", as is a missing average rating. A missing title
    or summary is rendered as an empty string.

    Returns the filled-in prompt string.
    """
    key_features = product.get("key_features")
    specifications = product.get("specifications")
    reviews = product.get("reviews")

    return coherence_prompt_template.format(
        product_title=product.get("product_title", ""),
        description=product.get("description") or "N/A",
        key_features="\n".join(key_features) if key_features else "N/A",
        specifications=format_specifications(specifications) if specifications else "N/A",
        reviews="\n".join(reviews) if reviews else "N/A",
        average_rating=product.get("averageRating", "N/A"),
        Product_Opinion_Summary=product.get("product_opinion_summary", ""),
    )

# Sampling parameters: `sampling_params` is not created in this cell; it must
# already be defined when the cell runs.

# Score the products in fixed-size batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_start in range(0, len(products), batch_size):
    # Slice out the next group of products and build one prompt per product
    batch_products = products[batch_start:batch_start + batch_size]
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # One generation call per batch
    outputs = model.generate(prompts, sampling_params)

    # Label every sampled completion with the title of the product it belongs to
    for entry, request_output in zip(batch_products, outputs):
        product_title = entry.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {n + 1}**\n\nResponse: {completion.text}\n"
            for n, completion in enumerate(request_output.outputs)
        )

    print(f"Processed batch {batch_start // batch_size + 1} of {total_batches}")

# Write all collected responses to a single text file
with open("l_co_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))




# Faithfulness

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The file is read explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding (the result files are written as UTF-8 too).
with open("l.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt template for the Faithfulness metric. It is filled with str.format by
# evaluate_opinion_summary using the placeholders {product_title},
# {description}, {key_features}, {specifications}, {reviews},
# {average_rating} and {Product_Opinion_Summary}. The text is a single user
# turn followed by an opening assistant header, so the model continues with
# its answer; the score is requested inside <score></score> tags.
# NOTE(review): the literal starts with <|begin_of_text|>; if the tokenizer
# used by model.generate also prepends a BOS token, the prompt would carry it
# twice — TODO confirm.
faithfulness_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Faithfulness - Faithfulness measures the extent to which every piece of information mentioned in the summary is verifiable, supported, present, or can be reasonably inferred from the input. The input includes product title, description, key features, specifications, reviews, and average rating. Summaries should be penalized if they contain information that cannot be verified from the provided input or if they make broad generalizations that are not supported by the input data.

Following are the scores and the evaluation criteria according to which scores must be assigned.

<score>1</score> - The summary is for a different product and is irrelevant/unrelated to the given set of information.
<score>2</score> - The summary contains very few facts actually verifiable/supported/present/inferred from the set of information and contains a lot of hallucinated facts.
<score>3</score> - The summary contains more than one piece of information that is not verifiable/present/inferred from the set of information.
<score>4</score> - The summary contains only one piece of information that is not verifiable/supported/present/inferred from the the set of information.
<score>5</score> - Every piece of information present in the summary is verifiable/supported/present/inferred from the set of information.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all pieces of information in the summary that is not verifiable/supported/present/inferred from the set of information. Give a clear step-by-step explanation of what you found.
2. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on faithfulness and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Turn the list of specification groups into prompt-ready text.

    A group is rendered as its ``key`` followed by a colon, then one
    ``- <key>: <value>`` line for every item of its ``values`` list. The
    rendered groups are joined with a newline.
    """
    def render_group(group):
        header = f"{group['key']}:\n"
        body = "".join(f"- {item['key']}: {item['value']}\n" for item in group['values'])
        return header + body

    return "\n".join(render_group(group) for group in specifications)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the faithfulness prompt for one product record.

    Empty or missing description, key features, specifications and reviews
    are rendered as "N/A", as is a missing average rating. A missing title
    or summary is rendered as an empty string.

    Returns the filled-in prompt string.
    """
    key_features = product.get("key_features")
    specifications = product.get("specifications")
    reviews = product.get("reviews")

    return faithfulness_prompt_template.format(
        product_title=product.get("product_title", ""),
        description=product.get("description") or "N/A",
        key_features="\n".join(key_features) if key_features else "N/A",
        specifications=format_specifications(specifications) if specifications else "N/A",
        reviews="\n".join(reviews) if reviews else "N/A",
        average_rating=product.get("averageRating", "N/A"),
        Product_Opinion_Summary=product.get("product_opinion_summary", ""),
    )

# Sampling parameters: `sampling_params` is not created in this cell; it must
# already be defined when the cell runs.

# Score the products in fixed-size batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_start in range(0, len(products), batch_size):
    # Slice out the next group of products and build one prompt per product
    batch_products = products[batch_start:batch_start + batch_size]
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # One generation call per batch
    outputs = model.generate(prompts, sampling_params)

    # Label every sampled completion with the title of the product it belongs to
    for entry, request_output in zip(batch_products, outputs):
        product_title = entry.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {n + 1}**\n\nResponse: {completion.text}\n"
            for n, completion in enumerate(request_output.outputs)
        )

    print(f"Processed batch {batch_start // batch_size + 1} of {total_batches}")

# Write all collected responses to a single text file
with open("l_fa_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))


    

# Relevance

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The file is read explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding (the result files are written as UTF-8 too).
with open("l.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt template for the Relevance metric. It is filled with str.format by
# evaluate_opinion_summary using the placeholders {product_title},
# {description}, {key_features}, {specifications}, {reviews},
# {average_rating} and {Product_Opinion_Summary}. The text is a single user
# turn followed by an opening assistant header, so the model continues with
# its answer; the score is requested inside <score></score> tags.
# NOTE(review): the literal starts with <|begin_of_text|>; if the tokenizer
# used by model.generate also prepends a BOS token, the prompt would carry it
# twice — TODO confirm.
relevance_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Relevance - Relevance measures the selection of important information from the input, including product title, description, key features, specifications, reviews, and average rating. The summary should include only important and relevant information from the input. Summaries should not contain redundancies or excess information.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary misses all the important opinions majorly discussed in the set of information.
<score>2</score> - The summary misses most of the important opinions majorly discussed in the set of information or mostly has redundant/excess/unimportant details
<score>3</score> - The summary covers around half of the important opinions majorly discussed in the set of information. or contains redundant/excess/unimportant details.
<score>4</score> - The summary covers most of the important opinions majorly discussed in the set of information and has very less amount of redundant/excess/unimportant details.
<score>5</score> - The summary covers all the important opinions majorly discussed in the set of information and has no redundant/excess/unimportant details.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify all the important opinions majorly discussed in the set of information and list them with numbering.
2. Identify the important opinions present in the summary and list them with numbering.
3. Next identify how many important opinions are present in both summary and the set of information and list them with numbering
4. Next idenify the how many redundant/excess/unimportant details does the summary have and list them with numbering.
5. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on relevance and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Flatten specification sections into text for the prompt.

    For every section a ``<key>:`` line is written, followed by a
    ``- <key>: <value>`` line for each element of the section's ``values``.
    Sections are joined by a newline; no sections gives an empty string.
    """
    blocks = []
    for section in specifications:
        pieces = [f"{section['key']}:\n"]
        for attribute in section['values']:
            pieces.append(f"- {attribute['key']}: {attribute['value']}\n")
        blocks.append("".join(pieces))
    return "\n".join(blocks)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the relevance prompt for one product record.

    Empty or missing description, key features, specifications and reviews
    are rendered as "N/A", as is a missing average rating. A missing title
    or summary is rendered as an empty string.

    Returns the filled-in prompt string.
    """
    key_features = product.get("key_features")
    specifications = product.get("specifications")
    reviews = product.get("reviews")

    return relevance_prompt_template.format(
        product_title=product.get("product_title", ""),
        description=product.get("description") or "N/A",
        key_features="\n".join(key_features) if key_features else "N/A",
        specifications=format_specifications(specifications) if specifications else "N/A",
        reviews="\n".join(reviews) if reviews else "N/A",
        average_rating=product.get("averageRating", "N/A"),
        Product_Opinion_Summary=product.get("product_opinion_summary", ""),
    )

# Sampling parameters: `sampling_params` is not created in this cell; it must
# already be defined when the cell runs.

# Score the products in fixed-size batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_start in range(0, len(products), batch_size):
    # Slice out the next group of products and build one prompt per product
    batch_products = products[batch_start:batch_start + batch_size]
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # One generation call per batch
    outputs = model.generate(prompts, sampling_params)

    # Label every sampled completion with the title of the product it belongs to
    for entry, request_output in zip(batch_products, outputs):
        product_title = entry.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {n + 1}**\n\nResponse: {completion.text}\n"
            for n, completion in enumerate(request_output.outputs)
        )

    print(f"Processed batch {batch_start // batch_size + 1} of {total_batches}")

# Write all collected responses to a single text file
with open("l_re_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))




# Sentiment Consistency

In [None]:
import json

# Load the JSON file containing the product information and opinion summaries.
# The batch loop in this cell slices the result and calls .get() on its items,
# so it is presumably a list of product dicts -- verify against the data file.
# The file is decoded explicitly as UTF-8 so reading does not depend on the
# platform's locale encoding (the result files are written as UTF-8 too).
with open("l.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the sentiment-consistency metric, wrapped in the
# <|start_header_id|>/<|eot_id|> chat markers used by the prompts in this
# notebook. It is filled with str.format(); placeholders are {reviews},
# {average_rating} and {Product_Opinion_Summary}. The prompt wording is kept
# verbatim: any change would alter what the judge model sees.
sentiment_consistency_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description: 
You will be given a set of information such as reviews, and average rating and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Sentiment Consistency - Sentiment Consistency measures how accurately the summary reflects the consensus sentiment of users for each aspect of the product as expressed in the reviews. The consensus sentiment (or majority sentiment) for an aspect is determined by the most common sentiment expressed by users, categorized as very positive, positive, neutral, negative, or very negative.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - None of the aspects present in summary have the same majority sentiment as in reviews.
<score>2</score> - Very few of the aspects present in summary have the same majority sentiment as in reviews.
<score>3</score> - Only around half of the aspects present in summary have the same majority sentiment as in reviews.
<score>4</score> - Most of the aspects present in summary have the same majority sentiment as in reviews.
<score>5</score> - All aspects present in summary have the same majority sentiment as in reviews.


Product Reviews: {reviews}

Average Rating: {average_rating}

Summary to Evaluate: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the aspects and their sentiment present in the summary and list them with numbering.
2. For the list of aspects identified, identify the majority sentiment from the reviews and list them with numbering.
3. Next identify how many aspect and sentiment match between reviews and summary from above and list them with numbering.
4. Finally use the previous information to output only a single score within <score></score> tags only using the evaluation criteria.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on sentiment preservation of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>
'''

# Function to evaluate a single opinion summary

def evaluate_opinion_summary(product):
    """Build the sentiment-consistency prompt for a single product record.

    Only the reviews, the average rating and the opinion summary are read
    from ``product`` (via ``.get``) and substituted into
    ``sentiment_consistency_prompt_template``. Missing or empty reviews and a
    missing ``averageRating`` are rendered as ``"N/A"``; a missing summary
    becomes an empty string.

    Returns:
        The fully formatted prompt string. No model call is made here.
    """
    review_texts = product.get("reviews")
    fields = {
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return sentiment_consistency_prompt_template.format(**fields)


# Sampling parameters: nothing is configured in this cell. The loop below
# reuses the notebook-level `sampling_params` and `model` objects.



# Run the sentiment-consistency evaluation over every loaded product, sending
# `batch_size` prompts per model.generate() call.

batch_size = 5
evaluation_results = []  # one formatted text entry per generated evaluation

for i in range(0, len(products), batch_size):
    batch_products = products[i:i+batch_size]


    # Prepare prompts for the current batch
    prompts = [evaluate_opinion_summary(product) for product in batch_products]

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # Process the outputs. Entries are labelled with the 1-based position of
    # the product in `products` (i + j + 1), not with its title; this assumes
    # results come back in prompt order.
    for j, output_group in enumerate(outputs):
        # A prompt can yield several completions (`n` in `sampling_params`);
        # each one is stored as its own numbered evaluation.
        for k, output in enumerate(output_group.outputs):
            generated_text = output.text
            evaluation_results.append(f"**Product {i + j + 1} - Evaluation {k + 1}**\n\nResponse: {generated_text}\n")

    # Progress report: current batch number out of ceil(len(products) / batch_size).
    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Save the evaluation results to a text file. Everything is written in one go
# after the last batch; entries are joined with a newline.

with open("l_sc_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))

# Specificity

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The batch loop in this cell slices the result and calls .get() on its items,
# so it is presumably a list of product dicts -- verify against the data file.
# The file is decoded explicitly as UTF-8 so reading does not depend on the
# platform's locale encoding (the result files are written as UTF-8 too).
with open("l.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the specificity metric, wrapped in the
# <|start_header_id|>/<|eot_id|> chat markers used by the prompts in this
# notebook. It is filled with str.format(); placeholders are {product_title},
# {description}, {key_features}, {specifications}, {reviews},
# {average_rating} and {Product_Opinion_Summary}. The prompt wording is kept
# verbatim: any change would alter what the judge model sees.
specificity_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Generic Opinion example: The battery is good.
Specific Opinion example: The battery lasts for more than 12 hours on a single charge.

Specificity - Specificity measures the level of detail and precision in the information and opinions presented in the summary. A specific summary provides concrete facts, measurements, or detailed descriptions about the product's features, performance, and user experiences. It avoids vague or general statements and instead offers precise information that gives readers a clear and thorough understanding of the product's characteristics and performance. 

Summaries should be penalized for including vague or generic statements and rewarded for providing detailed, precise information about the product and user experiences.

<score>1</score> - All the opinions presented in the summary are generic.
<score>2</score> - Most of the opinions presented are generic.
<score>3</score> - Only around half of the opinions presented are specific.
<score>4</score> - Most of the opinions presented in the summary are specific. Very few opinions are generic.
<score>5</score> - All the opinions presented in the summary are specific 


Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all the opinions presented.
2. Check if details are presented for the opinions. Classify each opinion as specific or generic.
3. Count the number of generic and specific occurrences.
4. Finally use the previous information to output only a single score within <score></score> tags only using the evaluation criteria.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on specificity of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Render grouped specifications as plain text for the prompt.

    Each element of ``specifications`` is read as a mapping with a ``'key'``
    (the group heading) and ``'values'`` (an iterable of mappings that each
    carry ``'key'`` and ``'value'``). A group is rendered as a ``"<heading>:"``
    line followed by one ``"- <key>: <value>"`` line per value. Groups are
    joined with a newline, which leaves a blank line between them.

    Returns:
        The combined text, or an empty string for an empty input.
    """
    sections = []
    for group in specifications:
        lines = [f"{group['key']}:\n"]
        lines.extend(
            f"- {item['key']}: {item['value']}\n" for item in group['values']
        )
        sections.append("".join(lines))
    return "\n".join(sections)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the specificity-evaluation prompt for a single product record.

    Values are read from ``product`` with ``.get`` and substituted into
    ``specificity_prompt_template``. A missing or empty description, key
    features, specifications or reviews entry is rendered as ``"N/A"``; a
    missing ``averageRating`` is also ``"N/A"``, while a missing title or
    opinion summary becomes an empty string.

    Returns:
        The fully formatted prompt string. No model call is made here.
    """
    def joined_or_na(field):
        # Newline-join a list-valued field, or fall back to the placeholder.
        items = product.get(field)
        return "\n".join(items) if items else "N/A"

    spec_groups = product.get("specifications")
    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": joined_or_na("key_features"),
        "specifications": format_specifications(spec_groups) if spec_groups else "N/A",
        "reviews": joined_or_na("reviews"),
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return specificity_prompt_template.format(**fields)

# Sampling parameters: nothing is configured in this cell. The loop below
# reuses the notebook-level `sampling_params` and `model` objects.


# Run the specificity evaluation over every loaded product, sending
# `batch_size` prompts per model.generate() call.
batch_size = 5
evaluation_results = []  # one formatted text entry per generated evaluation

for i in range(0, len(products), batch_size):
    batch_products = products[i:i+batch_size]

    # Prepare prompts for the current batch
    prompts = [evaluate_opinion_summary(product) for product in batch_products]

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # Process the outputs. outputs[j] is paired with batch_products[j], i.e.
    # results are assumed to come back in prompt order.
    for j, output_group in enumerate(outputs):
        product_title = batch_products[j].get("product_title", "")
        # A prompt can yield several completions (`n` in `sampling_params`);
        # each one is stored as its own numbered evaluation.
        for k, output in enumerate(output_group.outputs):
            generated_text = output.text
            evaluation_results.append(f"**{product_title} - Evaluation {k + 1}**\n\nResponse: {generated_text}\n")

    # Progress report: current batch number out of ceil(len(products) / batch_size).
    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Save the evaluation results to a text file. Everything is written in one go
# after the last batch; entries are joined with a newline.
with open("l_sp_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))
    

    

# Gemma summaries

## Aspect Coverage

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The batch loop in this cell slices the result and calls .get() on its items,
# so it is presumably a list of product dicts -- verify against the data file.
# The file is decoded explicitly as UTF-8 so reading does not depend on the
# platform's locale encoding (the result files are written as UTF-8 too).
with open("g.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the aspect-coverage metric, wrapped in the
# <|start_header_id|>/<|eot_id|> chat markers used by the prompts in this
# notebook. It is filled with str.format(); placeholders are {product_title},
# {description}, {key_features}, {specifications}, {reviews},
# {average_rating} and {Product_Opinion_Summary}. The prompt wording is kept
# verbatim: any change would alter what the judge model sees.
aspect_coverage_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating. You will then be given one summary written for the set of information. Your task is to rate the summary on one metric. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Aspect Coverage - Aspect Coverage measures how completely a summary captures the major features, characteristics, or attributes of a product that are prominently discussed in the original product information. Summaries should be penalized for missing any major aspects and rewarded for covering all important aspects thoroughly.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - Summary does not cover any important aspects present in the set of information.
<score>2</score> - Summary does not cover most of the important aspects present in the set of information.
<score>3</score> - Summary covers around half of the important aspects present in the set of information.
<score>4</score> - Summary covers most of the important aspects present in the set of information.
<score>5</score> - Summary covers all the important aspects discussed in the set of information.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the important aspects present in the set of information and list them with numbering.
2. Identify the important aspects present in the summary and list them with numbering.
3. Identify the important aspects covered by the summary that are present in the set of information and list them with numbering.
4. Calculate the total number of important aspects covered by the summary that are present in the set of information.
5. Calculate the total number of important aspects present in the set of information.
6. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation of how much is the coverage and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''
# Function to format specifications
def format_specifications(specifications):
    """Render grouped specifications as plain text for the prompt.

    Each element of ``specifications`` is read as a mapping with a ``'key'``
    (the group heading) and ``'values'`` (an iterable of mappings that each
    carry ``'key'`` and ``'value'``). A group is rendered as a ``"<heading>:"``
    line followed by one ``"- <key>: <value>"`` line per value. Groups are
    joined with a newline, which leaves a blank line between them.

    Returns:
        The combined text, or an empty string for an empty input.
    """
    sections = []
    for group in specifications:
        lines = [f"{group['key']}:\n"]
        lines.extend(
            f"- {item['key']}: {item['value']}\n" for item in group['values']
        )
        sections.append("".join(lines))
    return "\n".join(sections)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the aspect-coverage prompt for a single product record.

    Values are read from ``product`` with ``.get`` and substituted into
    ``aspect_coverage_prompt_template``. A missing or empty description, key
    features, specifications or reviews entry is rendered as ``"N/A"``; a
    missing ``averageRating`` is also ``"N/A"``, while a missing title or
    opinion summary becomes an empty string.

    Returns:
        The fully formatted prompt string. No model call is made here.
    """
    def joined_or_na(field):
        # Newline-join a list-valued field, or fall back to the placeholder.
        items = product.get(field)
        return "\n".join(items) if items else "N/A"

    spec_groups = product.get("specifications")
    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": joined_or_na("key_features"),
        "specifications": format_specifications(spec_groups) if spec_groups else "N/A",
        "reviews": joined_or_na("reviews"),
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return aspect_coverage_prompt_template.format(**fields)

# Sampling parameters: nothing is configured in this cell. The loop below
# reuses the notebook-level `sampling_params` and `model` objects.


# Run the aspect-coverage evaluation over every loaded product, sending
# `batch_size` prompts per model.generate() call.
batch_size = 5
evaluation_results = []  # one formatted text entry per generated evaluation

for i in range(0, len(products), batch_size):
    batch_products = products[i:i+batch_size]

    # Prepare prompts for the current batch
    prompts = [evaluate_opinion_summary(product) for product in batch_products]

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # Process the outputs. outputs[j] is paired with batch_products[j], i.e.
    # results are assumed to come back in prompt order.
    for j, output_group in enumerate(outputs):
        product_title = batch_products[j].get("product_title", "")
        # A prompt can yield several completions (`n` in `sampling_params`);
        # each one is stored as its own numbered evaluation.
        for k, output in enumerate(output_group.outputs):
            generated_text = output.text
            evaluation_results.append(f"**{product_title} - Evaluation {k + 1}**\n\nResponse: {generated_text}\n")

    # Progress report: current batch number out of ceil(len(products) / batch_size).
    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Save the evaluation results to a text file. Everything is written in one go
# after the last batch; entries are joined with a newline.
with open("g_ac_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))



# Fluency 

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The batch loop in this cell slices the result and calls .get() on its items,
# so it is presumably a list of product dicts -- verify against the data file.
# The file is decoded explicitly as UTF-8 so reading does not depend on the
# platform's locale encoding (the result files are written as UTF-8 too).
with open("g.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the fluency metric, wrapped in the
# <|start_header_id|>/<|eot_id|> chat markers used by the prompts in this
# notebook. It is filled with str.format(); placeholders are {product_title},
# {description}, {key_features}, {specifications}, {reviews},
# {average_rating} and {Product_Opinion_Summary}. The prompt wording is kept
# verbatim: any change would alter what the judge model sees.
fluency_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Fluency : The quality of summary in terms of grammar, spelling, punctuation, capitalization, word choice, and sentence structure and should contain no errors. The summary should be easy to read, follow, comprehend and should contain no errors.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary is all garbled and does not make any sense.
<score>2</score> - The summary has grammatical errors that make it hard to understand or sound unnatural.
<score>3</score> - The summary has errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
<score>4</score> - The summary has very few errors, but it is easy to read, follow and comprehend.
<score>5</score> - The summary is extremely fluent and is easy to read, follow and comprehend.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the sentences presented in the summary and list them with numbering.
2. Go through each sentence and list down if there are any fluency problems.
3. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on fluency of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Render grouped specifications as plain text for the prompt.

    Each element of ``specifications`` is read as a mapping with a ``'key'``
    (the group heading) and ``'values'`` (an iterable of mappings that each
    carry ``'key'`` and ``'value'``). A group is rendered as a ``"<heading>:"``
    line followed by one ``"- <key>: <value>"`` line per value. Groups are
    joined with a newline, which leaves a blank line between them.

    Returns:
        The combined text, or an empty string for an empty input.
    """
    sections = []
    for group in specifications:
        lines = [f"{group['key']}:\n"]
        lines.extend(
            f"- {item['key']}: {item['value']}\n" for item in group['values']
        )
        sections.append("".join(lines))
    return "\n".join(sections)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the fluency-evaluation prompt for a single product record.

    Values are read from ``product`` with ``.get`` and substituted into
    ``fluency_prompt_template``. A missing or empty description, key
    features, specifications or reviews entry is rendered as ``"N/A"``; a
    missing ``averageRating`` is also ``"N/A"``, while a missing title or
    opinion summary becomes an empty string.

    Returns:
        The fully formatted prompt string. No model call is made here.
    """
    def joined_or_na(field):
        # Newline-join a list-valued field, or fall back to the placeholder.
        items = product.get(field)
        return "\n".join(items) if items else "N/A"

    spec_groups = product.get("specifications")
    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": joined_or_na("key_features"),
        "specifications": format_specifications(spec_groups) if spec_groups else "N/A",
        "reviews": joined_or_na("reviews"),
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return fluency_prompt_template.format(**fields)

# Sampling parameters: nothing is configured in this cell. The loop below
# reuses the notebook-level `sampling_params` and `model` objects.


# Run the fluency evaluation over every loaded product, sending
# `batch_size` prompts per model.generate() call.
batch_size = 5
evaluation_results = []  # one formatted text entry per generated evaluation

for i in range(0, len(products), batch_size):
    batch_products = products[i:i+batch_size]

    # Prepare prompts for the current batch
    prompts = [evaluate_opinion_summary(product) for product in batch_products]

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # Process the outputs. outputs[j] is paired with batch_products[j], i.e.
    # results are assumed to come back in prompt order.
    for j, output_group in enumerate(outputs):
        product_title = batch_products[j].get("product_title", "")
        # A prompt can yield several completions (`n` in `sampling_params`);
        # each one is stored as its own numbered evaluation.
        for k, output in enumerate(output_group.outputs):
            generated_text = output.text
            evaluation_results.append(f"**{product_title} - Evaluation {k + 1}**\n\nResponse: {generated_text}\n")

    # Progress report: current batch number out of ceil(len(products) / batch_size).
    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Save the evaluation results to a text file. Everything is written in one go
# after the last batch; entries are joined with a newline.
with open("g_fl_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))



# Coherence

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The batch loop in this cell slices the result and calls .get() on its items,
# so it is presumably a list of product dicts -- verify against the data file.
# The file is decoded explicitly as UTF-8 so reading does not depend on the
# platform's locale encoding (the result files are written as UTF-8 too).
with open("g.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the coherence metric, wrapped in the
# <|start_header_id|>/<|eot_id|> chat markers used by the prompts in this
# notebook. It is filled with str.format(); placeholders are {product_title},
# {description}, {key_features}, {specifications}, {reviews},
# {average_rating} and {Product_Opinion_Summary}. The prompt wording is kept
# verbatim: any change would alter what the judge model sees.
coherence_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Coherence - The collective quality of all sentences. The summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to a coherent body of information.
Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary lacks structure and logical flow, resulting in disjointed ideas and significant inconsistencies, making it confusing and challenging to follow.
<score>2</score> - The summary attempts coherence but struggles with occasional lapses in logic, clarity issues, and insufficiently connected ideas, leading to a somewhat disjointed presentation.
<score>3</score> - The summary displays a reasonable level of coherence with a logical sequence, yet occasional disruptions in flow and clarity, requiring some improvements for a smoother transition between ideas.
<score>4</score> - The summary demonstrates strong coherence, maintaining a clear and organized flow with effective transitions and minimal inconsistencies, effectively conveying main points with clarity and precision.
<score>5</score> - The summary showcases exceptional coherence with a flawless logical flow, impeccable transitions, and consistent clarity, presenting information in an impeccably organized and easily comprehensible manner.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all pieces of information in the summary.
2. Check if everything is presented in a clear and logical order. Give a clear step-by-step explanation of what you found and what is lacking.
3. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation on coherence of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Render grouped specifications as plain text for the prompt.

    Each element of ``specifications`` is read as a mapping with a ``'key'``
    (the group heading) and ``'values'`` (an iterable of mappings that each
    carry ``'key'`` and ``'value'``). A group is rendered as a ``"<heading>:"``
    line followed by one ``"- <key>: <value>"`` line per value. Groups are
    joined with a newline, which leaves a blank line between them.

    Returns:
        The combined text, or an empty string for an empty input.
    """
    sections = []
    for group in specifications:
        lines = [f"{group['key']}:\n"]
        lines.extend(
            f"- {item['key']}: {item['value']}\n" for item in group['values']
        )
        sections.append("".join(lines))
    return "\n".join(sections)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the coherence-evaluation prompt for a single product record.

    Values are read from ``product`` with ``.get`` and substituted into
    ``coherence_prompt_template``. A missing or empty description, key
    features, specifications or reviews entry is rendered as ``"N/A"``; a
    missing ``averageRating`` is also ``"N/A"``, while a missing title or
    opinion summary becomes an empty string.

    Returns:
        The fully formatted prompt string. No model call is made here.
    """
    def joined_or_na(field):
        # Newline-join a list-valued field, or fall back to the placeholder.
        items = product.get(field)
        return "\n".join(items) if items else "N/A"

    spec_groups = product.get("specifications")
    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": joined_or_na("key_features"),
        "specifications": format_specifications(spec_groups) if spec_groups else "N/A",
        "reviews": joined_or_na("reviews"),
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return coherence_prompt_template.format(**fields)

# Sampling parameters: nothing is configured in this cell. The loop below
# reuses the notebook-level `sampling_params` and `model` objects.


# Run the coherence evaluation over every loaded product, sending
# `batch_size` prompts per model.generate() call.
batch_size = 5
evaluation_results = []  # one formatted text entry per generated evaluation

for i in range(0, len(products), batch_size):
    batch_products = products[i:i+batch_size]

    # Prepare prompts for the current batch
    prompts = [evaluate_opinion_summary(product) for product in batch_products]

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # Process the outputs. outputs[j] is paired with batch_products[j], i.e.
    # results are assumed to come back in prompt order.
    for j, output_group in enumerate(outputs):
        product_title = batch_products[j].get("product_title", "")
        # A prompt can yield several completions (`n` in `sampling_params`);
        # each one is stored as its own numbered evaluation.
        for k, output in enumerate(output_group.outputs):
            generated_text = output.text
            evaluation_results.append(f"**{product_title} - Evaluation {k + 1}**\n\nResponse: {generated_text}\n")

    # Progress report: current batch number out of ceil(len(products) / batch_size).
    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Save the evaluation results to a text file. Everything is written in one go
# after the last batch; entries are joined with a newline.
with open("g_co_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))




# Faithfulness

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The batch loop in this cell slices the result and calls .get() on its items,
# so it is presumably a list of product dicts -- verify against the data file.
# The file is decoded explicitly as UTF-8 so reading does not depend on the
# platform's locale encoding (the result files are written as UTF-8 too).
with open("g.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the faithfulness metric, wrapped in the
# <|start_header_id|>/<|eot_id|> chat markers used by the prompts in this
# notebook. It is filled with str.format(); placeholders are {product_title},
# {description}, {key_features}, {specifications}, {reviews},
# {average_rating} and {Product_Opinion_Summary}. The prompt wording is kept
# verbatim: any change would alter what the judge model sees.
faithfulness_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Faithfulness - Faithfulness measures the extent to which every piece of information mentioned in the summary is verifiable, supported, present, or can be reasonably inferred from the input. The input includes product title, description, key features, specifications, reviews, and average rating. Summaries should be penalized if they contain information that cannot be verified from the provided input or if they make broad generalizations that are not supported by the input data.

Following are the scores and the evaluation criteria according to which scores must be assigned.

<score>1</score> - The summary is for a different product and is irrelevant/unrelated to the given set of information.
<score>2</score> - The summary contains very few facts actually verifiable/supported/present/inferred from the set of information and contains a lot of hallucinated facts.
<score>3</score> - The summary contains more than one piece of information that is not verifiable/present/inferred from the set of information.
<score>4</score> - The summary contains only one piece of information that is not verifiable/supported/present/inferred from the the set of information.
<score>5</score> - Every piece of information present in the summary is verifiable/supported/present/inferred from the set of information.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all pieces of information in the summary that is not verifiable/supported/present/inferred from the set of information. Give a clear step-by-step explanation of what you found.
2. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on faithfulness and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Render grouped specifications as plain text for the prompt.

    Each element of ``specifications`` is read as a mapping with a ``'key'``
    (the group heading) and ``'values'`` (an iterable of mappings that each
    carry ``'key'`` and ``'value'``). A group is rendered as a ``"<heading>:"``
    line followed by one ``"- <key>: <value>"`` line per value. Groups are
    joined with a newline, which leaves a blank line between them.

    Returns:
        The combined text, or an empty string for an empty input.
    """
    sections = []
    for group in specifications:
        lines = [f"{group['key']}:\n"]
        lines.extend(
            f"- {item['key']}: {item['value']}\n" for item in group['values']
        )
        sections.append("".join(lines))
    return "\n".join(sections)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the faithfulness-evaluation prompt for a single product record.

    Values are read from ``product`` with ``.get`` and substituted into
    ``faithfulness_prompt_template``. A missing or empty description, key
    features, specifications or reviews entry is rendered as ``"N/A"``; a
    missing ``averageRating`` is also ``"N/A"``, while a missing title or
    opinion summary becomes an empty string.

    Returns:
        The fully formatted prompt string. No model call is made here.
    """
    def joined_or_na(field):
        # Newline-join a list-valued field, or fall back to the placeholder.
        items = product.get(field)
        return "\n".join(items) if items else "N/A"

    spec_groups = product.get("specifications")
    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": joined_or_na("key_features"),
        "specifications": format_specifications(spec_groups) if spec_groups else "N/A",
        "reviews": joined_or_na("reviews"),
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return faithfulness_prompt_template.format(**fields)

# Sampling parameters: nothing is configured in this cell. The loop below
# reuses the notebook-level `sampling_params` and `model` objects.


# Run the faithfulness evaluation over every loaded product, sending
# `batch_size` prompts per model.generate() call.
batch_size = 5
evaluation_results = []  # one formatted text entry per generated evaluation

for i in range(0, len(products), batch_size):
    batch_products = products[i:i+batch_size]

    # Prepare prompts for the current batch
    prompts = [evaluate_opinion_summary(product) for product in batch_products]

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # Process the outputs. outputs[j] is paired with batch_products[j], i.e.
    # results are assumed to come back in prompt order.
    for j, output_group in enumerate(outputs):
        product_title = batch_products[j].get("product_title", "")
        # A prompt can yield several completions (`n` in `sampling_params`);
        # each one is stored as its own numbered evaluation.
        for k, output in enumerate(output_group.outputs):
            generated_text = output.text
            evaluation_results.append(f"**{product_title} - Evaluation {k + 1}**\n\nResponse: {generated_text}\n")

    # Progress report: current batch number out of ceil(len(products) / batch_size).
    print(f"Processed batch {i // batch_size + 1} of {(len(products) - 1) // batch_size + 1}")

# Save the evaluation results to a text file. Everything is written in one go
# after the last batch; entries are joined with a newline.
with open("g_fa_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))


    

# Relevance

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The batch loop in this cell slices the result and calls .get() on its items,
# so it is presumably a list of product dicts -- verify against the data file.
# The file is decoded explicitly as UTF-8 so reading does not depend on the
# platform's locale encoding (the result files are written as UTF-8 too).
with open("g.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the relevance metric, wrapped in the
# <|start_header_id|>/<|eot_id|> chat markers used by the prompts in this
# notebook. It is filled with str.format(); placeholders are {product_title},
# {description}, {key_features}, {specifications}, {reviews},
# {average_rating} and {Product_Opinion_Summary}.
# NOTE(review): the prompt text contains typos (e.g. "idenify", "information.
# or contains"). They are left untouched here because editing the prompt
# changes what the judge model sees; fix them only together with a re-run.
relevance_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Relevance - Relevance measures the selection of important information from the input, including product title, description, key features, specifications, reviews, and average rating. The summary should include only important and relevant information from the input. Summaries should not contain redundancies or excess information.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary misses all the important opinions majorly discussed in the set of information.
<score>2</score> - The summary misses most of the important opinions majorly discussed in the set of information or mostly has redundant/excess/unimportant details
<score>3</score> - The summary covers around half of the important opinions majorly discussed in the set of information. or contains redundant/excess/unimportant details.
<score>4</score> - The summary covers most of the important opinions majorly discussed in the set of information and has very less amount of redundant/excess/unimportant details.
<score>5</score> - The summary covers all the important opinions majorly discussed in the set of information and has no redundant/excess/unimportant details.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify all the important opinions majorly discussed in the set of information and list them with numbering.
2. Identify the important opinions present in the summary and list them with numbering.
3. Next identify how many important opinions are present in both summary and the set of information and list them with numbering
4. Next idenify the how many redundant/excess/unimportant details does the summary have and list them with numbering.
5. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on relevance and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Render grouped specifications as plain text for the prompt.

    Args:
        specifications: Iterable of groups. Each group is indexed with
            ``'key'`` (the group heading) and ``'values'`` (an iterable of
            entries, each indexed with ``'key'`` and ``'value'``).

    Returns:
        One ``<heading>:`` line per group followed by a ``- <key>: <value>``
        line per entry; groups are separated by a blank line. An empty
        input gives an empty string.
    """
    def render_group(group):
        heading = f"{group['key']}:\n"
        entry_lines = [f"- {entry['key']}: {entry['value']}\n" for entry in group['values']]
        return heading + "".join(entry_lines)

    return "\n".join(render_group(group) for group in specifications)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the relevance-judge prompt for one product record.

    Despite its name, this does not call the model: it only fills
    ``relevance_prompt_template`` and returns the resulting string.

    Args:
        product: Mapping read with ``.get``. Keys used: ``product_title``,
            ``description``, ``key_features``, ``specifications``,
            ``reviews``, ``averageRating`` and ``product_opinion_summary``.

    Returns:
        The formatted prompt. Missing or empty description, key features,
        specifications and reviews are rendered as ``"N/A"``, as is a
        missing rating; a missing title or summary becomes ``""``.
    """
    feature_lines = product.get("key_features")
    spec_groups = product.get("specifications")
    review_texts = product.get("reviews")

    template_fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(feature_lines) if feature_lines else "N/A",
        "specifications": format_specifications(spec_groups) if spec_groups else "N/A",
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return relevance_prompt_template.format(**template_fields)

# Sampling parameters are not redefined here; the loop below uses the
# `sampling_params` object already in scope.


# Run the judge over the products in fixed-size batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_start in range(0, len(products), batch_size):
    batch_products = products[batch_start:batch_start + batch_size]

    # One prompt per product in the batch
    prompts = list(map(evaluate_opinion_summary, batch_products))
    outputs = model.generate(prompts, sampling_params)

    # Label every completion with its product title and 1-based sample number
    for batch_item, request_output in zip(batch_products, outputs):
        product_title = batch_item.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {batch_start // batch_size + 1} of {total_batches}")

# Write all labelled responses to a single text file, newline-separated
with open("g_re_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))




# Sentiment Consistency

In [None]:
import json

# Load the JSON file containing the product information and opinion summaries

# Decode the JSON input explicitly as UTF-8 instead of relying on the
# platform's locale encoding; the results file in this cell is written as UTF-8 too.
# `products` is sliced and each item is read with `.get` further down.
with open("g.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt for the sentiment-consistency judge, written in Llama 3 chat markup.
# It is filled with str.format using {reviews}, {average_rating} and
# {Product_Opinion_Summary}; any other literal brace would need doubling.
# The assistant header is terminated by a blank line ("\n\n"), as the Llama 3
# chat format expects, so the literal deliberately ends with one empty line.
sentiment_consistency_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description: 
You will be given a set of information such as reviews, and average rating and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Sentiment Consistency - Sentiment Consistency measures how accurately the summary reflects the consensus sentiment of users for each aspect of the product as expressed in the reviews. The consensus sentiment (or majority sentiment) for an aspect is determined by the most common sentiment expressed by users, categorized as very positive, positive, neutral, negative, or very negative.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - None of the aspects present in summary have the same majority sentiment as in reviews.
<score>2</score> - Very few of the aspects present in summary have the same majority sentiment as in reviews.
<score>3</score> - Only around half of the aspects present in summary have the same majority sentiment as in reviews.
<score>4</score> - Most of the aspects present in summary have the same majority sentiment as in reviews.
<score>5</score> - All aspects present in summary have the same majority sentiment as in reviews.


Product Reviews: {reviews}

Average Rating: {average_rating}

Summary to Evaluate: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the aspects and their sentiment present in the summary and list them with numbering.
2. For the list of aspects identified, identify the majority sentiment from the reviews and list them with numbering.
3. Next identify how many aspect and sentiment match between reviews and summary from above and list them with numbering.
4. Finally use the previous information to output only a single score within <score></score> tags only using the evaluation criteria.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on sentiment preservation of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''

# Function to evaluate a single opinion summary

def evaluate_opinion_summary(product):
    """Build the sentiment-consistency prompt for one product record.

    Despite its name, this does not call the model: it only fills
    ``sentiment_consistency_prompt_template`` and returns the string.

    Args:
        product: Mapping read with ``.get``. Keys used: ``reviews``,
            ``averageRating`` and ``product_opinion_summary``.

    Returns:
        The formatted prompt. Missing or empty reviews and a missing rating
        are rendered as ``"N/A"``; a missing summary becomes ``""``.
    """
    review_texts = product.get("reviews")
    return sentiment_consistency_prompt_template.format(
        reviews="\n".join(review_texts) if review_texts else "N/A",
        average_rating=product.get("averageRating", "N/A"),
        Product_Opinion_Summary=product.get("product_opinion_summary", ""),
    )


# Sampling parameters are not redefined here; the loop below uses the
# `sampling_params` object already in scope.


# Run the judge over the products in fixed-size batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_start in range(0, len(products), batch_size):
    batch_products = products[batch_start:batch_start + batch_size]

    # One prompt per product in the batch
    prompts = list(map(evaluate_opinion_summary, batch_products))
    outputs = model.generate(prompts, sampling_params)

    # Label every completion with the product's 1-based position in the
    # whole dataset and the 1-based sample number
    for product_number, request_output in enumerate(outputs, start=batch_start + 1):
        evaluation_results.extend(
            f"**Product {product_number} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {batch_start // batch_size + 1} of {total_batches}")

# Write all labelled responses to a single text file, newline-separated
with open("g_sc_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))

# Specificity

In [None]:

# Load the JSON file containing the product information and opinion summaries
# Decode the JSON input explicitly as UTF-8 instead of relying on the
# platform's locale encoding; the results file in this cell is written as UTF-8 too.
# `products` is sliced and each item is read with `.get` further down.
with open("g.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt for the specificity judge, written in Llama 3 chat markup.
# It is filled with str.format using {product_title}, {description},
# {key_features}, {specifications}, {reviews}, {average_rating} and
# {Product_Opinion_Summary}; any other literal brace would need doubling.
# NOTE(review): the task description contains a stray ".  and" ("average
# rating.  and a corresponding summary"); left as-is so prompts stay
# comparable with earlier runs — confirm whether it should be fixed.
specificity_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Generic Opinion example: The battery is good.
Specific Opinion example: The battery lasts for more than 12 hours on a single charge.

Specificity - Specificity measures the level of detail and precision in the information and opinions presented in the summary. A specific summary provides concrete facts, measurements, or detailed descriptions about the product's features, performance, and user experiences. It avoids vague or general statements and instead offers precise information that gives readers a clear and thorough understanding of the product's characteristics and performance. 

Summaries should be penalized for including vague or generic statements and rewarded for providing detailed, precise information about the product and user experiences.

<score>1</score> - All the opinions presented in the summary are generic.
<score>2</score> - Most of the opinions presented are generic.
<score>3</score> - Only around half of the opinions presented are specific.
<score>4</score> - Most of the opinions presented in the summary are specific. Very few opinions are generic.
<score>5</score> - All the opinions presented in the summary are specific 


Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all the opinions presented.
2. Check if details are presented for the opinions. Classify each opinion as specific or generic.
3. Count the number of generic and specific occurrences.
4. Finally use the previous information to output only a single score within <score></score> tags only using the evaluation criteria.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on specificity of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''

# Function to format specifications
def format_specifications(specifications):
    """Render grouped specifications as plain text for the prompt.

    Args:
        specifications: Iterable of groups. Each group is indexed with
            ``'key'`` (the group heading) and ``'values'`` (an iterable of
            entries, each indexed with ``'key'`` and ``'value'``).

    Returns:
        One ``<heading>:`` line per group followed by a ``- <key>: <value>``
        line per entry; groups are separated by a blank line. An empty
        input gives an empty string.
    """
    def render_group(group):
        heading = f"{group['key']}:\n"
        entry_lines = [f"- {entry['key']}: {entry['value']}\n" for entry in group['values']]
        return heading + "".join(entry_lines)

    return "\n".join(render_group(group) for group in specifications)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the specificity-judge prompt for one product record.

    Despite its name, this does not call the model: it only fills
    ``specificity_prompt_template`` and returns the resulting string.

    Args:
        product: Mapping read with ``.get``. Keys used: ``product_title``,
            ``description``, ``key_features``, ``specifications``,
            ``reviews``, ``averageRating`` and ``product_opinion_summary``.

    Returns:
        The formatted prompt. Missing or empty description, key features,
        specifications and reviews are rendered as ``"N/A"``, as is a
        missing rating; a missing title or summary becomes ``""``.
    """
    feature_lines = product.get("key_features")
    spec_groups = product.get("specifications")
    review_texts = product.get("reviews")

    template_fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(feature_lines) if feature_lines else "N/A",
        "specifications": format_specifications(spec_groups) if spec_groups else "N/A",
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return specificity_prompt_template.format(**template_fields)

# Sampling parameters are not redefined here; the loop below uses the
# `sampling_params` object already in scope.


# Run the judge over the products in fixed-size batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_start in range(0, len(products), batch_size):
    batch_products = products[batch_start:batch_start + batch_size]

    # One prompt per product in the batch
    prompts = list(map(evaluate_opinion_summary, batch_products))
    outputs = model.generate(prompts, sampling_params)

    # Label every completion with its product title and 1-based sample number
    for batch_item, request_output in zip(batch_products, outputs):
        product_title = batch_item.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {batch_start // batch_size + 1} of {total_batches}")

# Write all labelled responses to a single text file, newline-separated
with open("g_sp_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))
    

    

# Vicuna summaries

## Aspect Coverage

In [None]:

# Load the JSON file containing the product information and opinion summaries
# Decode the JSON input explicitly as UTF-8 instead of relying on the
# platform's locale encoding; the results file in this cell is written as UTF-8 too.
# `products` is sliced and each item is read with `.get` further down.
with open("v.json", "r", encoding="utf-8") as file:
    products = json.load(file)
    
# Prompt for the aspect-coverage judge, written in Llama 3 chat markup.
# It is filled with str.format using {product_title}, {description},
# {key_features}, {specifications}, {reviews}, {average_rating} and
# {Product_Opinion_Summary}; any other literal brace would need doubling.
# The judge is asked to finish with "Score- <score>N</score>".
aspect_coverage_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating. You will then be given one summary written for the set of information. Your task is to rate the summary on one metric. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Aspect Coverage - Aspect Coverage measures how completely a summary captures the major features, characteristics, or attributes of a product that are prominently discussed in the original product information. Summaries should be penalized for missing any major aspects and rewarded for covering all important aspects thoroughly.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - Summary does not cover any important aspects present in the set of information.
<score>2</score> - Summary does not cover most of the important aspects present in the set of information.
<score>3</score> - Summary covers around half of the important aspects present in the set of information.
<score>4</score> - Summary covers most of the important aspects present in the set of information.
<score>5</score> - Summary covers all the important aspects discussed in the set of information.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the important aspects present in the set of information and list them with numbering.
2. Identify the important aspects present in the summary and list them with numbering.
3. Identify the important aspects covered by the summary that are present in the set of information and list them with numbering.
4. Calculate the total number of important aspects covered by the summary that are present in the set of information.
5. Calculate the total number of important aspects present in the set of information.
6. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation of how much is the coverage and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''
# Function to format specifications
def format_specifications(specifications):
    """Render grouped specifications as plain text for the prompt.

    Args:
        specifications: Iterable of groups. Each group is indexed with
            ``'key'`` (the group heading) and ``'values'`` (an iterable of
            entries, each indexed with ``'key'`` and ``'value'``).

    Returns:
        One ``<heading>:`` line per group followed by a ``- <key>: <value>``
        line per entry; groups are separated by a blank line. An empty
        input gives an empty string.
    """
    def render_group(group):
        heading = f"{group['key']}:\n"
        entry_lines = [f"- {entry['key']}: {entry['value']}\n" for entry in group['values']]
        return heading + "".join(entry_lines)

    return "\n".join(render_group(group) for group in specifications)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the aspect-coverage-judge prompt for one product record.

    Despite its name, this does not call the model: it only fills
    ``aspect_coverage_prompt_template`` and returns the resulting string.

    Args:
        product: Mapping read with ``.get``. Keys used: ``product_title``,
            ``description``, ``key_features``, ``specifications``,
            ``reviews``, ``averageRating`` and ``product_opinion_summary``.

    Returns:
        The formatted prompt. Missing or empty description, key features,
        specifications and reviews are rendered as ``"N/A"``, as is a
        missing rating; a missing title or summary becomes ``""``.
    """
    feature_lines = product.get("key_features")
    spec_groups = product.get("specifications")
    review_texts = product.get("reviews")

    template_fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(feature_lines) if feature_lines else "N/A",
        "specifications": format_specifications(spec_groups) if spec_groups else "N/A",
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return aspect_coverage_prompt_template.format(**template_fields)

# Sampling parameters are not redefined here; the loop below uses the
# `sampling_params` object already in scope.


# Run the judge over the products in fixed-size batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_start in range(0, len(products), batch_size):
    batch_products = products[batch_start:batch_start + batch_size]

    # One prompt per product in the batch
    prompts = list(map(evaluate_opinion_summary, batch_products))
    outputs = model.generate(prompts, sampling_params)

    # Label every completion with its product title and 1-based sample number
    for batch_item, request_output in zip(batch_products, outputs):
        product_title = batch_item.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {batch_start // batch_size + 1} of {total_batches}")

# Write all labelled responses to a single text file, newline-separated
with open("v_ac_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))



# Fluency 

In [None]:

# Load the JSON file containing the product information and opinion summaries
# Decode the JSON input explicitly as UTF-8 instead of relying on the
# platform's locale encoding; the results file in this cell is written as UTF-8 too.
# `products` is sliced and each item is read with `.get` further down.
with open("v.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt for the fluency judge, written in Llama 3 chat markup.
# It is filled with str.format using {product_title}, {description},
# {key_features}, {specifications}, {reviews}, {average_rating} and
# {Product_Opinion_Summary}; any other literal brace would need doubling.
# NOTE(review): the literal ends with three newlines after the assistant
# header, whereas the Llama 3 chat format uses two — confirm this is intended.
# NOTE(review): the task description contains a stray ".  and"; left as-is
# so prompts stay comparable with earlier runs.
fluency_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Fluency : The quality of summary in terms of grammar, spelling, punctuation, capitalization, word choice, and sentence structure and should contain no errors. The summary should be easy to read, follow, comprehend and should contain no errors.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary is all garbled and does not make any sense.
<score>2</score> - The summary has grammatical errors that make it hard to understand or sound unnatural.
<score>3</score> - The summary has errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
<score>4</score> - The summary has very few errors, but it is easy to read, follow and comprehend.
<score>5</score> - The summary is extremely fluent and is easy to read, follow and comprehend.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the sentences presented in the summary and list them with numbering.
2. Go through each sentence and list down if there are any fluency problems.
3. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on fluency of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Render grouped specifications as plain text for the prompt.

    Args:
        specifications: Iterable of groups. Each group is indexed with
            ``'key'`` (the group heading) and ``'values'`` (an iterable of
            entries, each indexed with ``'key'`` and ``'value'``).

    Returns:
        One ``<heading>:`` line per group followed by a ``- <key>: <value>``
        line per entry; groups are separated by a blank line. An empty
        input gives an empty string.
    """
    def render_group(group):
        heading = f"{group['key']}:\n"
        entry_lines = [f"- {entry['key']}: {entry['value']}\n" for entry in group['values']]
        return heading + "".join(entry_lines)

    return "\n".join(render_group(group) for group in specifications)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the fluency-judge prompt for one product record.

    Despite its name, this does not call the model: it only fills
    ``fluency_prompt_template`` and returns the resulting string.

    Args:
        product: Mapping read with ``.get``. Keys used: ``product_title``,
            ``description``, ``key_features``, ``specifications``,
            ``reviews``, ``averageRating`` and ``product_opinion_summary``.

    Returns:
        The formatted prompt. Missing or empty description, key features,
        specifications and reviews are rendered as ``"N/A"``, as is a
        missing rating; a missing title or summary becomes ``""``.
    """
    feature_lines = product.get("key_features")
    spec_groups = product.get("specifications")
    review_texts = product.get("reviews")

    template_fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(feature_lines) if feature_lines else "N/A",
        "specifications": format_specifications(spec_groups) if spec_groups else "N/A",
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return fluency_prompt_template.format(**template_fields)

# Sampling parameters are not redefined here; the loop below uses the
# `sampling_params` object already in scope.


# Run the judge over the products in fixed-size batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_start in range(0, len(products), batch_size):
    batch_products = products[batch_start:batch_start + batch_size]

    # One prompt per product in the batch
    prompts = list(map(evaluate_opinion_summary, batch_products))
    outputs = model.generate(prompts, sampling_params)

    # Label every completion with its product title and 1-based sample number
    for batch_item, request_output in zip(batch_products, outputs):
        product_title = batch_item.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {batch_start // batch_size + 1} of {total_batches}")

# Write all labelled responses to a single text file, newline-separated
with open("v_fl_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))



# Coherence

In [None]:

# Load the JSON file containing the product information and opinion summaries
# Decode the JSON input explicitly as UTF-8 instead of relying on the
# platform's locale encoding; the results file in this cell is written as UTF-8 too.
# `products` is sliced and each item is read with `.get` further down.
with open("v.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt for the coherence judge, written in Llama 3 chat markup.
# It is filled with str.format using {product_title}, {description},
# {key_features}, {specifications}, {reviews}, {average_rating} and
# {Product_Opinion_Summary}; any other literal brace would need doubling.
# NOTE(review): the literal ends with three newlines after the assistant
# header, whereas the Llama 3 chat format uses two — confirm this is intended.
# NOTE(review): the task description contains a stray ".  and"; left as-is
# so prompts stay comparable with earlier runs.
coherence_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Coherence - The collective quality of all sentences. The summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to a coherent body of information.
Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary lacks structure and logical flow, resulting in disjointed ideas and significant inconsistencies, making it confusing and challenging to follow.
<score>2</score> - The summary attempts coherence but struggles with occasional lapses in logic, clarity issues, and insufficiently connected ideas, leading to a somewhat disjointed presentation.
<score>3</score> - The summary displays a reasonable level of coherence with a logical sequence, yet occasional disruptions in flow and clarity, requiring some improvements for a smoother transition between ideas.
<score>4</score> - The summary demonstrates strong coherence, maintaining a clear and organized flow with effective transitions and minimal inconsistencies, effectively conveying main points with clarity and precision.
<score>5</score> - The summary showcases exceptional coherence with a flawless logical flow, impeccable transitions, and consistent clarity, presenting information in an impeccably organized and easily comprehensible manner.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all pieces of information in the summary.
2. Check if everything is presented in a clear and logical order. Give a clear step-by-step explanation of what you found and what is lacking.
3. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation on coherence of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Render grouped specifications as plain text for the prompt.

    Args:
        specifications: Iterable of groups. Each group is indexed with
            ``'key'`` (the group heading) and ``'values'`` (an iterable of
            entries, each indexed with ``'key'`` and ``'value'``).

    Returns:
        One ``<heading>:`` line per group followed by a ``- <key>: <value>``
        line per entry; groups are separated by a blank line. An empty
        input gives an empty string.
    """
    def render_group(group):
        heading = f"{group['key']}:\n"
        entry_lines = [f"- {entry['key']}: {entry['value']}\n" for entry in group['values']]
        return heading + "".join(entry_lines)

    return "\n".join(render_group(group) for group in specifications)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the coherence-judge prompt for one product record.

    Despite its name, this does not call the model: it only fills
    ``coherence_prompt_template`` and returns the resulting string.

    Args:
        product: Mapping read with ``.get``. Keys used: ``product_title``,
            ``description``, ``key_features``, ``specifications``,
            ``reviews``, ``averageRating`` and ``product_opinion_summary``.

    Returns:
        The formatted prompt. Missing or empty description, key features,
        specifications and reviews are rendered as ``"N/A"``, as is a
        missing rating; a missing title or summary becomes ``""``.
    """
    feature_lines = product.get("key_features")
    spec_groups = product.get("specifications")
    review_texts = product.get("reviews")

    template_fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(feature_lines) if feature_lines else "N/A",
        "specifications": format_specifications(spec_groups) if spec_groups else "N/A",
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return coherence_prompt_template.format(**template_fields)

# Sampling parameters are not redefined here; the loop below uses the
# `sampling_params` object already in scope.


# Run the judge over the products in fixed-size batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_start in range(0, len(products), batch_size):
    batch_products = products[batch_start:batch_start + batch_size]

    # One prompt per product in the batch
    prompts = list(map(evaluate_opinion_summary, batch_products))
    outputs = model.generate(prompts, sampling_params)

    # Label every completion with its product title and 1-based sample number
    for batch_item, request_output in zip(batch_products, outputs):
        product_title = batch_item.get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {batch_start // batch_size + 1} of {total_batches}")

# Write all labelled responses to a single text file, newline-separated
with open("v_co_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))




# Faithfulness

In [None]:

# Load the JSON file containing the product information and opinion summaries
# Decode the JSON input explicitly as UTF-8 instead of relying on the
# platform's locale encoding; the results file in this cell is written as UTF-8 too.
# `products` is sliced and each item is read with `.get` further down.
with open("v.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt for the faithfulness judge, written in Llama 3 chat markup.
# It is filled with str.format using {product_title}, {description},
# {key_features}, {specifications}, {reviews}, {average_rating} and
# {Product_Opinion_Summary}; any other literal brace would need doubling.
# NOTE(review): the literal ends with three newlines after the assistant
# header, whereas the Llama 3 chat format uses two — confirm this is intended.
# NOTE(review): the text contains a stray ".  and" in the task description
# and a doubled "the the" in the score-4 line; left as-is so prompts stay
# comparable with earlier runs.
faithfulness_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Faithfulness - Faithfulness measures the extent to which every piece of information mentioned in the summary is verifiable, supported, present, or can be reasonably inferred from the input. The input includes product title, description, key features, specifications, reviews, and average rating. Summaries should be penalized if they contain information that cannot be verified from the provided input or if they make broad generalizations that are not supported by the input data.

Following are the scores and the evaluation criteria according to which scores must be assigned.

<score>1</score> - The summary is for a different product and is irrelevant/unrelated to the given set of information.
<score>2</score> - The summary contains very few facts actually verifiable/supported/present/inferred from the set of information and contains a lot of hallucinated facts.
<score>3</score> - The summary contains more than one piece of information that is not verifiable/present/inferred from the set of information.
<score>4</score> - The summary contains only one piece of information that is not verifiable/supported/present/inferred from the the set of information.
<score>5</score> - Every piece of information present in the summary is verifiable/supported/present/inferred from the set of information.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all pieces of information in the summary that is not verifiable/supported/present/inferred from the set of information. Give a clear step-by-step explanation of what you found.
2. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on faithfulness and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Render grouped specifications as plain text for the prompt.

    Args:
        specifications: Iterable of groups. Each group is indexed with
            ``'key'`` (the group heading) and ``'values'`` (an iterable of
            entries, each indexed with ``'key'`` and ``'value'``).

    Returns:
        One ``<heading>:`` line per group followed by a ``- <key>: <value>``
        line per entry; groups are separated by a blank line. An empty
        input gives an empty string.
    """
    def render_group(group):
        heading = f"{group['key']}:\n"
        entry_lines = [f"- {entry['key']}: {entry['value']}\n" for entry in group['values']]
        return heading + "".join(entry_lines)

    return "\n".join(render_group(group) for group in specifications)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the faithfulness-evaluation prompt for one product record.

    Missing or empty description, key features, specifications and reviews
    are rendered as "N/A"; a missing "averageRating" key also yields "N/A".
    Despite the name, nothing is scored here: the filled-in prompt string is
    returned for the caller to send to the model.
    """
    features = product.get("key_features")
    spec_groups = product.get("specifications")
    review_texts = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(features) if features else "N/A",
        "specifications": format_specifications(spec_groups) if spec_groups else "N/A",
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return faithfulness_prompt_template.format(**fields)

# No sampling parameters are configured here; the `sampling_params` object
# created in the vLLM init cell is reused as-is.

# Process products in batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for start in range(0, len(products), batch_size):
    batch_products = products[start:start + batch_size]

    # One prompt per product in the current batch
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # Each request output may carry several sampled completions; every one is
    # stored as its own entry, labelled with the product title.
    for position, request_output in enumerate(outputs):
        product_title = batch_products[position].get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {start // batch_size + 1} of {total_batches}")

# Save the evaluation results to a text file
with open("v_fa_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))


    

# Relevance

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default locale encoding (the results file of this cell is written as UTF-8 too).
with open("v.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt for the relevance metric, already wrapped in Llama 3 chat markers (a
# user turn followed by an opened assistant turn). The {placeholders} are
# filled in by evaluate_opinion_summary() through str.format.
# NOTE(review): the text already begins with <|begin_of_text|>; confirm that
# the vLLM tokenizer does not prepend a second BOS token.
relevance_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Relevance - Relevance measures the selection of important information from the input, including product title, description, key features, specifications, reviews, and average rating. The summary should include only important and relevant information from the input. Summaries should not contain redundancies or excess information.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary misses all the important opinions majorly discussed in the set of information.
<score>2</score> - The summary misses most of the important opinions majorly discussed in the set of information or mostly has redundant/excess/unimportant details
<score>3</score> - The summary covers around half of the important opinions majorly discussed in the set of information. or contains redundant/excess/unimportant details.
<score>4</score> - The summary covers most of the important opinions majorly discussed in the set of information and has very less amount of redundant/excess/unimportant details.
<score>5</score> - The summary covers all the important opinions majorly discussed in the set of information and has no redundant/excess/unimportant details.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify all the important opinions majorly discussed in the set of information and list them with numbering.
2. Identify the important opinions present in the summary and list them with numbering.
3. Next identify how many important opinions are present in both summary and the set of information and list them with numbering
4. Next idenify the how many redundant/excess/unimportant details does the summary have and list them with numbering.
5. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on relevance and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Render specification groups as plain text for insertion into a prompt.

    Each item of ``specifications`` is read through ``['key']`` (the group
    heading) and ``['values']`` (entries read through ``['key']`` and
    ``['value']``). A group becomes a ``"<heading>:"`` line followed by one
    ``"- <key>: <value>"`` line per entry; groups are joined with a newline,
    which leaves a blank line between consecutive groups.
    """
    rendered_groups = []
    for group in specifications:
        lines = [f"{group['key']}:\n"]
        lines.extend(f"- {entry['key']}: {entry['value']}\n" for entry in group['values'])
        rendered_groups.append("".join(lines))
    return "\n".join(rendered_groups)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the relevance-evaluation prompt for one product record.

    Missing or empty description, key features, specifications and reviews
    are rendered as "N/A"; a missing "averageRating" key also yields "N/A".
    Despite the name, nothing is scored here: the filled-in prompt string is
    returned for the caller to send to the model.
    """
    features = product.get("key_features")
    spec_groups = product.get("specifications")
    review_texts = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(features) if features else "N/A",
        "specifications": format_specifications(spec_groups) if spec_groups else "N/A",
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return relevance_prompt_template.format(**fields)

# No sampling parameters are configured here; the `sampling_params` object
# created in the vLLM init cell is reused as-is.

# Process products in batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for start in range(0, len(products), batch_size):
    batch_products = products[start:start + batch_size]

    # One prompt per product in the current batch
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # Each request output may carry several sampled completions; every one is
    # stored as its own entry, labelled with the product title.
    for position, request_output in enumerate(outputs):
        product_title = batch_products[position].get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {start // batch_size + 1} of {total_batches}")

# Save the evaluation results to a text file
with open("v_re_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))




# Sentiment Consistency

In [None]:
import json

# Load the JSON file containing the product information and opinion summaries.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default locale encoding (the results file of this cell is written as UTF-8 too).
with open("v.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt for the sentiment-consistency metric, already wrapped in Llama 3 chat
# markers (a user turn followed by an opened assistant turn). Only {reviews},
# {average_rating} and {Product_Opinion_Summary} are substituted; they are
# filled in by evaluate_opinion_summary() through str.format.
# NOTE(review): the text already begins with <|begin_of_text|>; confirm that
# the vLLM tokenizer does not prepend a second BOS token.
sentiment_consistency_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description: 
You will be given a set of information such as reviews, and average rating and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Sentiment Consistency - Sentiment Consistency measures how accurately the summary reflects the consensus sentiment of users for each aspect of the product as expressed in the reviews. The consensus sentiment (or majority sentiment) for an aspect is determined by the most common sentiment expressed by users, categorized as very positive, positive, neutral, negative, or very negative.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - None of the aspects present in summary have the same majority sentiment as in reviews.
<score>2</score> - Very few of the aspects present in summary have the same majority sentiment as in reviews.
<score>3</score> - Only around half of the aspects present in summary have the same majority sentiment as in reviews.
<score>4</score> - Most of the aspects present in summary have the same majority sentiment as in reviews.
<score>5</score> - All aspects present in summary have the same majority sentiment as in reviews.


Product Reviews: {reviews}

Average Rating: {average_rating}

Summary to Evaluate: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the aspects and their sentiment present in the summary and list them with numbering.
2. For the list of aspects identified, identify the majority sentiment from the reviews and list them with numbering.
3. Next identify how many aspect and sentiment match between reviews and summary from above and list them with numbering.
4. Finally use the previous information to output only a single score within <score></score> tags only using the evaluation criteria.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on sentiment preservation of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>
'''

# Function to evaluate a single opinion summary

def evaluate_opinion_summary(product):
    """Build the sentiment-consistency prompt for one product record.

    Only the reviews, the average rating and the opinion summary are used.
    Missing or empty reviews are rendered as "N/A", as is a missing
    "averageRating" key. The filled-in prompt string is returned; no model
    call happens here.
    """
    review_texts = product.get("reviews")

    fields = {
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return sentiment_consistency_prompt_template.format(**fields)


# No sampling parameters are configured here; the `sampling_params` object
# created in the vLLM init cell is reused as-is.

# Process products in batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for start in range(0, len(products), batch_size):
    batch_products = products[start:start + batch_size]

    # One prompt per product in the current batch
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # Each request output may carry several sampled completions; every one is
    # stored as its own entry, labelled with the product's 1-based position
    # in the input file.
    for position, request_output in enumerate(outputs):
        evaluation_results.extend(
            f"**Product {start + position + 1} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {start // batch_size + 1} of {total_batches}")

# Save the evaluation results to a text file
with open("v_sc_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))

# Specificity

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default locale encoding (the results file of this cell is written as UTF-8 too).
with open("v.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt for the specificity metric, already wrapped in Llama 3 chat markers (a
# user turn followed by an opened assistant turn). The {placeholders} are
# filled in by evaluate_opinion_summary() through str.format.
# NOTE(review): the text already begins with <|begin_of_text|>; confirm that
# the vLLM tokenizer does not prepend a second BOS token.
specificity_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Generic Opinion example: The battery is good.
Specific Opinion example: The battery lasts for more than 12 hours on a single charge.

Specificity - Specificity measures the level of detail and precision in the information and opinions presented in the summary. A specific summary provides concrete facts, measurements, or detailed descriptions about the product's features, performance, and user experiences. It avoids vague or general statements and instead offers precise information that gives readers a clear and thorough understanding of the product's characteristics and performance. 

Summaries should be penalized for including vague or generic statements and rewarded for providing detailed, precise information about the product and user experiences.

<score>1</score> - All the opinions presented in the summary are generic.
<score>2</score> - Most of the opinions presented are generic.
<score>3</score> - Only around half of the opinions presented are specific.
<score>4</score> - Most of the opinions presented in the summary are specific. Very few opinions are generic.
<score>5</score> - All the opinions presented in the summary are specific 


Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all the opinions presented.
2. Check if details are presented for the opinions. Classify each opinion as specific or generic.
3. Count the number of generic and specific occurrences.
4. Finally use the previous information to output only a single score within <score></score> tags only using the evaluation criteria.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on specificity of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Render specification groups as plain text for insertion into a prompt.

    Each item of ``specifications`` is read through ``['key']`` (the group
    heading) and ``['values']`` (entries read through ``['key']`` and
    ``['value']``). A group becomes a ``"<heading>:"`` line followed by one
    ``"- <key>: <value>"`` line per entry; groups are joined with a newline,
    which leaves a blank line between consecutive groups.
    """
    rendered_groups = []
    for group in specifications:
        lines = [f"{group['key']}:\n"]
        lines.extend(f"- {entry['key']}: {entry['value']}\n" for entry in group['values'])
        rendered_groups.append("".join(lines))
    return "\n".join(rendered_groups)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the specificity-evaluation prompt for one product record.

    Missing or empty description, key features, specifications and reviews
    are rendered as "N/A"; a missing "averageRating" key also yields "N/A".
    Despite the name, nothing is scored here: the filled-in prompt string is
    returned for the caller to send to the model.
    """
    features = product.get("key_features")
    spec_groups = product.get("specifications")
    review_texts = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(features) if features else "N/A",
        "specifications": format_specifications(spec_groups) if spec_groups else "N/A",
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return specificity_prompt_template.format(**fields)

# No sampling parameters are configured here; the `sampling_params` object
# created in the vLLM init cell is reused as-is.

# Process products in batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for start in range(0, len(products), batch_size):
    batch_products = products[start:start + batch_size]

    # One prompt per product in the current batch
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # Each request output may carry several sampled completions; every one is
    # stored as its own entry, labelled with the product title.
    for position, request_output in enumerate(outputs):
        product_title = batch_products[position].get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {start // batch_size + 1} of {total_batches}")

# Save the evaluation results to a text file
with open("v_sp_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))
    

    

# Zephyr summaries

## Aspect Coverage

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default locale encoding (the results file of this cell is written as UTF-8 too).
with open("z.json", "r", encoding="utf-8") as file:
    products = json.load(file)
    
# Prompt for the aspect-coverage metric, already wrapped in Llama 3 chat markers
# (a user turn followed by an opened assistant turn). The {placeholders} are
# filled in by evaluate_opinion_summary() through str.format.
# NOTE(review): the text already begins with <|begin_of_text|>; confirm that
# the vLLM tokenizer does not prepend a second BOS token.
aspect_coverage_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating. You will then be given one summary written for the set of information. Your task is to rate the summary on one metric. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Aspect Coverage - Aspect Coverage measures how completely a summary captures the major features, characteristics, or attributes of a product that are prominently discussed in the original product information. Summaries should be penalized for missing any major aspects and rewarded for covering all important aspects thoroughly.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - Summary does not cover any important aspects present in the set of information.
<score>2</score> - Summary does not cover most of the important aspects present in the set of information.
<score>3</score> - Summary covers around half of the important aspects present in the set of information.
<score>4</score> - Summary covers most of the important aspects present in the set of information.
<score>5</score> - Summary covers all the important aspects discussed in the set of information.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the important aspects present in the set of information and list them with numbering.
2. Identify the important aspects present in the summary and list them with numbering.
3. Identify the important aspects covered by the summary that are present in the set of information and list them with numbering.
4. Calculate the total number of important aspects covered by the summary that are present in the set of information.
5. Calculate the total number of important aspects present in the set of information.
6. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation of how much is the coverage and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''
# Function to format specifications
def format_specifications(specifications):
    """Render specification groups as plain text for insertion into a prompt.

    Each item of ``specifications`` is read through ``['key']`` (the group
    heading) and ``['values']`` (entries read through ``['key']`` and
    ``['value']``). A group becomes a ``"<heading>:"`` line followed by one
    ``"- <key>: <value>"`` line per entry; groups are joined with a newline,
    which leaves a blank line between consecutive groups.
    """
    rendered_groups = []
    for group in specifications:
        lines = [f"{group['key']}:\n"]
        lines.extend(f"- {entry['key']}: {entry['value']}\n" for entry in group['values'])
        rendered_groups.append("".join(lines))
    return "\n".join(rendered_groups)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the aspect-coverage evaluation prompt for one product record.

    Missing or empty description, key features, specifications and reviews
    are rendered as "N/A"; a missing "averageRating" key also yields "N/A".
    Despite the name, nothing is scored here: the filled-in prompt string is
    returned for the caller to send to the model.
    """
    features = product.get("key_features")
    spec_groups = product.get("specifications")
    review_texts = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(features) if features else "N/A",
        "specifications": format_specifications(spec_groups) if spec_groups else "N/A",
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return aspect_coverage_prompt_template.format(**fields)

# No sampling parameters are configured here; the `sampling_params` object
# created in the vLLM init cell is reused as-is.

# Process products in batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for start in range(0, len(products), batch_size):
    batch_products = products[start:start + batch_size]

    # One prompt per product in the current batch
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # Each request output may carry several sampled completions; every one is
    # stored as its own entry, labelled with the product title.
    for position, request_output in enumerate(outputs):
        product_title = batch_products[position].get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {start // batch_size + 1} of {total_batches}")

# Save the evaluation results to a text file
with open("z_ac_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))



# Fluency 

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default locale encoding (the results file of this cell is written as UTF-8 too).
with open("z.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt for the fluency metric, already wrapped in Llama 3 chat markers (a
# user turn followed by an opened assistant turn). The {placeholders} are
# filled in by evaluate_opinion_summary() through str.format.
# NOTE(review): the text already begins with <|begin_of_text|>; confirm that
# the vLLM tokenizer does not prepend a second BOS token.
fluency_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Fluency : The quality of summary in terms of grammar, spelling, punctuation, capitalization, word choice, and sentence structure and should contain no errors. The summary should be easy to read, follow, comprehend and should contain no errors.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary is all garbled and does not make any sense.
<score>2</score> - The summary has grammatical errors that make it hard to understand or sound unnatural.
<score>3</score> - The summary has errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
<score>4</score> - The summary has very few errors, but it is easy to read, follow and comprehend.
<score>5</score> - The summary is extremely fluent and is easy to read, follow and comprehend.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the sentences presented in the summary and list them with numbering.
2. Go through each sentence and list down if there are any fluency problems.
3. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on fluency of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''

# Function to format specifications
def format_specifications(specifications):
    """Render specification groups as plain text for insertion into a prompt.

    Each item of ``specifications`` is read through ``['key']`` (the group
    heading) and ``['values']`` (entries read through ``['key']`` and
    ``['value']``). A group becomes a ``"<heading>:"`` line followed by one
    ``"- <key>: <value>"`` line per entry; groups are joined with a newline,
    which leaves a blank line between consecutive groups.
    """
    rendered_groups = []
    for group in specifications:
        lines = [f"{group['key']}:\n"]
        lines.extend(f"- {entry['key']}: {entry['value']}\n" for entry in group['values'])
        rendered_groups.append("".join(lines))
    return "\n".join(rendered_groups)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the fluency-evaluation prompt for one product record.

    Missing or empty description, key features, specifications and reviews
    are rendered as "N/A"; a missing "averageRating" key also yields "N/A".
    Despite the name, nothing is scored here: the filled-in prompt string is
    returned for the caller to send to the model.
    """
    features = product.get("key_features")
    spec_groups = product.get("specifications")
    review_texts = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or "N/A",
        "key_features": "\n".join(features) if features else "N/A",
        "specifications": format_specifications(spec_groups) if spec_groups else "N/A",
        "reviews": "\n".join(review_texts) if review_texts else "N/A",
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return fluency_prompt_template.format(**fields)

# No sampling parameters are configured here; the `sampling_params` object
# created in the vLLM init cell is reused as-is.

# Process products in batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for start in range(0, len(products), batch_size):
    batch_products = products[start:start + batch_size]

    # One prompt per product in the current batch
    prompts = list(map(evaluate_opinion_summary, batch_products))

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # Each request output may carry several sampled completions; every one is
    # stored as its own entry, labelled with the product title.
    for position, request_output in enumerate(outputs):
        product_title = batch_products[position].get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {start // batch_size + 1} of {total_batches}")

# Save the evaluation results to a text file
with open("z_fl_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))



# Coherence

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default locale encoding (the results file of this cell is written as UTF-8 too).
with open("z.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt for the coherence metric, already wrapped in Llama 3 chat markers (a
# user turn followed by an opened assistant turn). The {placeholders} are
# filled in by evaluate_opinion_summary() through str.format.
# NOTE(review): the text already begins with <|begin_of_text|>; confirm that
# the vLLM tokenizer does not prepend a second BOS token.
coherence_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Coherence - The collective quality of all sentences. The summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to a coherent body of information.
Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary lacks structure and logical flow, resulting in disjointed ideas and significant inconsistencies, making it confusing and challenging to follow.
<score>2</score> - The summary attempts coherence but struggles with occasional lapses in logic, clarity issues, and insufficiently connected ideas, leading to a somewhat disjointed presentation.
<score>3</score> - The summary displays a reasonable level of coherence with a logical sequence, yet occasional disruptions in flow and clarity, requiring some improvements for a smoother transition between ideas.
<score>4</score> - The summary demonstrates strong coherence, maintaining a clear and organized flow with effective transitions and minimal inconsistencies, effectively conveying main points with clarity and precision.
<score>5</score> - The summary showcases exceptional coherence with a flawless logical flow, impeccable transitions, and consistent clarity, presenting information in an impeccably organized and easily comprehensible manner.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all pieces of information in the summary.
2. Check if everything is presented in a clear and logical order. Give a clear step-by-step explanation of what you found and what is lacking.
3. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation on coherence of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Render specification groups as plain text for insertion into a prompt.

    Each item of ``specifications`` is read through ``['key']`` (the group
    heading) and ``['values']`` (entries read through ``['key']`` and
    ``['value']``). A group becomes a ``"<heading>:"`` line followed by one
    ``"- <key>: <value>"`` line per entry; groups are joined with a newline,
    which leaves a blank line between consecutive groups.
    """
    rendered_groups = []
    for group in specifications:
        lines = [f"{group['key']}:\n"]
        lines.extend(f"- {entry['key']}: {entry['value']}\n" for entry in group['values'])
        rendered_groups.append("".join(lines))
    return "\n".join(rendered_groups)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the coherence-judging prompt for one product record.

    Despite the name, no model call happens here: the function only fills
    coherence_prompt_template with fields read from ``product``. Missing or
    empty description, key features, specifications and reviews are rendered
    as "N/A"; a missing title or summary is rendered as an empty string.

    Returns the formatted prompt string.
    """
    placeholder = "N/A"
    feature_list = product.get("key_features")
    spec_list = product.get("specifications")
    review_list = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or placeholder,
        "key_features": "\n".join(feature_list) if feature_list else placeholder,
        "specifications": format_specifications(spec_list) if spec_list else placeholder,
        "reviews": "\n".join(review_list) if review_list else placeholder,
        "average_rating": product.get("averageRating", placeholder),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return coherence_prompt_template.format(**fields)

# Set up sampling parameters


# Run the judge model over every product, batch_size prompts per generate() call
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_number, i in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[i:i + batch_size]

    # One prompt per product in the current slice
    prompts = list(map(evaluate_opinion_summary, batch_products))
    outputs = model.generate(prompts, sampling_params)

    # outputs[j] is paired with batch_products[j]; every sampled completion
    # becomes one labelled text entry
    for j, output_group in enumerate(outputs):
        product_title = batch_products[j].get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {k}**\n\nResponse: {completion.text}\n"
            for k, completion in enumerate(output_group.outputs, start=1)
        )

    print(f"Processed batch {batch_number} of {total_batches}")

# Write all collected evaluations to disk in one pass
with open("z_co_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))




# Faithfulness

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is given explicitly so decoding does not depend on the platform
# locale (the results file written by this cell is UTF-8 as well).
with open("z.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the Faithfulness metric, written in Llama 3 chat markup
# (a single user turn followed by an opened assistant header for the model to
# complete). The {placeholders} are filled by evaluate_opinion_summary() via
# str.format, and the model is asked to end with a 1-5 score in <score> tags.
# NOTE(review): the literal <|begin_of_text|> may end up duplicated if the
# tokenizer used by model.generate() also prepends a BOS token — confirm.
faithfulness_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Faithfulness - Faithfulness measures the extent to which every piece of information mentioned in the summary is verifiable, supported, present, or can be reasonably inferred from the input. The input includes product title, description, key features, specifications, reviews, and average rating. Summaries should be penalized if they contain information that cannot be verified from the provided input or if they make broad generalizations that are not supported by the input data.

Following are the scores and the evaluation criteria according to which scores must be assigned.

<score>1</score> - The summary is for a different product and is irrelevant/unrelated to the given set of information.
<score>2</score> - The summary contains very few facts actually verifiable/supported/present/inferred from the set of information and contains a lot of hallucinated facts.
<score>3</score> - The summary contains more than one piece of information that is not verifiable/present/inferred from the set of information.
<score>4</score> - The summary contains only one piece of information that is not verifiable/supported/present/inferred from the the set of information.
<score>5</score> - Every piece of information present in the summary is verifiable/supported/present/inferred from the set of information.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all pieces of information in the summary that is not verifiable/supported/present/inferred from the set of information. Give a clear step-by-step explanation of what you found.
2. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on faithfulness and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''

# Function to format specifications
def format_specifications(specifications):
    """Turn a list of specification groups into prompt-ready text.

    Every group supplies a heading under 'key' and a list of entries under
    'values'; each entry supplies 'key' and 'value'. The result has one
    "<heading>:" line per group, one "- <key>: <value>" line per entry, and a
    blank line between consecutive groups.
    """
    blocks = []
    for group in specifications:
        lines = [f"{group['key']}:"]
        lines.extend(f"- {item['key']}: {item['value']}" for item in group['values'])
        # Each group is newline-terminated so the join below yields a blank line
        blocks.append("\n".join(lines) + "\n")
    return "\n".join(blocks)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the faithfulness-judging prompt for one product record.

    Despite the name, no model call happens here: the function only fills
    faithfulness_prompt_template with fields read from ``product``. Missing or
    empty description, key features, specifications and reviews are rendered
    as "N/A"; a missing title or summary is rendered as an empty string.

    Returns the formatted prompt string.
    """
    placeholder = "N/A"
    feature_list = product.get("key_features")
    spec_list = product.get("specifications")
    review_list = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or placeholder,
        "key_features": "\n".join(feature_list) if feature_list else placeholder,
        "specifications": format_specifications(spec_list) if spec_list else placeholder,
        "reviews": "\n".join(review_list) if review_list else placeholder,
        "average_rating": product.get("averageRating", placeholder),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return faithfulness_prompt_template.format(**fields)

# Set up sampling parameters


# Run the judge model over every product, batch_size prompts per generate() call
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_number, i in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[i:i + batch_size]

    # One prompt per product in the current slice
    prompts = list(map(evaluate_opinion_summary, batch_products))
    outputs = model.generate(prompts, sampling_params)

    # outputs[j] is paired with batch_products[j]; every sampled completion
    # becomes one labelled text entry
    for j, output_group in enumerate(outputs):
        product_title = batch_products[j].get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {k}**\n\nResponse: {completion.text}\n"
            for k, completion in enumerate(output_group.outputs, start=1)
        )

    print(f"Processed batch {batch_number} of {total_batches}")

# Write all collected evaluations to disk in one pass
with open("z_fa_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))


    

# Relevance

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is given explicitly so decoding does not depend on the platform
# locale (the results file written by this cell is UTF-8 as well).
with open("z.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the Relevance metric, written in Llama 3 chat markup
# (a single user turn followed by an opened assistant header for the model to
# complete). The {placeholders} are filled by evaluate_opinion_summary() via
# str.format, and the model is asked to end with a 1-5 score in <score> tags.
# NOTE(review): the literal <|begin_of_text|> may end up duplicated if the
# tokenizer used by model.generate() also prepends a BOS token — confirm.
relevance_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Relevance - Relevance measures the selection of important information from the input, including product title, description, key features, specifications, reviews, and average rating. The summary should include only important and relevant information from the input. Summaries should not contain redundancies or excess information.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary misses all the important opinions majorly discussed in the set of information.
<score>2</score> - The summary misses most of the important opinions majorly discussed in the set of information or mostly has redundant/excess/unimportant details
<score>3</score> - The summary covers around half of the important opinions majorly discussed in the set of information. or contains redundant/excess/unimportant details.
<score>4</score> - The summary covers most of the important opinions majorly discussed in the set of information and has very less amount of redundant/excess/unimportant details.
<score>5</score> - The summary covers all the important opinions majorly discussed in the set of information and has no redundant/excess/unimportant details.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify all the important opinions majorly discussed in the set of information and list them with numbering.
2. Identify the important opinions present in the summary and list them with numbering.
3. Next identify how many important opinions are present in both summary and the set of information and list them with numbering
4. Next idenify the how many redundant/excess/unimportant details does the summary have and list them with numbering.
5. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on relevance and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''

# Function to format specifications
def format_specifications(specifications):
    """Render specification groups as plain text for insertion into a prompt.

    Each group is a mapping with a 'key' (the group heading) and 'values'
    (a list of mappings that each carry 'key' and 'value'). A group becomes a
    "<heading>:" line followed by one "- <key>: <value>" line per entry, and
    groups are separated by a blank line.
    """
    def render_group(group):
        heading = f"{group['key']}:\n"
        entries = [f"- {item['key']}: {item['value']}\n" for item in group['values']]
        return heading + "".join(entries)

    return "\n".join(render_group(group) for group in specifications)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the relevance-judging prompt for one product record.

    Despite the name, no model call happens here: the function only fills
    relevance_prompt_template with fields read from ``product``. Missing or
    empty description, key features, specifications and reviews are rendered
    as "N/A"; a missing title or summary is rendered as an empty string.

    Returns the formatted prompt string.
    """
    placeholder = "N/A"
    feature_list = product.get("key_features")
    spec_list = product.get("specifications")
    review_list = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or placeholder,
        "key_features": "\n".join(feature_list) if feature_list else placeholder,
        "specifications": format_specifications(spec_list) if spec_list else placeholder,
        "reviews": "\n".join(review_list) if review_list else placeholder,
        "average_rating": product.get("averageRating", placeholder),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return relevance_prompt_template.format(**fields)

# Set up sampling parameters


# Run the judge model over every product, batch_size prompts per generate() call
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_number, i in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[i:i + batch_size]

    # One prompt per product in the current slice
    prompts = list(map(evaluate_opinion_summary, batch_products))
    outputs = model.generate(prompts, sampling_params)

    # outputs[j] is paired with batch_products[j]; every sampled completion
    # becomes one labelled text entry
    for j, output_group in enumerate(outputs):
        product_title = batch_products[j].get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {k}**\n\nResponse: {completion.text}\n"
            for k, completion in enumerate(output_group.outputs, start=1)
        )

    print(f"Processed batch {batch_number} of {total_batches}")

# Write all collected evaluations to disk in one pass
with open("z_re_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))




# Sentiment Consistency

In [None]:
import json

# Load the JSON file containing the product information and opinion summaries.
# The encoding is given explicitly so decoding does not depend on the platform
# locale (the results file written by this cell is UTF-8 as well).
with open("z.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the Sentiment Consistency metric, written in Llama 3 chat
# markup (a single user turn followed by an opened assistant header for the
# model to complete). Only {reviews}, {average_rating} and
# {Product_Opinion_Summary} are substituted, by evaluate_opinion_summary() via
# str.format; the model is asked to end with a 1-5 score in <score> tags.
# NOTE(review): the literal <|begin_of_text|> may end up duplicated if the
# tokenizer used by model.generate() also prepends a BOS token — confirm.
# NOTE(review): this template ends with a single newline after the assistant
# header, whereas the other metric templates in this file end with at least a
# blank line — confirm the difference is intended.
sentiment_consistency_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description: 
You will be given a set of information such as reviews, and average rating and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Sentiment Consistency - Sentiment Consistency measures how accurately the summary reflects the consensus sentiment of users for each aspect of the product as expressed in the reviews. The consensus sentiment (or majority sentiment) for an aspect is determined by the most common sentiment expressed by users, categorized as very positive, positive, neutral, negative, or very negative.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - None of the aspects present in summary have the same majority sentiment as in reviews.
<score>2</score> - Very few of the aspects present in summary have the same majority sentiment as in reviews.
<score>3</score> - Only around half of the aspects present in summary have the same majority sentiment as in reviews.
<score>4</score> - Most of the aspects present in summary have the same majority sentiment as in reviews.
<score>5</score> - All aspects present in summary have the same majority sentiment as in reviews.


Product Reviews: {reviews}

Average Rating: {average_rating}

Summary to Evaluate: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the aspects and their sentiment present in the summary and list them with numbering.
2. For the list of aspects identified, identify the majority sentiment from the reviews and list them with numbering.
3. Next identify how many aspect and sentiment match between reviews and summary from above and list them with numbering.
4. Finally use the previous information to output only a single score within <score></score> tags only using the evaluation criteria.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on sentiment preservation of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>
'''

# Function to evaluate a single opinion summary

def evaluate_opinion_summary(product):
    """Build the sentiment-consistency prompt for one product record.

    Despite the name, no model call happens here: the function only fills
    sentiment_consistency_prompt_template with the reviews, average rating and
    summary read from ``product``. Missing or empty reviews and a missing
    rating are rendered as "N/A"; a missing summary is rendered as an empty
    string.

    Returns the formatted prompt string.
    """
    review_list = product.get("reviews")
    return sentiment_consistency_prompt_template.format(
        reviews="\n".join(review_list) if review_list else "N/A",
        average_rating=product.get("averageRating", "N/A"),
        Product_Opinion_Summary=product.get("product_opinion_summary", ""),
    )


# Set up sampling parameters



# Run the judge model over every product, batch_size prompts per generate() call
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_number, i in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[i:i + batch_size]

    # One prompt per product in the current slice
    prompts = list(map(evaluate_opinion_summary, batch_products))
    outputs = model.generate(prompts, sampling_params)

    # Entries are labelled with the product's 1-based position in `products`
    # rather than its title
    for product_number, output_group in enumerate(outputs, start=i + 1):
        evaluation_results.extend(
            f"**Product {product_number} - Evaluation {k}**\n\nResponse: {completion.text}\n"
            for k, completion in enumerate(output_group.outputs, start=1)
        )

    print(f"Processed batch {batch_number} of {total_batches}")

# Write all collected evaluations to disk in one pass
with open("z_sc_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))

# Specificity

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is given explicitly so decoding does not depend on the platform
# locale (the results file written by this cell is UTF-8 as well).
with open("z.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the Specificity metric, written in Llama 3 chat markup
# (a single user turn followed by an opened assistant header for the model to
# complete). The {placeholders} are filled by evaluate_opinion_summary() via
# str.format, and the model is asked to end with a 1-5 score in <score> tags.
# NOTE(review): the literal <|begin_of_text|> may end up duplicated if the
# tokenizer used by model.generate() also prepends a BOS token — confirm.
# NOTE(review): the string ends with two blank lines after the assistant
# header, one more than some other templates in this file — confirm intended.
specificity_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Generic Opinion example: The battery is good.
Specific Opinion example: The battery lasts for more than 12 hours on a single charge.

Specificity - Specificity measures the level of detail and precision in the information and opinions presented in the summary. A specific summary provides concrete facts, measurements, or detailed descriptions about the product's features, performance, and user experiences. It avoids vague or general statements and instead offers precise information that gives readers a clear and thorough understanding of the product's characteristics and performance. 

Summaries should be penalized for including vague or generic statements and rewarded for providing detailed, precise information about the product and user experiences.

<score>1</score> - All the opinions presented in the summary are generic.
<score>2</score> - Most of the opinions presented are generic.
<score>3</score> - Only around half of the opinions presented are specific.
<score>4</score> - Most of the opinions presented in the summary are specific. Very few opinions are generic.
<score>5</score> - All the opinions presented in the summary are specific 


Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all the opinions presented.
2. Check if details are presented for the opinions. Classify each opinion as specific or generic.
3. Count the number of generic and specific occurrences.
4. Finally use the previous information to output only a single score within <score></score> tags only using the evaluation criteria.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on specificity of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Turn a list of specification groups into prompt-ready text.

    Every group supplies a heading under 'key' and a list of entries under
    'values'; each entry supplies 'key' and 'value'. The result has one
    "<heading>:" line per group, one "- <key>: <value>" line per entry, and a
    blank line between consecutive groups.
    """
    blocks = []
    for group in specifications:
        lines = [f"{group['key']}:"]
        lines.extend(f"- {item['key']}: {item['value']}" for item in group['values'])
        # Each group is newline-terminated so the join below yields a blank line
        blocks.append("\n".join(lines) + "\n")
    return "\n".join(blocks)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the specificity-judging prompt for one product record.

    Despite the name, no model call happens here: the function only fills
    specificity_prompt_template with fields read from ``product``. Missing or
    empty description, key features, specifications and reviews are rendered
    as "N/A"; a missing title or summary is rendered as an empty string.

    Returns the formatted prompt string.
    """
    placeholder = "N/A"
    feature_list = product.get("key_features")
    spec_list = product.get("specifications")
    review_list = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or placeholder,
        "key_features": "\n".join(feature_list) if feature_list else placeholder,
        "specifications": format_specifications(spec_list) if spec_list else placeholder,
        "reviews": "\n".join(review_list) if review_list else placeholder,
        "average_rating": product.get("averageRating", placeholder),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return specificity_prompt_template.format(**fields)

# Set up sampling parameters


# Run the judge model over every product, batch_size prompts per generate() call
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_number, i in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[i:i + batch_size]

    # One prompt per product in the current slice
    prompts = list(map(evaluate_opinion_summary, batch_products))
    outputs = model.generate(prompts, sampling_params)

    # outputs[j] is paired with batch_products[j]; every sampled completion
    # becomes one labelled text entry
    for j, output_group in enumerate(outputs):
        product_title = batch_products[j].get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {k}**\n\nResponse: {completion.text}\n"
            for k, completion in enumerate(output_group.outputs, start=1)
        )

    print(f"Processed batch {batch_number} of {total_batches}")

# Write all collected evaluations to disk in one pass
with open("z_sp_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))
    

    

# GPT 4o summaries

## Aspect Coverage

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is given explicitly so decoding does not depend on the platform
# locale (the results file written by this cell is UTF-8 as well).
with open("4o.json", "r", encoding="utf-8") as file:
    products = json.load(file)
    
# Judge prompt for the Aspect Coverage metric, written in Llama 3 chat markup
# (a single user turn followed by an opened assistant header for the model to
# complete). The {placeholders} are filled by evaluate_opinion_summary() via
# str.format, and the model is asked to end with a 1-5 score in <score> tags.
# NOTE(review): the literal <|begin_of_text|> may end up duplicated if the
# tokenizer used by model.generate() also prepends a BOS token — confirm.
aspect_coverage_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating. You will then be given one summary written for the set of information. Your task is to rate the summary on one metric. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Aspect Coverage - Aspect Coverage measures how completely a summary captures the major features, characteristics, or attributes of a product that are prominently discussed in the original product information. Summaries should be penalized for missing any major aspects and rewarded for covering all important aspects thoroughly.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - Summary does not cover any important aspects present in the set of information.
<score>2</score> - Summary does not cover most of the important aspects present in the set of information.
<score>3</score> - Summary covers around half of the important aspects present in the set of information.
<score>4</score> - Summary covers most of the important aspects present in the set of information.
<score>5</score> - Summary covers all the important aspects discussed in the set of information.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the important aspects present in the set of information and list them with numbering.
2. Identify the important aspects present in the summary and list them with numbering.
3. Identify the important aspects covered by the summary that are present in the set of information and list them with numbering.
4. Calculate the total number of important aspects covered by the summary that are present in the set of information.
5. Calculate the total number of important aspects present in the set of information.
6. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation of how much is the coverage and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''
# Function to format specifications
def format_specifications(specifications):
    """Render specification groups as plain text for insertion into a prompt.

    Each group is a mapping with a 'key' (the group heading) and 'values'
    (a list of mappings that each carry 'key' and 'value'). A group becomes a
    "<heading>:" line followed by one "- <key>: <value>" line per entry, and
    groups are separated by a blank line.
    """
    def render_group(group):
        heading = f"{group['key']}:\n"
        entries = [f"- {item['key']}: {item['value']}\n" for item in group['values']]
        return heading + "".join(entries)

    return "\n".join(render_group(group) for group in specifications)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the aspect-coverage-judging prompt for one product record.

    Despite the name, no model call happens here: the function only fills
    aspect_coverage_prompt_template with fields read from ``product``. Missing
    or empty description, key features, specifications and reviews are
    rendered as "N/A"; a missing title or summary is rendered as an empty
    string.

    Returns the formatted prompt string.
    """
    placeholder = "N/A"
    feature_list = product.get("key_features")
    spec_list = product.get("specifications")
    review_list = product.get("reviews")

    fields = {
        "product_title": product.get("product_title", ""),
        "description": product.get("description") or placeholder,
        "key_features": "\n".join(feature_list) if feature_list else placeholder,
        "specifications": format_specifications(spec_list) if spec_list else placeholder,
        "reviews": "\n".join(review_list) if review_list else placeholder,
        "average_rating": product.get("averageRating", placeholder),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return aspect_coverage_prompt_template.format(**fields)

# Set up sampling parameters


# Run the judge model over every product, batch_size prompts per generate() call
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_number, i in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[i:i + batch_size]

    # One prompt per product in the current slice
    prompts = list(map(evaluate_opinion_summary, batch_products))
    outputs = model.generate(prompts, sampling_params)

    # outputs[j] is paired with batch_products[j]; every sampled completion
    # becomes one labelled text entry
    for j, output_group in enumerate(outputs):
        product_title = batch_products[j].get("product_title", "")
        evaluation_results.extend(
            f"**{product_title} - Evaluation {k}**\n\nResponse: {completion.text}\n"
            for k, completion in enumerate(output_group.outputs, start=1)
        )

    print(f"Processed batch {batch_number} of {total_batches}")

# Write all collected evaluations to disk in one pass
with open("4o_ac_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))



# Fluency 

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is given explicitly so decoding does not depend on the platform
# locale (the results file written by this cell is UTF-8 as well).
with open("4o.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Judge prompt for the Fluency metric, written in Llama 3 chat markup
# (a single user turn followed by an opened assistant header for the model to
# complete). The {placeholders} are filled by evaluate_opinion_summary() via
# str.format, and the model is asked to end with a 1-5 score in <score> tags.
# NOTE(review): the literal <|begin_of_text|> may end up duplicated if the
# tokenizer used by model.generate() also prepends a BOS token — confirm.
# NOTE(review): the string ends with two blank lines after the assistant
# header, one more than some other templates in this file — confirm intended.
fluency_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Fluency : The quality of summary in terms of grammar, spelling, punctuation, capitalization, word choice, and sentence structure and should contain no errors. The summary should be easy to read, follow, comprehend and should contain no errors.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary is all garbled and does not make any sense.
<score>2</score> - The summary has grammatical errors that make it hard to understand or sound unnatural.
<score>3</score> - The summary has errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
<score>4</score> - The summary has very few errors, but it is easy to read, follow and comprehend.
<score>5</score> - The summary is extremely fluent and is easy to read, follow and comprehend.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the sentences presented in the summary and list them with numbering.
2. Go through each sentence and list down if there are any fluency problems.
3. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on fluency of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Turn a list of specification groups into prompt-ready text.

    Every group supplies a heading under 'key' and a list of entries under
    'values'; each entry supplies 'key' and 'value'. The result has one
    "<heading>:" line per group, one "- <key>: <value>" line per entry, and a
    blank line between consecutive groups.
    """
    blocks = []
    for group in specifications:
        lines = [f"{group['key']}:"]
        lines.extend(f"- {item['key']}: {item['value']}" for item in group['values'])
        # Each group is newline-terminated so the join below yields a blank line
        blocks.append("\n".join(lines) + "\n")
    return "\n".join(blocks)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the fluency-evaluation prompt for one product record.

    `product` is read with .get(). An absent or empty description,
    key-feature list, specification list or review list is rendered as
    "N/A"; list fields are otherwise newline-joined (specifications go
    through format_specifications). The title and summary default to an
    empty string, and the average rating falls back to "N/A" only when the
    "averageRating" key is absent.

    Returns fluency_prompt_template with all placeholders filled in.
    """
    raw_description = product.get("description")
    raw_features = product.get("key_features")
    raw_specs = product.get("specifications")
    raw_reviews = product.get("reviews")

    return fluency_prompt_template.format(
        product_title=product.get("product_title", ""),
        description=raw_description if raw_description else "N/A",
        key_features="\n".join(raw_features) if raw_features else "N/A",
        specifications=format_specifications(raw_specs) if raw_specs else "N/A",
        reviews="\n".join(raw_reviews) if raw_reviews else "N/A",
        average_rating=product.get("averageRating", "N/A"),
        Product_Opinion_Summary=product.get("product_opinion_summary", ""),
    )

# Sampling parameters: reuses the global `sampling_params` created in the "Init vLLM" cell


# Process products in batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_index, offset in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[offset:offset + batch_size]

    # Build one prompt per product, then generate for the whole batch in one call
    prompts = [evaluate_opinion_summary(item) for item in batch_products]
    outputs = model.generate(prompts, sampling_params)

    # outputs[pos] is paired with batch_products[pos]; every completion under
    # .outputs becomes its own numbered entry in the results
    for pos, request_output in enumerate(outputs):
        title = batch_products[pos].get("product_title", "")
        evaluation_results.extend(
            f"**{title} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {batch_index} of {total_batches}")

# Save the evaluation results to a text file
with open("4o_fl_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))



# Coherence

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding (results are written back as UTF-8 too).
with open("4o.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt template for the coherence metric, written with Llama-3-style chat
# header tokens: a single user turn followed by an open assistant turn.
# The placeholders {product_title}, {description}, {key_features},
# {specifications}, {reviews}, {average_rating} and {Product_Opinion_Summary}
# are filled in with str.format() by evaluate_opinion_summary.
# The model is asked to explain its reasoning first and then emit a 1-5 score
# inside <score></score> tags.
# NOTE(review): the template hard-codes <|begin_of_text|>; if the tokenizer
# used by model.generate() also prepends a BOS token, the prompt would start
# with two of them - TODO confirm.
coherence_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Coherence - The collective quality of all sentences. The summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to a coherent body of information.
Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary lacks structure and logical flow, resulting in disjointed ideas and significant inconsistencies, making it confusing and challenging to follow.
<score>2</score> - The summary attempts coherence but struggles with occasional lapses in logic, clarity issues, and insufficiently connected ideas, leading to a somewhat disjointed presentation.
<score>3</score> - The summary displays a reasonable level of coherence with a logical sequence, yet occasional disruptions in flow and clarity, requiring some improvements for a smoother transition between ideas.
<score>4</score> - The summary demonstrates strong coherence, maintaining a clear and organized flow with effective transitions and minimal inconsistencies, effectively conveying main points with clarity and precision.
<score>5</score> - The summary showcases exceptional coherence with a flawless logical flow, impeccable transitions, and consistent clarity, presenting information in an impeccably organized and easily comprehensible manner.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all pieces of information in the summary.
2. Check if everything is presented in a clear and logical order. Give a clear step-by-step explanation of what you found and what is lacking.
3. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation on coherence of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Render specification groups as plain text for insertion into a prompt.

    Each group contributes a "<key>:" heading line followed by one
    "- <key>: <value>" line per entry of its 'values' list. Every line ends
    with a newline, and groups are joined with an extra newline so a blank
    line separates them.

    Each group is indexed with ['key'] and ['values'], and each entry with
    ['key'] and ['value'], so a missing key raises KeyError.
    """
    rendered_groups = []
    for group in specifications:
        lines = [f"{group['key']}:\n"]
        lines.extend(f"- {entry['key']}: {entry['value']}\n" for entry in group['values'])
        rendered_groups.append("".join(lines))
    return "\n".join(rendered_groups)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the coherence-evaluation prompt for one product record.

    `product` is read with .get(). An absent or empty description,
    key-feature list, specification list or review list is rendered as
    "N/A"; list fields are otherwise newline-joined (specifications go
    through format_specifications). The title and summary default to an
    empty string, and the average rating falls back to "N/A" only when the
    "averageRating" key is absent.

    Returns coherence_prompt_template with all placeholders filled in.
    """
    raw_description = product.get("description")
    raw_features = product.get("key_features")
    raw_specs = product.get("specifications")
    raw_reviews = product.get("reviews")

    return coherence_prompt_template.format(
        product_title=product.get("product_title", ""),
        description=raw_description if raw_description else "N/A",
        key_features="\n".join(raw_features) if raw_features else "N/A",
        specifications=format_specifications(raw_specs) if raw_specs else "N/A",
        reviews="\n".join(raw_reviews) if raw_reviews else "N/A",
        average_rating=product.get("averageRating", "N/A"),
        Product_Opinion_Summary=product.get("product_opinion_summary", ""),
    )

# Sampling parameters: reuses the global `sampling_params` created in the "Init vLLM" cell


# Process products in batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_index, offset in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[offset:offset + batch_size]

    # Build one prompt per product, then generate for the whole batch in one call
    prompts = [evaluate_opinion_summary(item) for item in batch_products]
    outputs = model.generate(prompts, sampling_params)

    # outputs[pos] is paired with batch_products[pos]; every completion under
    # .outputs becomes its own numbered entry in the results
    for pos, request_output in enumerate(outputs):
        title = batch_products[pos].get("product_title", "")
        evaluation_results.extend(
            f"**{title} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {batch_index} of {total_batches}")

# Save the evaluation results to a text file
with open("4o_co_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))




# Faithfulness

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding (results are written back as UTF-8 too).
with open("4o.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt template for the faithfulness metric, written with Llama-3-style chat
# header tokens: a single user turn followed by an open assistant turn.
# The placeholders {product_title}, {description}, {key_features},
# {specifications}, {reviews}, {average_rating} and {Product_Opinion_Summary}
# are filled in with str.format() by evaluate_opinion_summary.
# The model is asked to list unsupported statements first and then emit a
# 1-5 score inside <score></score> tags.
# NOTE(review): the template hard-codes <|begin_of_text|>; if the tokenizer
# used by model.generate() also prepends a BOS token, the prompt would start
# with two of them - TODO confirm.
faithfulness_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Faithfulness - Faithfulness measures the extent to which every piece of information mentioned in the summary is verifiable, supported, present, or can be reasonably inferred from the input. The input includes product title, description, key features, specifications, reviews, and average rating. Summaries should be penalized if they contain information that cannot be verified from the provided input or if they make broad generalizations that are not supported by the input data.

Following are the scores and the evaluation criteria according to which scores must be assigned.

<score>1</score> - The summary is for a different product and is irrelevant/unrelated to the given set of information.
<score>2</score> - The summary contains very few facts actually verifiable/supported/present/inferred from the set of information and contains a lot of hallucinated facts.
<score>3</score> - The summary contains more than one piece of information that is not verifiable/present/inferred from the set of information.
<score>4</score> - The summary contains only one piece of information that is not verifiable/supported/present/inferred from the the set of information.
<score>5</score> - Every piece of information present in the summary is verifiable/supported/present/inferred from the set of information.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all pieces of information in the summary that is not verifiable/supported/present/inferred from the set of information. Give a clear step-by-step explanation of what you found.
2. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on faithfulness and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Render specification groups as plain text for insertion into a prompt.

    Each group contributes a "<key>:" heading line followed by one
    "- <key>: <value>" line per entry of its 'values' list. Every line ends
    with a newline, and groups are joined with an extra newline so a blank
    line separates them.

    Each group is indexed with ['key'] and ['values'], and each entry with
    ['key'] and ['value'], so a missing key raises KeyError.
    """
    rendered_groups = []
    for group in specifications:
        lines = [f"{group['key']}:\n"]
        lines.extend(f"- {entry['key']}: {entry['value']}\n" for entry in group['values'])
        rendered_groups.append("".join(lines))
    return "\n".join(rendered_groups)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the faithfulness-evaluation prompt for one product record.

    `product` is read with .get(). An absent or empty description,
    key-feature list, specification list or review list is rendered as
    "N/A"; list fields are otherwise newline-joined (specifications go
    through format_specifications). The title and summary default to an
    empty string, and the average rating falls back to "N/A" only when the
    "averageRating" key is absent.

    Returns faithfulness_prompt_template with all placeholders filled in.
    """
    raw_description = product.get("description")
    raw_features = product.get("key_features")
    raw_specs = product.get("specifications")
    raw_reviews = product.get("reviews")

    return faithfulness_prompt_template.format(
        product_title=product.get("product_title", ""),
        description=raw_description if raw_description else "N/A",
        key_features="\n".join(raw_features) if raw_features else "N/A",
        specifications=format_specifications(raw_specs) if raw_specs else "N/A",
        reviews="\n".join(raw_reviews) if raw_reviews else "N/A",
        average_rating=product.get("averageRating", "N/A"),
        Product_Opinion_Summary=product.get("product_opinion_summary", ""),
    )

# Sampling parameters: reuses the global `sampling_params` created in the "Init vLLM" cell


# Process products in batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_index, offset in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[offset:offset + batch_size]

    # Build one prompt per product, then generate for the whole batch in one call
    prompts = [evaluate_opinion_summary(item) for item in batch_products]
    outputs = model.generate(prompts, sampling_params)

    # outputs[pos] is paired with batch_products[pos]; every completion under
    # .outputs becomes its own numbered entry in the results
    for pos, request_output in enumerate(outputs):
        title = batch_products[pos].get("product_title", "")
        evaluation_results.extend(
            f"**{title} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {batch_index} of {total_batches}")

# Save the evaluation results to a text file
with open("4o_fa_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))


    

# Relevance

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding (results are written back as UTF-8 too).
with open("4o.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt template for the relevance metric, written with Llama-3-style chat
# header tokens: a single user turn followed by an open assistant turn.
# The placeholders {product_title}, {description}, {key_features},
# {specifications}, {reviews}, {average_rating} and {Product_Opinion_Summary}
# are filled in with str.format() by evaluate_opinion_summary.
# The model is asked to enumerate important opinions and redundant details
# first and then emit a 1-5 score inside <score></score> tags.
# NOTE(review): the template hard-codes <|begin_of_text|>; if the tokenizer
# used by model.generate() also prepends a BOS token, the prompt would start
# with two of them - TODO confirm.
relevance_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Relevance - Relevance measures the selection of important information from the input, including product title, description, key features, specifications, reviews, and average rating. The summary should include only important and relevant information from the input. Summaries should not contain redundancies or excess information.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - The summary misses all the important opinions majorly discussed in the set of information.
<score>2</score> - The summary misses most of the important opinions majorly discussed in the set of information or mostly has redundant/excess/unimportant details
<score>3</score> - The summary covers around half of the important opinions majorly discussed in the set of information. or contains redundant/excess/unimportant details.
<score>4</score> - The summary covers most of the important opinions majorly discussed in the set of information and has very less amount of redundant/excess/unimportant details.
<score>5</score> - The summary covers all the important opinions majorly discussed in the set of information and has no redundant/excess/unimportant details.

Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify all the important opinions majorly discussed in the set of information and list them with numbering.
2. Identify the important opinions present in the summary and list them with numbering.
3. Next identify how many important opinions are present in both summary and the set of information and list them with numbering
4. Next idenify the how many redundant/excess/unimportant details does the summary have and list them with numbering.
5. Finally use the evaluation criteria to output only a single score within <score></score> tags.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on relevance and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Render specification groups as plain text for insertion into a prompt.

    Each group contributes a "<key>:" heading line followed by one
    "- <key>: <value>" line per entry of its 'values' list. Every line ends
    with a newline, and groups are joined with an extra newline so a blank
    line separates them.

    Each group is indexed with ['key'] and ['values'], and each entry with
    ['key'] and ['value'], so a missing key raises KeyError.
    """
    rendered_groups = []
    for group in specifications:
        lines = [f"{group['key']}:\n"]
        lines.extend(f"- {entry['key']}: {entry['value']}\n" for entry in group['values'])
        rendered_groups.append("".join(lines))
    return "\n".join(rendered_groups)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the relevance-evaluation prompt for one product record.

    `product` is read with .get(). An absent or empty description,
    key-feature list, specification list or review list is rendered as
    "N/A"; list fields are otherwise newline-joined (specifications go
    through format_specifications). The title and summary default to an
    empty string, and the average rating falls back to "N/A" only when the
    "averageRating" key is absent.

    Returns relevance_prompt_template with all placeholders filled in.
    """
    raw_description = product.get("description")
    raw_features = product.get("key_features")
    raw_specs = product.get("specifications")
    raw_reviews = product.get("reviews")

    return relevance_prompt_template.format(
        product_title=product.get("product_title", ""),
        description=raw_description if raw_description else "N/A",
        key_features="\n".join(raw_features) if raw_features else "N/A",
        specifications=format_specifications(raw_specs) if raw_specs else "N/A",
        reviews="\n".join(raw_reviews) if raw_reviews else "N/A",
        average_rating=product.get("averageRating", "N/A"),
        Product_Opinion_Summary=product.get("product_opinion_summary", ""),
    )

# Sampling parameters: reuses the global `sampling_params` created in the "Init vLLM" cell


# Process products in batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_index, offset in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[offset:offset + batch_size]

    # Build one prompt per product, then generate for the whole batch in one call
    prompts = [evaluate_opinion_summary(item) for item in batch_products]
    outputs = model.generate(prompts, sampling_params)

    # outputs[pos] is paired with batch_products[pos]; every completion under
    # .outputs becomes its own numbered entry in the results
    for pos, request_output in enumerate(outputs):
        title = batch_products[pos].get("product_title", "")
        evaluation_results.extend(
            f"**{title} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {batch_index} of {total_batches}")

# Save the evaluation results to a text file
with open("4o_re_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))




# Sentiment Consistency

In [None]:
import json

# Load the JSON file containing the product information and opinion summaries

# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding (results are written back as UTF-8 too).
with open("4o.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt template for the sentiment-consistency metric, written with
# Llama-3-style chat header tokens: a single user turn followed by an open
# assistant turn. Only {reviews}, {average_rating} and
# {Product_Opinion_Summary} are used; they are filled in with str.format()
# by evaluate_opinion_summary.
# The model is asked to compare per-aspect sentiment between summary and
# reviews and then emit a 1-5 score inside <score></score> tags.
# NOTE(review): the template hard-codes <|begin_of_text|>; if the tokenizer
# used by model.generate() also prepends a BOS token, the prompt would start
# with two of them - TODO confirm.
# NOTE(review): this template ends with a single newline after the assistant
# header, whereas the other metric templates in this file end with blank
# lines - confirm which form is intended.
sentiment_consistency_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description: 
You will be given a set of information such as reviews, and average rating and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Sentiment Consistency - Sentiment Consistency measures how accurately the summary reflects the consensus sentiment of users for each aspect of the product as expressed in the reviews. The consensus sentiment (or majority sentiment) for an aspect is determined by the most common sentiment expressed by users, categorized as very positive, positive, neutral, negative, or very negative.

Following are the scores and the evaluation criteria according to which scores must be assigned.
<score>1</score> - None of the aspects present in summary have the same majority sentiment as in reviews.
<score>2</score> - Very few of the aspects present in summary have the same majority sentiment as in reviews.
<score>3</score> - Only around half of the aspects present in summary have the same majority sentiment as in reviews.
<score>4</score> - Most of the aspects present in summary have the same majority sentiment as in reviews.
<score>5</score> - All aspects present in summary have the same majority sentiment as in reviews.


Product Reviews: {reviews}

Average Rating: {average_rating}

Summary to Evaluate: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Identify the aspects and their sentiment present in the summary and list them with numbering.
2. For the list of aspects identified, identify the majority sentiment from the reviews and list them with numbering.
3. Next identify how many aspect and sentiment match between reviews and summary from above and list them with numbering.
4. Finally use the previous information to output only a single score within <score></score> tags only using the evaluation criteria.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on sentiment preservation of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>
'''

# Function to evaluate a single opinion summary

def evaluate_opinion_summary(product):
    """Build the sentiment-consistency prompt for one product record.

    Only the reviews, average rating and opinion summary are used. An absent
    or empty review list is rendered as "N/A", otherwise the reviews are
    newline-joined. The average rating falls back to "N/A" only when the
    "averageRating" key is absent, and the summary defaults to "".

    Returns sentiment_consistency_prompt_template with placeholders filled in.
    """
    raw_reviews = product.get("reviews")

    return sentiment_consistency_prompt_template.format(
        reviews="\n".join(raw_reviews) if raw_reviews else "N/A",
        average_rating=product.get("averageRating", "N/A"),
        Product_Opinion_Summary=product.get("product_opinion_summary", ""),
    )


# Sampling parameters: reuses the global `sampling_params` created in the "Init vLLM" cell



# Process products in batches

batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_index, offset in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[offset:offset + batch_size]

    # Build one prompt per product, then generate for the whole batch in one call
    prompts = [evaluate_opinion_summary(item) for item in batch_products]
    outputs = model.generate(prompts, sampling_params)

    # Entries are labelled with the product's 1-based position in `products`
    # (not its title); every completion under .outputs gets its own entry
    for product_no, request_output in enumerate(outputs, start=offset + 1):
        evaluation_results.extend(
            f"**Product {product_no} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {batch_index} of {total_batches}")

# Save the evaluation results to a text file
with open("4o_sc_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))

# Specificity

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding (results are written back as UTF-8 too).
with open("4o.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# Prompt template for the specificity metric, written with Llama-3-style chat
# header tokens: a single user turn followed by an open assistant turn.
# The placeholders {product_title}, {description}, {key_features},
# {specifications}, {reviews}, {average_rating} and {Product_Opinion_Summary}
# are filled in with str.format() by evaluate_opinion_summary.
# The model is asked to classify each opinion as specific or generic and then
# emit a 1-5 score inside <score></score> tags.
# NOTE(review): the template hard-codes <|begin_of_text|>; if the tokenizer
# used by model.generate() also prepends a BOS token, the prompt would start
# with two of them - TODO confirm.
specificity_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Task Description:
You will be given a set of information such as product title, description, key features, specifications, reviews, and average rating.  and a corresponding summary. Make sure you understand the following evaluation metric very clearly. Your task is to rate the summary corresponding to the given set of information on the evaluation criteria.

Evaluation Criteria:
Generic Opinion example: The battery is good.
Specific Opinion example: The battery lasts for more than 12 hours on a single charge.

Specificity - Specificity measures the level of detail and precision in the information and opinions presented in the summary. A specific summary provides concrete facts, measurements, or detailed descriptions about the product's features, performance, and user experiences. It avoids vague or general statements and instead offers precise information that gives readers a clear and thorough understanding of the product's characteristics and performance. 

Summaries should be penalized for including vague or generic statements and rewarded for providing detailed, precise information about the product and user experiences.

<score>1</score> - All the opinions presented in the summary are generic.
<score>2</score> - Most of the opinions presented are generic.
<score>3</score> - Only around half of the opinions presented are specific.
<score>4</score> - Most of the opinions presented in the summary are specific. Very few opinions are generic.
<score>5</score> - All the opinions presented in the summary are specific 


Product Title: {product_title}

Description: {description}

Key Features: {key_features}

Specifications: {specifications}

Reviews: {reviews}

Average Rating: {average_rating}

Summary: {Product_Opinion_Summary}

Instructions:
Let's go step-by-step. Follow the following steps strictly while giving the response:

1. Go through the summary and list down all the opinions presented.
2. Check if details are presented for the opinions. Classify each opinion as specific or generic.
3. Count the number of generic and specific occurrences.
4. Finally use the previous information to output only a single score within <score></score> tags only using the evaluation criteria.

Note: Strictly give the score within <score></score> tags only e.g Score- <score>5</score>.

First give a detailed explanation only on specificity of the summary and then finally give a single score following the format: Score- <score>5</score> <|eot_id|><|start_header_id|>assistant<|end_header_id|>


'''

# Function to format specifications
def format_specifications(specifications):
    """Render specification groups as plain text for insertion into a prompt.

    Each group contributes a "<key>:" heading line followed by one
    "- <key>: <value>" line per entry of its 'values' list. Every line ends
    with a newline, and groups are joined with an extra newline so a blank
    line separates them.

    Each group is indexed with ['key'] and ['values'], and each entry with
    ['key'] and ['value'], so a missing key raises KeyError.
    """
    rendered_groups = []
    for group in specifications:
        lines = [f"{group['key']}:\n"]
        lines.extend(f"- {entry['key']}: {entry['value']}\n" for entry in group['values'])
        rendered_groups.append("".join(lines))
    return "\n".join(rendered_groups)

# Function to evaluate a single opinion summary
def evaluate_opinion_summary(product):
    """Build the specificity-evaluation prompt for one product record.

    `product` is read with .get(). An absent or empty description,
    key-feature list, specification list or review list is rendered as
    "N/A"; list fields are otherwise newline-joined (specifications go
    through format_specifications). The title and summary default to an
    empty string, and the average rating falls back to "N/A" only when the
    "averageRating" key is absent.

    Returns specificity_prompt_template with all placeholders filled in.
    """
    raw_description = product.get("description")
    raw_features = product.get("key_features")
    raw_specs = product.get("specifications")
    raw_reviews = product.get("reviews")

    return specificity_prompt_template.format(
        product_title=product.get("product_title", ""),
        description=raw_description if raw_description else "N/A",
        key_features="\n".join(raw_features) if raw_features else "N/A",
        specifications=format_specifications(raw_specs) if raw_specs else "N/A",
        reviews="\n".join(raw_reviews) if raw_reviews else "N/A",
        average_rating=product.get("averageRating", "N/A"),
        Product_Opinion_Summary=product.get("product_opinion_summary", ""),
    )

# Sampling parameters: reuses the global `sampling_params` created in the "Init vLLM" cell


# Process products in batches
batch_size = 5
evaluation_results = []
total_batches = (len(products) - 1) // batch_size + 1

for batch_index, offset in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[offset:offset + batch_size]

    # Build one prompt per product, then generate for the whole batch in one call
    prompts = [evaluate_opinion_summary(item) for item in batch_products]
    outputs = model.generate(prompts, sampling_params)

    # outputs[pos] is paired with batch_products[pos]; every completion under
    # .outputs becomes its own numbered entry in the results
    for pos, request_output in enumerate(outputs):
        title = batch_products[pos].get("product_title", "")
        evaluation_results.extend(
            f"**{title} - Evaluation {sample_no}**\n\nResponse: {completion.text}\n"
            for sample_no, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {batch_index} of {total_batches}")

# Save the evaluation results to a text file
with open("4o_sp_tj_dep.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(evaluation_results))
    

    

# Ours_Dep

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding (results are written back as UTF-8 too).
with open("m3_50.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# System message for the sentiment-consistency evaluator. It is interpolated
# into the system turn of sentiment_consistency_prompt_template (an f-string)
# defined right after it.
# NOTE(review): the resulting template is later passed through str.format(),
# so this text must not contain literal "{" or "}" characters.
system_msg = """You are a highly skilled expert in evaluating the sentiment consistency of product opinion summaries. Your expertise lies in analyzing summaries created from product reviews and ratings.

Your primary responsibilities are:

1. Carefully examine the provided product reviews and summary.
2. Meticulously follow all instructions in the prompt faithfully and truthfully.
3. Evaluate the summary's sentiment consistency with utmost accuracy and impartiality.
4. Assign a single score (1-5) based on the quality of sentiment consistency, adhering strictly to the given evaluation criteria.
5. Follow the specified format for all responses.

Your expert evaluation is crucial for maintaining the accuracy of sentiment representation in product summaries. Approach each evaluation with diligence and attention to detail.
"""

# Prompt template for the sentiment-consistency metric with a system turn,
# a user turn and an open assistant turn (Llama-3-style header tokens).
# This is an f-string: {system_msg} is substituted immediately, while the
# doubled braces ({{reviews}}, {{average_rating}},
# {{Product_Opinion_Summary}}) collapse to single-brace placeholders that
# evaluate_sentiment_consistency fills in later with str.format().
# The rubric maps the percentage of consistent aspects onto a 1-5 score that
# the model must emit inside <score></score> tags.
# NOTE(review): the template hard-codes <|begin_of_text|>; if the tokenizer
# used by model.generate() also prepends a BOS token, the prompt would start
# with two of them - TODO confirm.
sentiment_consistency_prompt_template = f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>

Task Description: 

You will be given product reviews and an average rating. Next, you will be provided with one summary created using this product information. Your task is to carefully follow each evaluation criterion and instruction and always provide a faithful, truthful, and accurate output in the specified format. You must evaluate and assign a single score ranging from 1 to 5, to the summary, according to the metric called sentiment consistency.

Evaluation Criteria:

Sentiment Consistency - Sentiment Consistency measures how accurately the summary reflects the consensus sentiment of users for each aspect of the product as expressed in the reviews. The consensus sentiment (or majority sentiment) for an aspect is determined by the most common sentiment expressed by users, categorized as very positive, positive, neutral, negative, or very negative.

<score>1</score> - Very Poor (Completely Inconsistent, 0-20% consistent)

- None of the aspects mentioned in the summary reflect the majority sentiment from the reviews.
- The summary completely misrepresents the overall sentiment of the product.
- Readers would get an entirely inaccurate impression of users' opinions.

<score>2</score> - Poor (Mostly Inconsistent, 21-50% consistent)

- Very few aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary largely misrepresents the sentiments expressed in the reviews.
- Readers would get a mostly inaccurate impression of users' opinions.

<score>3</score> - OK (Partially Consistent, 51-70% consistent)

- About half to two-thirds of the aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary partially represents the sentiments expressed in the reviews.
- Readers would get a mixed impression of users' opinions, with some accuracy.

<score>4</score> - Good (Mostly Consistent, 71-90% consistent)

- Most aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary largely represents the sentiments expressed in the reviews, with minor inconsistencies.
- Readers would get a mostly accurate impression of users' opinions.

<score>5</score> - Excellent (Completely Consistent, 91-100% consistent)

- All or nearly all aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary perfectly represents the sentiments expressed in the reviews.
- Readers would get a fully accurate impression of users' opinions.


Product Reviews: {{reviews}}

Average Rating: {{average_rating}}

Summary to Evaluate: {{Product_Opinion_Summary}}

Instructions:

Follow these steps strictly:

Step 1. Identify all aspects of the product mentioned in the reviews and determine the majority sentiment for each aspect. List these with numbering.
Step 2. Identify all aspects and their associated sentiments mentioned in the summary. List these with numbering.
Step 3. Compare the aspects and sentiments from steps 1 and 2. Identify which aspects in the summary accurately reflect the majority sentiment from the reviews. List these with numbering.
Step 4. Identify any aspects in the summary that misrepresent or fail to accurately convey the majority sentiment from the reviews. List these with numbering.
Step 5. Calculate the percentage of aspects in the summary that accurately reflect the majority sentiment from the reviews.
Step 6. Carefully match the observed sentiment consistency level to the descriptions in the evaluation criteria:

   - Completely Inconsistent (0-20% consistent): Score 1 (Very Poor)
   - Mostly Inconsistent (21-50% consistent): Score 2 (Poor)
   - Partially Consistent (51-70% consistent): Score 3 (OK)
   - Mostly Consistent (71-90% consistent): Score 4 (Good)
   - Completely Consistent (91-100% consistent): Score 5 (Excellent)

Step 7. Assign a score based on the sentiment consistency level, using the exact format shown in the evaluation criteria.

Your response should follow this structure:

1. Provide a detailed explanation of the sentiment consistency level of the summary, including specific examples of accurately and inaccurately represented sentiments for different aspects.
2. Clearly state which sentiment consistency level this falls into, referencing the evaluation criteria.
3. Assign a single score based on the sentiment consistency level, using the exact format shown below.

Score format: Score- <score>X</score>
Where X is the assigned score (1, 2, 3, 4, or 5) based on the sentiment consistency levels in the evaluation criteria.

Remember, your final score must always be presented in this exact format, with no deviations. <|eot_id|><|start_header_id|>assistant<|end_header_id|>
'''

# Prompt builder for a single opinion summary
def evaluate_sentiment_consistency(product):
    """Return the sentiment-consistency prompt for one product record.

    Reads ``reviews``, ``averageRating`` and ``product_opinion_summary`` from
    ``product`` with ``.get`` and substitutes them into the module-level
    ``sentiment_consistency_prompt_template``. A missing or empty ``reviews``
    entry and a missing rating are rendered as "N/A"; a missing summary is
    rendered as an empty string. No model call happens here.
    """
    review_items = product.get("reviews")
    if review_items:
        # One review per line in the prompt
        reviews_text = "\n".join(review_items)
    else:
        reviews_text = "N/A"

    fields = {
        "reviews": reviews_text,
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return sentiment_consistency_prompt_template.format(**fields)

# Sampling parameters are not redefined here: generation below reuses the
# `sampling_params` object created in the vLLM init cell.

# Batch configuration and result accumulator
batch_size = 5
evaluation_results = []
# Ceiling division written so that an empty product list yields 0 batches
total_batches = (len(products) - 1) // batch_size + 1

# Start timing
start_time = time.time()

for batch_number, offset in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[offset:offset + batch_size]

    # One prompt per product in this slice
    prompts = [evaluate_sentiment_consistency(entry) for entry in batch_products]

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # The result at each position is paired with the product at the same
    # position; every completion gets a 1-based evaluation number
    for position, request_output in enumerate(outputs):
        title = batch_products[position].get("product_title", "")
        evaluation_results.extend(
            f"**{title} - Evaluation {sample_number}**\n\nResponse: {completion.text}\n"
            for sample_number, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {batch_number} of {total_batches}")

# Stop timing and compute the elapsed seconds
end_time = time.time()
execution_time = end_time - start_time

# Save the evaluation results to a text file
with open("m3_sc_our_dep.txt", "w", encoding="utf-8") as results_file:
    results_file.write("\n".join(evaluation_results))

print(f"Total execution time: {execution_time:.2f} seconds")

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is pinned to UTF-8 so non-ASCII review text is decoded the same
# way on every platform instead of depending on the locale default; the results
# file written at the end of this cell is UTF-8 as well.
with open("m2.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# System message for the sentiment-consistency judge. It is interpolated into
# the system turn of `sentiment_consistency_prompt_template` at the moment that
# f-string is defined, so it must be assigned before the template.
system_msg = """You are a highly skilled expert in evaluating the sentiment consistency of product opinion summaries. Your expertise lies in analyzing summaries created from product reviews and ratings.

Your primary responsibilities are:

1. Carefully examine the provided product reviews and summary.
2. Meticulously follow all instructions in the prompt faithfully and truthfully.
3. Evaluate the summary's sentiment consistency with utmost accuracy and impartiality.
4. Assign a single score (1-5) based on the quality of sentiment consistency, adhering strictly to the given evaluation criteria.
5. Follow the specified format for all responses.

Your expert evaluation is crucial for maintaining the accuracy of sentiment representation in product summaries. Approach each evaluation with diligence and attention to detail.
"""

# Prompt template for the sentiment-consistency metric, laid out with chat
# header tokens: a system turn, a user turn, then an open assistant turn.
# Substitution happens in two stages:
#   1. this f-string inserts `system_msg` immediately, and
#   2. the doubled braces around reviews, average_rating and
#      Product_Opinion_Summary collapse to single-brace fields that
#      evaluate_sentiment_consistency() fills per product via str.format().
# Because of stage 2, `system_msg` must not contain literal braces: they would
# be parsed as format fields by the later .format() call.
# NOTE(review): the text hard-codes <|begin_of_text|>; if the tokenizer used by
# model.generate() also prepends a BOS token the prompt would start with two —
# confirm against the vLLM tokenizer settings.
sentiment_consistency_prompt_template = f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>

Task Description: 

You will be given product reviews and an average rating. Next, you will be provided with one summary created using this product information. Your task is to carefully follow each evaluation criterion and instruction and always provide a faithful, truthful, and accurate output in the specified format. You must evaluate and assign a single score ranging from 1 to 5, to the summary, according to the metric called sentiment consistency.

Evaluation Criteria:

Sentiment Consistency - Sentiment Consistency measures how accurately the summary reflects the consensus sentiment of users for each aspect of the product as expressed in the reviews. The consensus sentiment (or majority sentiment) for an aspect is determined by the most common sentiment expressed by users, categorized as very positive, positive, neutral, negative, or very negative.

<score>1</score> - Very Poor (Completely Inconsistent, 0-20% consistent)

- None of the aspects mentioned in the summary reflect the majority sentiment from the reviews.
- The summary completely misrepresents the overall sentiment of the product.
- Readers would get an entirely inaccurate impression of users' opinions.

<score>2</score> - Poor (Mostly Inconsistent, 21-50% consistent)

- Very few aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary largely misrepresents the sentiments expressed in the reviews.
- Readers would get a mostly inaccurate impression of users' opinions.

<score>3</score> - OK (Partially Consistent, 51-70% consistent)

- About half to two-thirds of the aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary partially represents the sentiments expressed in the reviews.
- Readers would get a mixed impression of users' opinions, with some accuracy.

<score>4</score> - Good (Mostly Consistent, 71-90% consistent)

- Most aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary largely represents the sentiments expressed in the reviews, with minor inconsistencies.
- Readers would get a mostly accurate impression of users' opinions.

<score>5</score> - Excellent (Completely Consistent, 91-100% consistent)

- All or nearly all aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary perfectly represents the sentiments expressed in the reviews.
- Readers would get a fully accurate impression of users' opinions.


Product Reviews: {{reviews}}

Average Rating: {{average_rating}}

Summary to Evaluate: {{Product_Opinion_Summary}}

Instructions:

Follow these steps strictly:

Step 1. Identify all aspects of the product mentioned in the reviews and determine the majority sentiment for each aspect. List these with numbering.
Step 2. Identify all aspects and their associated sentiments mentioned in the summary. List these with numbering.
Step 3. Compare the aspects and sentiments from steps 1 and 2. Identify which aspects in the summary accurately reflect the majority sentiment from the reviews. List these with numbering.
Step 4. Identify any aspects in the summary that misrepresent or fail to accurately convey the majority sentiment from the reviews. List these with numbering.
Step 5. Calculate the percentage of aspects in the summary that accurately reflect the majority sentiment from the reviews.
Step 6. Carefully match the observed sentiment consistency level to the descriptions in the evaluation criteria:

   - Completely Inconsistent (0-20% consistent): Score 1 (Very Poor)
   - Mostly Inconsistent (21-50% consistent): Score 2 (Poor)
   - Partially Consistent (51-70% consistent): Score 3 (OK)
   - Mostly Consistent (71-90% consistent): Score 4 (Good)
   - Completely Consistent (91-100% consistent): Score 5 (Excellent)

Step 7. Assign a score based on the sentiment consistency level, using the exact format shown in the evaluation criteria.

Your response should follow this structure:

1. Provide a detailed explanation of the sentiment consistency level of the summary, including specific examples of accurately and inaccurately represented sentiments for different aspects.
2. Clearly state which sentiment consistency level this falls into, referencing the evaluation criteria.
3. Assign a single score based on the sentiment consistency level, using the exact format shown below.

Score format: Score- <score>X</score>
Where X is the assigned score (1, 2, 3, 4, or 5) based on the sentiment consistency levels in the evaluation criteria.

Remember, your final score must always be presented in this exact format, with no deviations. <|eot_id|><|start_header_id|>assistant<|end_header_id|>
'''

# Prompt builder for a single opinion summary
def evaluate_sentiment_consistency(product):
    """Return the sentiment-consistency prompt for one product record.

    Reads ``reviews``, ``averageRating`` and ``product_opinion_summary`` from
    ``product`` with ``.get`` and substitutes them into the module-level
    ``sentiment_consistency_prompt_template``. A missing or empty ``reviews``
    entry and a missing rating are rendered as "N/A"; a missing summary is
    rendered as an empty string. No model call happens here.
    """
    review_items = product.get("reviews")
    if review_items:
        # One review per line in the prompt
        reviews_text = "\n".join(review_items)
    else:
        reviews_text = "N/A"

    fields = {
        "reviews": reviews_text,
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return sentiment_consistency_prompt_template.format(**fields)

# Sampling parameters are not redefined here: generation below reuses the
# `sampling_params` object created in the vLLM init cell.

# Batch configuration and result accumulator
batch_size = 5
evaluation_results = []
# Ceiling division written so that an empty product list yields 0 batches
total_batches = (len(products) - 1) // batch_size + 1

# Start timing
start_time = time.time()

for batch_number, offset in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[offset:offset + batch_size]

    # One prompt per product in this slice
    prompts = [evaluate_sentiment_consistency(entry) for entry in batch_products]

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # The result at each position is paired with the product at the same
    # position; every completion gets a 1-based evaluation number
    for position, request_output in enumerate(outputs):
        title = batch_products[position].get("product_title", "")
        evaluation_results.extend(
            f"**{title} - Evaluation {sample_number}**\n\nResponse: {completion.text}\n"
            for sample_number, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {batch_number} of {total_batches}")

# Stop timing and compute the elapsed seconds
end_time = time.time()
execution_time = end_time - start_time

# Save the evaluation results to a text file
with open("m2_sc_our_dep.txt", "w", encoding="utf-8") as results_file:
    results_file.write("\n".join(evaluation_results))

print(f"Total execution time: {execution_time:.2f} seconds")

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is pinned to UTF-8 so non-ASCII review text is decoded the same
# way on every platform instead of depending on the locale default; the results
# file written at the end of this cell is UTF-8 as well.
with open("l.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# System message for the sentiment-consistency judge. It is interpolated into
# the system turn of `sentiment_consistency_prompt_template` at the moment that
# f-string is defined, so it must be assigned before the template.
system_msg = """You are a highly skilled expert in evaluating the sentiment consistency of product opinion summaries. Your expertise lies in analyzing summaries created from product reviews and ratings.

Your primary responsibilities are:

1. Carefully examine the provided product reviews and summary.
2. Meticulously follow all instructions in the prompt faithfully and truthfully.
3. Evaluate the summary's sentiment consistency with utmost accuracy and impartiality.
4. Assign a single score (1-5) based on the quality of sentiment consistency, adhering strictly to the given evaluation criteria.
5. Follow the specified format for all responses.

Your expert evaluation is crucial for maintaining the accuracy of sentiment representation in product summaries. Approach each evaluation with diligence and attention to detail.
"""

# Prompt template for the sentiment-consistency metric, laid out with chat
# header tokens: a system turn, a user turn, then an open assistant turn.
# Substitution happens in two stages:
#   1. this f-string inserts `system_msg` immediately, and
#   2. the doubled braces around reviews, average_rating and
#      Product_Opinion_Summary collapse to single-brace fields that
#      evaluate_sentiment_consistency() fills per product via str.format().
# Because of stage 2, `system_msg` must not contain literal braces: they would
# be parsed as format fields by the later .format() call.
# NOTE(review): the text hard-codes <|begin_of_text|>; if the tokenizer used by
# model.generate() also prepends a BOS token the prompt would start with two —
# confirm against the vLLM tokenizer settings.
sentiment_consistency_prompt_template = f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>

Task Description: 

You will be given product reviews and an average rating. Next, you will be provided with one summary created using this product information. Your task is to carefully follow each evaluation criterion and instruction and always provide a faithful, truthful, and accurate output in the specified format. You must evaluate and assign a single score ranging from 1 to 5, to the summary, according to the metric called sentiment consistency.

Evaluation Criteria:

Sentiment Consistency - Sentiment Consistency measures how accurately the summary reflects the consensus sentiment of users for each aspect of the product as expressed in the reviews. The consensus sentiment (or majority sentiment) for an aspect is determined by the most common sentiment expressed by users, categorized as very positive, positive, neutral, negative, or very negative.

<score>1</score> - Very Poor (Completely Inconsistent, 0-20% consistent)

- None of the aspects mentioned in the summary reflect the majority sentiment from the reviews.
- The summary completely misrepresents the overall sentiment of the product.
- Readers would get an entirely inaccurate impression of users' opinions.

<score>2</score> - Poor (Mostly Inconsistent, 21-50% consistent)

- Very few aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary largely misrepresents the sentiments expressed in the reviews.
- Readers would get a mostly inaccurate impression of users' opinions.

<score>3</score> - OK (Partially Consistent, 51-70% consistent)

- About half to two-thirds of the aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary partially represents the sentiments expressed in the reviews.
- Readers would get a mixed impression of users' opinions, with some accuracy.

<score>4</score> - Good (Mostly Consistent, 71-90% consistent)

- Most aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary largely represents the sentiments expressed in the reviews, with minor inconsistencies.
- Readers would get a mostly accurate impression of users' opinions.

<score>5</score> - Excellent (Completely Consistent, 91-100% consistent)

- All or nearly all aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary perfectly represents the sentiments expressed in the reviews.
- Readers would get a fully accurate impression of users' opinions.


Product Reviews: {{reviews}}

Average Rating: {{average_rating}}

Summary to Evaluate: {{Product_Opinion_Summary}}

Instructions:

Follow these steps strictly:

Step 1. Identify all aspects of the product mentioned in the reviews and determine the majority sentiment for each aspect. List these with numbering.
Step 2. Identify all aspects and their associated sentiments mentioned in the summary. List these with numbering.
Step 3. Compare the aspects and sentiments from steps 1 and 2. Identify which aspects in the summary accurately reflect the majority sentiment from the reviews. List these with numbering.
Step 4. Identify any aspects in the summary that misrepresent or fail to accurately convey the majority sentiment from the reviews. List these with numbering.
Step 5. Calculate the percentage of aspects in the summary that accurately reflect the majority sentiment from the reviews.
Step 6. Carefully match the observed sentiment consistency level to the descriptions in the evaluation criteria:

   - Completely Inconsistent (0-20% consistent): Score 1 (Very Poor)
   - Mostly Inconsistent (21-50% consistent): Score 2 (Poor)
   - Partially Consistent (51-70% consistent): Score 3 (OK)
   - Mostly Consistent (71-90% consistent): Score 4 (Good)
   - Completely Consistent (91-100% consistent): Score 5 (Excellent)

Step 7. Assign a score based on the sentiment consistency level, using the exact format shown in the evaluation criteria.

Your response should follow this structure:

1. Provide a detailed explanation of the sentiment consistency level of the summary, including specific examples of accurately and inaccurately represented sentiments for different aspects.
2. Clearly state which sentiment consistency level this falls into, referencing the evaluation criteria.
3. Assign a single score based on the sentiment consistency level, using the exact format shown below.

Score format: Score- <score>X</score>
Where X is the assigned score (1, 2, 3, 4, or 5) based on the sentiment consistency levels in the evaluation criteria.

Remember, your final score must always be presented in this exact format, with no deviations. <|eot_id|><|start_header_id|>assistant<|end_header_id|>
'''

# Prompt builder for a single opinion summary
def evaluate_sentiment_consistency(product):
    """Return the sentiment-consistency prompt for one product record.

    Reads ``reviews``, ``averageRating`` and ``product_opinion_summary`` from
    ``product`` with ``.get`` and substitutes them into the module-level
    ``sentiment_consistency_prompt_template``. A missing or empty ``reviews``
    entry and a missing rating are rendered as "N/A"; a missing summary is
    rendered as an empty string. No model call happens here.
    """
    review_items = product.get("reviews")
    if review_items:
        # One review per line in the prompt
        reviews_text = "\n".join(review_items)
    else:
        reviews_text = "N/A"

    fields = {
        "reviews": reviews_text,
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return sentiment_consistency_prompt_template.format(**fields)

# Sampling parameters are not redefined here: generation below reuses the
# `sampling_params` object created in the vLLM init cell.

# Batch configuration and result accumulator
batch_size = 5
evaluation_results = []
# Ceiling division written so that an empty product list yields 0 batches
total_batches = (len(products) - 1) // batch_size + 1

# Start timing
start_time = time.time()

for batch_number, offset in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[offset:offset + batch_size]

    # One prompt per product in this slice
    prompts = [evaluate_sentiment_consistency(entry) for entry in batch_products]

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # The result at each position is paired with the product at the same
    # position; every completion gets a 1-based evaluation number
    for position, request_output in enumerate(outputs):
        title = batch_products[position].get("product_title", "")
        evaluation_results.extend(
            f"**{title} - Evaluation {sample_number}**\n\nResponse: {completion.text}\n"
            for sample_number, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {batch_number} of {total_batches}")

# Stop timing and compute the elapsed seconds
end_time = time.time()
execution_time = end_time - start_time

# Save the evaluation results to a text file
with open("l_sc_our_dep.txt", "w", encoding="utf-8") as results_file:
    results_file.write("\n".join(evaluation_results))

print(f"Total execution time: {execution_time:.2f} seconds")

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is pinned to UTF-8 so non-ASCII review text is decoded the same
# way on every platform instead of depending on the locale default; the results
# file written at the end of this cell is UTF-8 as well.
with open("g.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# System message for the sentiment-consistency judge. It is interpolated into
# the system turn of `sentiment_consistency_prompt_template` at the moment that
# f-string is defined, so it must be assigned before the template.
system_msg = """You are a highly skilled expert in evaluating the sentiment consistency of product opinion summaries. Your expertise lies in analyzing summaries created from product reviews and ratings.

Your primary responsibilities are:

1. Carefully examine the provided product reviews and summary.
2. Meticulously follow all instructions in the prompt faithfully and truthfully.
3. Evaluate the summary's sentiment consistency with utmost accuracy and impartiality.
4. Assign a single score (1-5) based on the quality of sentiment consistency, adhering strictly to the given evaluation criteria.
5. Follow the specified format for all responses.

Your expert evaluation is crucial for maintaining the accuracy of sentiment representation in product summaries. Approach each evaluation with diligence and attention to detail.
"""

# Prompt template for the sentiment-consistency metric, laid out with chat
# header tokens: a system turn, a user turn, then an open assistant turn.
# Substitution happens in two stages:
#   1. this f-string inserts `system_msg` immediately, and
#   2. the doubled braces around reviews, average_rating and
#      Product_Opinion_Summary collapse to single-brace fields that
#      evaluate_sentiment_consistency() fills per product via str.format().
# Because of stage 2, `system_msg` must not contain literal braces: they would
# be parsed as format fields by the later .format() call.
# NOTE(review): the text hard-codes <|begin_of_text|>; if the tokenizer used by
# model.generate() also prepends a BOS token the prompt would start with two —
# confirm against the vLLM tokenizer settings.
sentiment_consistency_prompt_template = f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>

Task Description: 

You will be given product reviews and an average rating. Next, you will be provided with one summary created using this product information. Your task is to carefully follow each evaluation criterion and instruction and always provide a faithful, truthful, and accurate output in the specified format. You must evaluate and assign a single score ranging from 1 to 5, to the summary, according to the metric called sentiment consistency.

Evaluation Criteria:

Sentiment Consistency - Sentiment Consistency measures how accurately the summary reflects the consensus sentiment of users for each aspect of the product as expressed in the reviews. The consensus sentiment (or majority sentiment) for an aspect is determined by the most common sentiment expressed by users, categorized as very positive, positive, neutral, negative, or very negative.

<score>1</score> - Very Poor (Completely Inconsistent, 0-20% consistent)

- None of the aspects mentioned in the summary reflect the majority sentiment from the reviews.
- The summary completely misrepresents the overall sentiment of the product.
- Readers would get an entirely inaccurate impression of users' opinions.

<score>2</score> - Poor (Mostly Inconsistent, 21-50% consistent)

- Very few aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary largely misrepresents the sentiments expressed in the reviews.
- Readers would get a mostly inaccurate impression of users' opinions.

<score>3</score> - OK (Partially Consistent, 51-70% consistent)

- About half to two-thirds of the aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary partially represents the sentiments expressed in the reviews.
- Readers would get a mixed impression of users' opinions, with some accuracy.

<score>4</score> - Good (Mostly Consistent, 71-90% consistent)

- Most aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary largely represents the sentiments expressed in the reviews, with minor inconsistencies.
- Readers would get a mostly accurate impression of users' opinions.

<score>5</score> - Excellent (Completely Consistent, 91-100% consistent)

- All or nearly all aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary perfectly represents the sentiments expressed in the reviews.
- Readers would get a fully accurate impression of users' opinions.


Product Reviews: {{reviews}}

Average Rating: {{average_rating}}

Summary to Evaluate: {{Product_Opinion_Summary}}

Instructions:

Follow these steps strictly:

Step 1. Identify all aspects of the product mentioned in the reviews and determine the majority sentiment for each aspect. List these with numbering.
Step 2. Identify all aspects and their associated sentiments mentioned in the summary. List these with numbering.
Step 3. Compare the aspects and sentiments from steps 1 and 2. Identify which aspects in the summary accurately reflect the majority sentiment from the reviews. List these with numbering.
Step 4. Identify any aspects in the summary that misrepresent or fail to accurately convey the majority sentiment from the reviews. List these with numbering.
Step 5. Calculate the percentage of aspects in the summary that accurately reflect the majority sentiment from the reviews.
Step 6. Carefully match the observed sentiment consistency level to the descriptions in the evaluation criteria:

   - Completely Inconsistent (0-20% consistent): Score 1 (Very Poor)
   - Mostly Inconsistent (21-50% consistent): Score 2 (Poor)
   - Partially Consistent (51-70% consistent): Score 3 (OK)
   - Mostly Consistent (71-90% consistent): Score 4 (Good)
   - Completely Consistent (91-100% consistent): Score 5 (Excellent)

Step 7. Assign a score based on the sentiment consistency level, using the exact format shown in the evaluation criteria.

Your response should follow this structure:

1. Provide a detailed explanation of the sentiment consistency level of the summary, including specific examples of accurately and inaccurately represented sentiments for different aspects.
2. Clearly state which sentiment consistency level this falls into, referencing the evaluation criteria.
3. Assign a single score based on the sentiment consistency level, using the exact format shown below.

Score format: Score- <score>X</score>
Where X is the assigned score (1, 2, 3, 4, or 5) based on the sentiment consistency levels in the evaluation criteria.

Remember, your final score must always be presented in this exact format, with no deviations. <|eot_id|><|start_header_id|>assistant<|end_header_id|>
'''

# Prompt builder for a single opinion summary
def evaluate_sentiment_consistency(product):
    """Return the sentiment-consistency prompt for one product record.

    Reads ``reviews``, ``averageRating`` and ``product_opinion_summary`` from
    ``product`` with ``.get`` and substitutes them into the module-level
    ``sentiment_consistency_prompt_template``. A missing or empty ``reviews``
    entry and a missing rating are rendered as "N/A"; a missing summary is
    rendered as an empty string. No model call happens here.
    """
    review_items = product.get("reviews")
    if review_items:
        # One review per line in the prompt
        reviews_text = "\n".join(review_items)
    else:
        reviews_text = "N/A"

    fields = {
        "reviews": reviews_text,
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return sentiment_consistency_prompt_template.format(**fields)

# Sampling parameters are not redefined here: generation below reuses the
# `sampling_params` object created in the vLLM init cell.

# Batch configuration and result accumulator
batch_size = 5
evaluation_results = []
# Ceiling division written so that an empty product list yields 0 batches
total_batches = (len(products) - 1) // batch_size + 1

# Start timing
start_time = time.time()

for batch_number, offset in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[offset:offset + batch_size]

    # One prompt per product in this slice
    prompts = [evaluate_sentiment_consistency(entry) for entry in batch_products]

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # The result at each position is paired with the product at the same
    # position; every completion gets a 1-based evaluation number
    for position, request_output in enumerate(outputs):
        title = batch_products[position].get("product_title", "")
        evaluation_results.extend(
            f"**{title} - Evaluation {sample_number}**\n\nResponse: {completion.text}\n"
            for sample_number, completion in enumerate(request_output.outputs, start=1)
        )

    print(f"Processed batch {batch_number} of {total_batches}")

# Stop timing and compute the elapsed seconds
end_time = time.time()
execution_time = end_time - start_time

# Save the evaluation results to a text file
with open("g_sc_our_dep.txt", "w", encoding="utf-8") as results_file:
    results_file.write("\n".join(evaluation_results))

print(f"Total execution time: {execution_time:.2f} seconds")

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is pinned to UTF-8 so non-ASCII review text is decoded the same
# way on every platform instead of depending on the locale default; the results
# file written at the end of this cell is UTF-8 as well.
with open("v.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# System message for the sentiment-consistency judge. It is interpolated into
# the system turn of `sentiment_consistency_prompt_template` at the moment that
# f-string is defined, so it must be assigned before the template.
system_msg = """You are a highly skilled expert in evaluating the sentiment consistency of product opinion summaries. Your expertise lies in analyzing summaries created from product reviews and ratings.

Your primary responsibilities are:

1. Carefully examine the provided product reviews and summary.
2. Meticulously follow all instructions in the prompt faithfully and truthfully.
3. Evaluate the summary's sentiment consistency with utmost accuracy and impartiality.
4. Assign a single score (1-5) based on the quality of sentiment consistency, adhering strictly to the given evaluation criteria.
5. Follow the specified format for all responses.

Your expert evaluation is crucial for maintaining the accuracy of sentiment representation in product summaries. Approach each evaluation with diligence and attention to detail.
"""

# Prompt template for the sentiment-consistency metric, laid out with chat
# header tokens: a system turn, a user turn, then an open assistant turn.
# Substitution happens in two stages:
#   1. this f-string inserts `system_msg` immediately, and
#   2. the doubled braces around reviews, average_rating and
#      Product_Opinion_Summary collapse to single-brace fields that
#      evaluate_sentiment_consistency() fills per product via str.format().
# Because of stage 2, `system_msg` must not contain literal braces: they would
# be parsed as format fields by the later .format() call.
# NOTE(review): the text hard-codes <|begin_of_text|>; if the tokenizer used by
# model.generate() also prepends a BOS token the prompt would start with two —
# confirm against the vLLM tokenizer settings.
sentiment_consistency_prompt_template = f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>

Task Description: 

You will be given product reviews and an average rating. Next, you will be provided with one summary created using this product information. Your task is to carefully follow each evaluation criterion and instruction and always provide a faithful, truthful, and accurate output in the specified format. You must evaluate and assign a single score ranging from 1 to 5, to the summary, according to the metric called sentiment consistency.

Evaluation Criteria:

Sentiment Consistency - Sentiment Consistency measures how accurately the summary reflects the consensus sentiment of users for each aspect of the product as expressed in the reviews. The consensus sentiment (or majority sentiment) for an aspect is determined by the most common sentiment expressed by users, categorized as very positive, positive, neutral, negative, or very negative.

<score>1</score> - Very Poor (Completely Inconsistent, 0-20% consistent)

- None of the aspects mentioned in the summary reflect the majority sentiment from the reviews.
- The summary completely misrepresents the overall sentiment of the product.
- Readers would get an entirely inaccurate impression of users' opinions.

<score>2</score> - Poor (Mostly Inconsistent, 21-50% consistent)

- Very few aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary largely misrepresents the sentiments expressed in the reviews.
- Readers would get a mostly inaccurate impression of users' opinions.

<score>3</score> - OK (Partially Consistent, 51-70% consistent)

- About half to two-thirds of the aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary partially represents the sentiments expressed in the reviews.
- Readers would get a mixed impression of users' opinions, with some accuracy.

<score>4</score> - Good (Mostly Consistent, 71-90% consistent)

- Most aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary largely represents the sentiments expressed in the reviews, with minor inconsistencies.
- Readers would get a mostly accurate impression of users' opinions.

<score>5</score> - Excellent (Completely Consistent, 91-100% consistent)

- All or nearly all aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary perfectly represents the sentiments expressed in the reviews.
- Readers would get a fully accurate impression of users' opinions.


Product Reviews: {{reviews}}

Average Rating: {{average_rating}}

Summary to Evaluate: {{Product_Opinion_Summary}}

Instructions:

Follow these steps strictly:

Step 1. Identify all aspects of the product mentioned in the reviews and determine the majority sentiment for each aspect. List these with numbering.
Step 2. Identify all aspects and their associated sentiments mentioned in the summary. List these with numbering.
Step 3. Compare the aspects and sentiments from steps 1 and 2. Identify which aspects in the summary accurately reflect the majority sentiment from the reviews. List these with numbering.
Step 4. Identify any aspects in the summary that misrepresent or fail to accurately convey the majority sentiment from the reviews. List these with numbering.
Step 5. Calculate the percentage of aspects in the summary that accurately reflect the majority sentiment from the reviews.
Step 6. Carefully match the observed sentiment consistency level to the descriptions in the evaluation criteria:

   - Completely Inconsistent (0-20% consistent): Score 1 (Very Poor)
   - Mostly Inconsistent (21-50% consistent): Score 2 (Poor)
   - Partially Consistent (51-70% consistent): Score 3 (OK)
   - Mostly Consistent (71-90% consistent): Score 4 (Good)
   - Completely Consistent (91-100% consistent): Score 5 (Excellent)

Step 7. Assign a score based on the sentiment consistency level, using the exact format shown in the evaluation criteria.

Your response should follow this structure:

1. Provide a detailed explanation of the sentiment consistency level of the summary, including specific examples of accurately and inaccurately represented sentiments for different aspects.
2. Clearly state which sentiment consistency level this falls into, referencing the evaluation criteria.
3. Assign a single score based on the sentiment consistency level, using the exact format shown below.

Score format: Score- <score>X</score>
Where X is the assigned score (1, 2, 3, 4, or 5) based on the sentiment consistency levels in the evaluation criteria.

Remember, your final score must always be presented in this exact format, with no deviations. <|eot_id|><|start_header_id|>assistant<|end_header_id|>
'''

# Function to evaluate a single opinion summary
def evaluate_sentiment_consistency(product):
    """Return the sentiment-consistency prompt for one product record.

    Reads the "reviews", "averageRating" and "product_opinion_summary" keys
    of *product* and substitutes them into the module-level
    ``sentiment_consistency_prompt_template``. Despite the name, nothing is
    evaluated here: the function only formats and returns the prompt string.
    """
    review_items = product.get("reviews")
    # A missing or empty review list is rendered as the literal "N/A".
    if review_items:
        review_text = "\n".join(review_items)
    else:
        review_text = "N/A"

    fields = {
        "reviews": review_text,
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return sentiment_consistency_prompt_template.format(**fields)

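# Sampling parameters are not redefined here; the loop below reuses the `sampling_params` object created in the "Init vLLM" cell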



# Run the evaluation over `products` in fixed-size batches. This relies on
# `products`, `model`, `sampling_params` and `evaluate_sentiment_consistency`
# having been defined by earlier code.
batch_size = 5
evaluation_results = []

# Timer covers prompt building and generation; the file write below is excluded.
start_time = time.time()

# Ceiling division, used only for the progress message.
total_batches = (len(products) - 1) // batch_size + 1

for batch_number, offset in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[offset:offset + batch_size]

    # One prompt per product in the current slice.
    prompts = list(map(evaluate_sentiment_consistency, batch_products))

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # Results are matched to products by position; this assumes
    # model.generate returns one entry per prompt, in prompt order.
    for position, output_group in enumerate(outputs):
        product_title = batch_products[position].get("product_title", "")
        # Every sampled completion for this prompt becomes its own entry,
        # numbered from 1.
        evaluation_results.extend(
            f"**{product_title} - Evaluation {sample_number}**\n\nResponse: {completion.text}\n"
            for sample_number, completion in enumerate(output_group.outputs, start=1)
        )

    print(f"Processed batch {batch_number} of {total_batches}")

# Stop the timer and compute the elapsed time in seconds.
end_time = time.time()
execution_time = end_time - start_time

# Persist all collected responses, newline-separated, in a single write.
with open("v_sc_our_dep.txt", "w", encoding="utf-8") as out_file:
    out_file.write("\n".join(evaluation_results))

print(f"Total execution time: {execution_time:.2f} seconds")

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding; the results file written later in this
# cell is UTF-8 as well.
with open("z.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# System-role message for the sentiment-consistency judge. It is inlined into
# the prompt template below at the moment that f-string is evaluated.
system_msg = """You are a highly skilled expert in evaluating the sentiment consistency of product opinion summaries. Your expertise lies in analyzing summaries created from product reviews and ratings.

Your primary responsibilities are:

1. Carefully examine the provided product reviews and summary.
2. Meticulously follow all instructions in the prompt faithfully and truthfully.
3. Evaluate the summary's sentiment consistency with utmost accuracy and impartiality.
4. Assign a single score (1-5) based on the quality of sentiment consistency, adhering strictly to the given evaluation criteria.
5. Follow the specified format for all responses.

Your expert evaluation is crucial for maintaining the accuracy of sentiment representation in product summaries. Approach each evaluation with diligence and attention to detail.
"""

# Prompt template, filled in two stages:
#   1. This f-string is evaluated once, here: {system_msg} is substituted
#      immediately and each doubled brace pair collapses to a single-brace
#      placeholder ({reviews}, {average_rating}, {Product_Opinion_Summary}).
#   2. evaluate_sentiment_consistency() later fills those placeholders with
#      str.format() for each product.
# Because system_msg is inlined before str.format() runs, any literal "{" or
# "}" added to system_msg would be parsed as a format field in stage 2.
# NOTE(review): the template hard-codes <|begin_of_text|>; if the tokenizer
# also prepends a BOS token when the prompt string is tokenized, the token
# would appear twice - confirm against the vLLM/tokenizer settings in use.
# NOTE(review): the assistant header is followed by a single newline here;
# the published Llama 3 chat format appears to use a blank line after each
# header - confirm whether this difference is intentional.
sentiment_consistency_prompt_template = f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>

Task Description: 

You will be given product reviews and an average rating. Next, you will be provided with one summary created using this product information. Your task is to carefully follow each evaluation criterion and instruction and always provide a faithful, truthful, and accurate output in the specified format. You must evaluate and assign a single score ranging from 1 to 5, to the summary, according to the metric called sentiment consistency.

Evaluation Criteria:

Sentiment Consistency - Sentiment Consistency measures how accurately the summary reflects the consensus sentiment of users for each aspect of the product as expressed in the reviews. The consensus sentiment (or majority sentiment) for an aspect is determined by the most common sentiment expressed by users, categorized as very positive, positive, neutral, negative, or very negative.

<score>1</score> - Very Poor (Completely Inconsistent, 0-20% consistent)

- None of the aspects mentioned in the summary reflect the majority sentiment from the reviews.
- The summary completely misrepresents the overall sentiment of the product.
- Readers would get an entirely inaccurate impression of users' opinions.

<score>2</score> - Poor (Mostly Inconsistent, 21-50% consistent)

- Very few aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary largely misrepresents the sentiments expressed in the reviews.
- Readers would get a mostly inaccurate impression of users' opinions.

<score>3</score> - OK (Partially Consistent, 51-70% consistent)

- About half to two-thirds of the aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary partially represents the sentiments expressed in the reviews.
- Readers would get a mixed impression of users' opinions, with some accuracy.

<score>4</score> - Good (Mostly Consistent, 71-90% consistent)

- Most aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary largely represents the sentiments expressed in the reviews, with minor inconsistencies.
- Readers would get a mostly accurate impression of users' opinions.

<score>5</score> - Excellent (Completely Consistent, 91-100% consistent)

- All or nearly all aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary perfectly represents the sentiments expressed in the reviews.
- Readers would get a fully accurate impression of users' opinions.


Product Reviews: {{reviews}}

Average Rating: {{average_rating}}

Summary to Evaluate: {{Product_Opinion_Summary}}

Instructions:

Follow these steps strictly:

Step 1. Identify all aspects of the product mentioned in the reviews and determine the majority sentiment for each aspect. List these with numbering.
Step 2. Identify all aspects and their associated sentiments mentioned in the summary. List these with numbering.
Step 3. Compare the aspects and sentiments from steps 1 and 2. Identify which aspects in the summary accurately reflect the majority sentiment from the reviews. List these with numbering.
Step 4. Identify any aspects in the summary that misrepresent or fail to accurately convey the majority sentiment from the reviews. List these with numbering.
Step 5. Calculate the percentage of aspects in the summary that accurately reflect the majority sentiment from the reviews.
Step 6. Carefully match the observed sentiment consistency level to the descriptions in the evaluation criteria:

   - Completely Inconsistent (0-20% consistent): Score 1 (Very Poor)
   - Mostly Inconsistent (21-50% consistent): Score 2 (Poor)
   - Partially Consistent (51-70% consistent): Score 3 (OK)
   - Mostly Consistent (71-90% consistent): Score 4 (Good)
   - Completely Consistent (91-100% consistent): Score 5 (Excellent)

Step 7. Assign a score based on the sentiment consistency level, using the exact format shown in the evaluation criteria.

Your response should follow this structure:

1. Provide a detailed explanation of the sentiment consistency level of the summary, including specific examples of accurately and inaccurately represented sentiments for different aspects.
2. Clearly state which sentiment consistency level this falls into, referencing the evaluation criteria.
3. Assign a single score based on the sentiment consistency level, using the exact format shown below.

Score format: Score- <score>X</score>
Where X is the assigned score (1, 2, 3, 4, or 5) based on the sentiment consistency levels in the evaluation criteria.

Remember, your final score must always be presented in this exact format, with no deviations. <|eot_id|><|start_header_id|>assistant<|end_header_id|>
'''

# Build the sentiment-consistency evaluation prompt for a single product (no model call happens here)
def evaluate_sentiment_consistency(product):
    """Return the sentiment-consistency prompt for one product record.

    Reads the "reviews", "averageRating" and "product_opinion_summary" keys
    of *product* and substitutes them into the module-level
    ``sentiment_consistency_prompt_template``. Despite the name, nothing is
    evaluated here: the function only formats and returns the prompt string.
    """
    review_items = product.get("reviews")
    # A missing or empty review list is rendered as the literal "N/A".
    if review_items:
        review_text = "\n".join(review_items)
    else:
        review_text = "N/A"

    fields = {
        "reviews": review_text,
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return sentiment_consistency_prompt_template.format(**fields)

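# Sampling parameters are not redefined here; the loop below reuses the `sampling_params` object created in the "Init vLLM" cell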



# Run the evaluation over `products` in fixed-size batches. This relies on
# `products`, `model`, `sampling_params` and `evaluate_sentiment_consistency`
# having been defined by earlier code.
batch_size = 5
evaluation_results = []

# Timer covers prompt building and generation; the file write below is excluded.
start_time = time.time()

# Ceiling division, used only for the progress message.
total_batches = (len(products) - 1) // batch_size + 1

for batch_number, offset in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[offset:offset + batch_size]

    # One prompt per product in the current slice.
    prompts = list(map(evaluate_sentiment_consistency, batch_products))

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # Results are matched to products by position; this assumes
    # model.generate returns one entry per prompt, in prompt order.
    for position, output_group in enumerate(outputs):
        product_title = batch_products[position].get("product_title", "")
        # Every sampled completion for this prompt becomes its own entry,
        # numbered from 1.
        evaluation_results.extend(
            f"**{product_title} - Evaluation {sample_number}**\n\nResponse: {completion.text}\n"
            for sample_number, completion in enumerate(output_group.outputs, start=1)
        )

    print(f"Processed batch {batch_number} of {total_batches}")

# Stop the timer and compute the elapsed time in seconds.
end_time = time.time()
execution_time = end_time - start_time

# Persist all collected responses, newline-separated, in a single write.
with open("z_sc_our_dep.txt", "w", encoding="utf-8") as out_file:
    out_file.write("\n".join(evaluation_results))

print(f"Total execution time: {execution_time:.2f} seconds")

In [None]:

# Load the JSON file containing the product information and opinion summaries.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding; the results file written later in this
# cell is UTF-8 as well.
with open("4o.json", "r", encoding="utf-8") as file:
    products = json.load(file)

# System-role message for the sentiment-consistency judge. It is inlined into
# the prompt template below at the moment that f-string is evaluated.
system_msg = """You are a highly skilled expert in evaluating the sentiment consistency of product opinion summaries. Your expertise lies in analyzing summaries created from product reviews and ratings.

Your primary responsibilities are:

1. Carefully examine the provided product reviews and summary.
2. Meticulously follow all instructions in the prompt faithfully and truthfully.
3. Evaluate the summary's sentiment consistency with utmost accuracy and impartiality.
4. Assign a single score (1-5) based on the quality of sentiment consistency, adhering strictly to the given evaluation criteria.
5. Follow the specified format for all responses.

Your expert evaluation is crucial for maintaining the accuracy of sentiment representation in product summaries. Approach each evaluation with diligence and attention to detail.
"""

# Prompt template, filled in two stages:
#   1. This f-string is evaluated once, here: {system_msg} is substituted
#      immediately and each doubled brace pair collapses to a single-brace
#      placeholder ({reviews}, {average_rating}, {Product_Opinion_Summary}).
#   2. evaluate_sentiment_consistency() later fills those placeholders with
#      str.format() for each product.
# Because system_msg is inlined before str.format() runs, any literal "{" or
# "}" added to system_msg would be parsed as a format field in stage 2.
# NOTE(review): the template hard-codes <|begin_of_text|>; if the tokenizer
# also prepends a BOS token when the prompt string is tokenized, the token
# would appear twice - confirm against the vLLM/tokenizer settings in use.
# NOTE(review): the assistant header is followed by a single newline here;
# the published Llama 3 chat format appears to use a blank line after each
# header - confirm whether this difference is intentional.
sentiment_consistency_prompt_template = f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>

Task Description: 

You will be given product reviews and an average rating. Next, you will be provided with one summary created using this product information. Your task is to carefully follow each evaluation criterion and instruction and always provide a faithful, truthful, and accurate output in the specified format. You must evaluate and assign a single score ranging from 1 to 5, to the summary, according to the metric called sentiment consistency.

Evaluation Criteria:

Sentiment Consistency - Sentiment Consistency measures how accurately the summary reflects the consensus sentiment of users for each aspect of the product as expressed in the reviews. The consensus sentiment (or majority sentiment) for an aspect is determined by the most common sentiment expressed by users, categorized as very positive, positive, neutral, negative, or very negative.

<score>1</score> - Very Poor (Completely Inconsistent, 0-20% consistent)

- None of the aspects mentioned in the summary reflect the majority sentiment from the reviews.
- The summary completely misrepresents the overall sentiment of the product.
- Readers would get an entirely inaccurate impression of users' opinions.

<score>2</score> - Poor (Mostly Inconsistent, 21-50% consistent)

- Very few aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary largely misrepresents the sentiments expressed in the reviews.
- Readers would get a mostly inaccurate impression of users' opinions.

<score>3</score> - OK (Partially Consistent, 51-70% consistent)

- About half to two-thirds of the aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary partially represents the sentiments expressed in the reviews.
- Readers would get a mixed impression of users' opinions, with some accuracy.

<score>4</score> - Good (Mostly Consistent, 71-90% consistent)

- Most aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary largely represents the sentiments expressed in the reviews, with minor inconsistencies.
- Readers would get a mostly accurate impression of users' opinions.

<score>5</score> - Excellent (Completely Consistent, 91-100% consistent)

- All or nearly all aspects in the summary accurately reflect the majority sentiment from the reviews.
- The summary perfectly represents the sentiments expressed in the reviews.
- Readers would get a fully accurate impression of users' opinions.


Product Reviews: {{reviews}}

Average Rating: {{average_rating}}

Summary to Evaluate: {{Product_Opinion_Summary}}

Instructions:

Follow these steps strictly:

Step 1. Identify all aspects of the product mentioned in the reviews and determine the majority sentiment for each aspect. List these with numbering.
Step 2. Identify all aspects and their associated sentiments mentioned in the summary. List these with numbering.
Step 3. Compare the aspects and sentiments from steps 1 and 2. Identify which aspects in the summary accurately reflect the majority sentiment from the reviews. List these with numbering.
Step 4. Identify any aspects in the summary that misrepresent or fail to accurately convey the majority sentiment from the reviews. List these with numbering.
Step 5. Calculate the percentage of aspects in the summary that accurately reflect the majority sentiment from the reviews.
Step 6. Carefully match the observed sentiment consistency level to the descriptions in the evaluation criteria:

   - Completely Inconsistent (0-20% consistent): Score 1 (Very Poor)
   - Mostly Inconsistent (21-50% consistent): Score 2 (Poor)
   - Partially Consistent (51-70% consistent): Score 3 (OK)
   - Mostly Consistent (71-90% consistent): Score 4 (Good)
   - Completely Consistent (91-100% consistent): Score 5 (Excellent)

Step 7. Assign a score based on the sentiment consistency level, using the exact format shown in the evaluation criteria.

Your response should follow this structure:

1. Provide a detailed explanation of the sentiment consistency level of the summary, including specific examples of accurately and inaccurately represented sentiments for different aspects.
2. Clearly state which sentiment consistency level this falls into, referencing the evaluation criteria.
3. Assign a single score based on the sentiment consistency level, using the exact format shown below.

Score format: Score- <score>X</score>
Where X is the assigned score (1, 2, 3, 4, or 5) based on the sentiment consistency levels in the evaluation criteria.

Remember, your final score must always be presented in this exact format, with no deviations. <|eot_id|><|start_header_id|>assistant<|end_header_id|>
'''

# Build the sentiment-consistency evaluation prompt for a single product (no model call happens here)
def evaluate_sentiment_consistency(product):
    """Return the sentiment-consistency prompt for one product record.

    Reads the "reviews", "averageRating" and "product_opinion_summary" keys
    of *product* and substitutes them into the module-level
    ``sentiment_consistency_prompt_template``. Despite the name, nothing is
    evaluated here: the function only formats and returns the prompt string.
    """
    review_items = product.get("reviews")
    # A missing or empty review list is rendered as the literal "N/A".
    if review_items:
        review_text = "\n".join(review_items)
    else:
        review_text = "N/A"

    fields = {
        "reviews": review_text,
        "average_rating": product.get("averageRating", "N/A"),
        "Product_Opinion_Summary": product.get("product_opinion_summary", ""),
    }
    return sentiment_consistency_prompt_template.format(**fields)

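# Sampling parameters are not redefined here; the loop below reuses the `sampling_params` object created in the "Init vLLM" cell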



# Run the evaluation over `products` in fixed-size batches. This relies on
# `products`, `model`, `sampling_params` and `evaluate_sentiment_consistency`
# having been defined by earlier code.
batch_size = 5
evaluation_results = []

# Timer covers prompt building and generation; the file write below is excluded.
start_time = time.time()

# Ceiling division, used only for the progress message.
total_batches = (len(products) - 1) // batch_size + 1

for batch_number, offset in enumerate(range(0, len(products), batch_size), start=1):
    batch_products = products[offset:offset + batch_size]

    # One prompt per product in the current slice.
    prompts = list(map(evaluate_sentiment_consistency, batch_products))

    # Generate outputs using vLLM
    outputs = model.generate(prompts, sampling_params)

    # Results are matched to products by position; this assumes
    # model.generate returns one entry per prompt, in prompt order.
    for position, output_group in enumerate(outputs):
        product_title = batch_products[position].get("product_title", "")
        # Every sampled completion for this prompt becomes its own entry,
        # numbered from 1.
        evaluation_results.extend(
            f"**{product_title} - Evaluation {sample_number}**\n\nResponse: {completion.text}\n"
            for sample_number, completion in enumerate(output_group.outputs, start=1)
        )

    print(f"Processed batch {batch_number} of {total_batches}")

# Stop the timer and compute the elapsed time in seconds.
end_time = time.time()
execution_time = end_time - start_time

# Persist all collected responses, newline-separated, in a single write.
with open("4o_sc_our_dep.txt", "w", encoding="utf-8") as out_file:
    out_file.write("\n".join(evaluation_results))

print(f"Total execution time: {execution_time:.2f} seconds")