In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm
import pandas as pd
import torch


# Register tqdm's pandas integration so DataFrames expose progress_apply.
tqdm.pandas()

# Run on the GPU when torch can see one, otherwise stay on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

# A generative seq2seq model is driven with custom prompts that depend on the
# claim type; model and tokenizer are loaded from the same checkpoint name.
MODEL_NAME = "google/flan-t5-large"
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model.to(device)

In [None]:
from datasets import Dataset


# NOTE(review): train_df is read from the *test* split file — confirm that this
# is intended, since everything downstream labels it as "train".
train_df = pd.read_json("test_claims_quantemp.json")
val_df = pd.read_json("val_claims_quantemp.json")


def _to_type_claim_frame(raw_df):
    """Return a new frame holding only 'type' (from 'taxonomy_label') and 'claim'."""
    records = raw_df.to_dict(orient='records')
    projected = [{'type': rec['taxonomy_label'], 'claim': rec['claim']} for rec in records]
    return pd.DataFrame(projected)


train_df = _to_type_claim_frame(train_df)
val_df = _to_type_claim_frame(val_df)

# The following code is inspired by the ProgramFC project. It is a simplified version that builds a decomposition prompt based on the claim type.
# Sources: https://github.com/mbzuai-nlp/ProgramFC/blob/main/models/prompts.py
#          https://aclanthology.org/2023.acl-long.386.pdf
# GitHub Copilot was used to assist in generating the prompts.


# Shared few-shot preamble that is prepended to every type-specific prompt.
# This is a plain string that is never %-formatted, so percent signs are
# written as a single "%"; doubling them would put a literal "%%" into the text
# the model reads. The explicit "\n" escapes add a blank line after each
# numbered result in the rendered prompt.
main_prompt = '''The task is to break down a given claim into its constituent statements. Here are some examples to illustrate the logic:

Example 1:
Claim:
"A patent for coronavirus was granted in 2018 to the Pirbright Institute UK, founded by Bill and Melinda Gates."
Results: 
1. "A patent for coronavirus was granted in 2018."\n
2. "The patent was granted to the Pirbright Institute UK."\n
3. "The Pirbright Institute UK was founded by Bill and Melinda Gates."\n

Example 2:
Claim:
"A video that went viral in July 2023 authentically depicted an ad in Japan that read \"Stop Zelenskyy, Stop War.\"" 
Results: 
1. "A video showed an ad in Japan that read \"Stop Zelenskyy, Stop War.\""\n
2. "The video went viral in July 2023."\n
3. "The video was authentic."\n

Example 4:
Claim:
"Arnold Schwarzenegger, born in 1947 in Austria, served as Republican Governor of California after a career as a bodybuilder and actor."
Results: 
1. "Arnold Schwarzenegger was born in 1947 in Austria."\n
2. "Arnold Schwarzenegger served as Republican Governor of California."\n
3. "Arnold Schwarzenegger had a career as a bodybuilder and actor."\n
4. "Arnold Schwarzenegger's career as a bodybuilder and actor preceded his tenure as Governor of California."\n

Example 7:
Claim: 
"Switching to a plant-based diet can help reduce the risk of heart disease, diabetes, and cancer by up to 50%."
Results:
1. "Switching to a plant-based diet can help reduce the risk of heart disease by up to 50%."\n
2. "Switching to a plant-based diet can help reduce the risk of diabetes by up to 50%."\n
3. "Switching to a plant-based diet can help reduce the risk of cancer by up to 50%."\n

'''

def add_decomp_prompt(claim, claim_type):
    """Build the decomposition prompt for a single claim.

    For the claim types "comparison", "interval", "statistical" and "temporal"
    the result is the shared ``main_prompt`` preamble followed by a few
    type-specific examples and the claim itself. Any other ``claim_type``
    yields a short generic instruction without examples.

    Percent signs in the examples are written as a single "%": these are
    f-strings, not %-format strings, so "%%" would appear verbatim in the
    prompt.

    Args:
        claim: Claim text to interpolate into the prompt.
        claim_type: Taxonomy label that selects the type-specific examples.

    Returns:
        The full prompt string.
    """
    if claim_type == "comparison":
        return main_prompt + f'''
        Example 13:
        Claim: "Pepsi is preferred to Coke in blind taste tests, despite Coke being regarded as the more successful brand."
        Results: 
        1. "Blind tests have been conducted comparing Pepsi and Coke."\n
        2. "Pepsi is preferred to Coke in blind taste tests."\n
        3. "Coke is regarded as the more successful brand."\n
        
        Example 10:
        Claim:
        "Studies have shown that the average global temperature has increased by 1.2 degrees Celsius since the pre-industrial era."
        Results:
        1. "Studies have shown that the average global temperature has increased."\n
        2. "The increase is by 1.2 degrees Celsius."\n
        3. "The increase is since the pre-industrial era."\n
        
        Example 11:
        Claim:
        "According to recent polls, more Americans support the legalization of marijuana than oppose it. This is a significant shift from previous years."
        Results:
        1. "There have been recent polls on the legalization of marijuana."\n
        2. "Recent polls show that the majority of Americans support the legalization of marijuana."\n
        3. "The majority of Americans did not support the legalization of marijuana in previous years."\n
        
        ---
        Now, break down the following comparison claim into its smallest components, ensuring there are no duplicates: 
        
        Claim: {claim} 
        Results:
        '''
    if claim_type == "interval":
        return main_prompt + f'''
        Example 15:
        Claim:
        "An image of a red sky in Beijing was taken on March 23, 2023, during a sandstorm."\n
        Results: 
        1. "An image of a red sky in Beijing was taken on March 23, 2023."\n
        2. "There was a sandstorm in Beijing on March 23, 2023."\n
        3. "The image was taken during a sandstorm." \n
        
        Example 8:
        Claim:
        "The stock market crashed in 1929, leading to the Great Depression."
        Results:
        1. "The stock market crashed in 1929."\n
        2. "The stock market crash led to the Great Depression."\n
        3. "The Great Depression followed the stock market crash of 1929."\n

        ---
        Now, break down the following interval claim into its smallest components: 
        
        Claim: {claim} 
        Results:
        '''
    if claim_type == "statistical":
        return main_prompt + f'''
        
        Example 9:
        Claim:
        "Vaccines have been shown to reduce the risk of severe COVID-19 by 90%."
        Results:
        1. "Vaccines have been shown to reduce the risk of severe COVID-19."\n
        2. "The reduction in risk is by 90%."\n
        3. "The reduction in risk is for severe COVID-19."\n
        
        Example 6:
        Claim:
        "Police-recorded crimes against property in the EU increased in 2022: thefts rose by 17.9%, robberies by 9.7% and burglaries by 7.4% compared with the previous year."
        Results:
        1. "Police-recorded crimes against property in the EU increased in 2022."\n
        2. "Thefts rose by 17.9% compared to 2021."\n
        3. "Robberies rose by 9.7% compared to 2021."\n
        4. "Burglaries rose by 7.4% compared to 2021."\n
        
        Example 12:
        Claim:
        "President Bolsonaro is facing criticism for deforestation in the Amazon, which has increased by 25% since he took office."
        Results:
        1. "President Bolsonaro is facing criticism for deforestation in the Amazon."\n
        2. "Deforestation in the Amazon has increased by 25%."\n
        3. "The increase in deforestation is since President Bolsonaro took office."\n

        ---
        Now, break down the following statistical claim into its smallest components: 
        
        Claim: {claim} 
        Results:
        '''
    if claim_type == "temporal":
        return main_prompt + f'''
        Example 5:
        Claim:
        "In 2005, an estimated 1.5 million people from Alabama, Mississippi, and Louisiana fled their homes in the face of Hurricane Katrina."
        Results:
        1. "In 2005, Hurricane Katrina struck Alabama, Mississippi, and Louisiana."\n
        2. "At least 1.5 million fled their homes in the face of Hurricane Katrina."\n
        3. "An estimated 1.5 million people who fled their homes were from Alabama, Mississippi, and Louisiana."\n
        
        Example 3:
        Claim:
        "The 2022 Winter Olympics in Beijing were the first to feature a unified Korean team."
        Results: 
        1. "The 2022 Winter Olympics were held in Beijing."\n
        2. "The 2022 Winter Olympics featured a unified Korean team."\n
        3. "The 2022 unified Korean team was the first in Olympic history."\n
        
        ---
        Now, break down the following temporal claim into its smallest components, ensuring there are no duplicates: 
        
        Claim: {claim} 
        Results:
        '''
    # Unknown or missing claim type: fall back to a generic instruction.
    return f"Decompose the following claim: {claim} "

def planner(df):
    """Attach a 'prompt' column built from each row's 'claim' and 'type'.

    The frame is modified in place (the new column is assigned onto ``df``)
    and the same object is returned. Rows are processed with
    ``progress_apply`` so a progress bar is shown.
    """
    def build_prompt(row):
        return add_decomp_prompt(row['claim'], row['type'])

    prompts = df.progress_apply(build_prompt, axis=1)
    df['prompt'] = prompts
    return df

# Add the prompt column to both splits (train first, then validation).
train_df, val_df = planner(train_df), planner(val_df)

# Show the prompt stored under index label 0 as a quick sanity check.
first_prompt = train_df['prompt'].head()[0]
print(first_prompt)

100%|██████████| 2495/2495 [00:00<00:00, 35644.11it/s]
100%|██████████| 3084/3084 [00:00<00:00, 21055.50it/s]

The task is to break down a given claim into its constituent statements. Here are some examples to illustrate the logic:

Example 1:
Claim:
"A patent for coronavirus was granted in 2018 to the Pirbright Institute UK, founded by Bill and Melinda Gates."
Results: 
1. "A patent for coronavirus was granted in 2018."

2. "The patent was granted to the Pirbright Institute UK."

3. "The Pirbright Institute UK was founded by Bill and Melinda Gates."


Example 2:
Claim:
"A video that went viral in July 2023 authentically depicted an ad in Japan that read "Stop Zelenskyy, Stop War."" 
Results: 
1. "A video showed an ad in Japan that read "Stop Zelenskyy, Stop War.""

2. "The video went viral in July 2023."

3. "The video was authentic."


Example 4:
Claim:
"Arnold Schwarzenegger, born in 1947 in Austria, served as Republican Governor of California after a career as a bodybuilder and actor."
Results: 
1. "Arnold Schwarzenegger was born in 1947 in Austria."

2. "Arnold Schwarzenegger served as Rep




In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# Both splits are tokenized with identical settings. Truncation is disabled,
# so each prompt is kept whole (the claim to decompose sits at its very end).
# NOTE(review): with truncation off and padding=True, max_length is presumably
# not enforced here — confirm against the tokenizer documentation.
tokenize_kwargs = dict(padding=True, max_length=1024, truncation=False, return_tensors="pt")
train_tokenized = tokenizer(train_df['prompt'].tolist(), **tokenize_kwargs)
val_tokenized = tokenizer(val_df['prompt'].tolist(), **tokenize_kwargs)

# Decompose claims after tokenization
def decompose_claims(input_ids, attention_mask, claims=None, max_length=1024, num_beams=5):
    """Generate one decomposition string per tokenized prompt.

    Each row of ``input_ids`` / ``attention_mask`` is moved to ``device`` and
    passed to ``model.generate`` on its own, under ``torch.no_grad()``. The
    first returned sequence is decoded with
    ``tokenizer.decode(..., skip_special_tokens=True)``.

    Args:
        input_ids: Tensor of prompt token ids; its first dimension indexes
            the prompts.
        attention_mask: Tensor aligned with ``input_ids``.
        claims: Optional sequence with the original claim text for each row,
            in row order. When given, each claim is echoed next to its
            decomposition. When omitted, only the decomposition is printed;
            the echo never reads a global DataFrame, so the function works
            for any split regardless of its size.
        max_length: Forwarded to ``model.generate``.
        num_beams: Forwarded to ``model.generate``.

    Returns:
        List of decoded decomposition strings, one per input row.

    Raises:
        ValueError: If ``claims`` is given but its length differs from the
            number of input rows.
    """
    total = input_ids.size(0)
    if claims is not None:
        # Materialize so positional indexing works for any iterable
        # (e.g. a pandas Series with a non-default index).
        claims = list(claims)
        if len(claims) != total:
            raise ValueError(
                f"claims has {len(claims)} entries but input_ids has {total} rows"
            )

    results = []
    with torch.no_grad():
        for i in tqdm(range(total), desc="Generating Decompositions"):
            outputs = model.generate(
                input_ids=input_ids[i:i+1].to(device),
                attention_mask=attention_mask[i:i+1].to(device),
                max_length=max_length,
                num_beams=num_beams,
            )
            decomposed_claim = tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Print initial claim (when available) and decomposition
            if claims is not None:
                print(f"Initial Claim {i+1}/{total}: {claims[i]}")
            print(f"Decomposed Claim {i+1}/{total}: {decomposed_claim}")
            results.append(decomposed_claim)
    return results

train_outputs = decompose_claims(train_tokenized.input_ids, train_tokenized.attention_mask)
val_outputs = decompose_claims(val_tokenized.input_ids, val_tokenized.attention_mask)

# decompose_claims already returns decoded plain-text strings, so they are used
# as-is. They must not be passed through tokenizer.decode again: that call
# expects token ids, not text.
train_decompositions = list(train_outputs)
val_decompositions = list(val_outputs)

# Save results to CSV
train_df['decomposition'] = train_decompositions
val_df['decomposition'] = val_decompositions
train_df.to_csv("train_decompositions.csv", index=False)
val_df.to_csv("val_decompositions.csv", index=False)

# Print sample output
print(train_df.head())
print(val_df.head())

Generating Decompositions:   0%|          | 0/2495 [00:00<?, ?it/s]