In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm
import pandas as pd
import torch


# Register tqdm's pandas integration; `planner` relies on `progress_apply`.
tqdm.pandas()

# Prefer the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# One generative checkpoint is used for every claim type; only the prompt
# changes. The model and its tokenizer are loaded from the same name.
checkpoint = "google/flan-t5-large"
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
# Left as the final expression of the cell so the notebook still echoes the
# module summary after moving the weights to `device`.
model.to(device)

cuda


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

In [5]:
from datasets import Dataset


def _load_claims(path):
    """Read a claims JSON file and keep only the label and the claim text.

    The result has exactly two columns: ``type`` (taken from the file's
    ``taxonomy_label`` field) and ``claim``.
    """
    records = pd.read_json(path).to_dict(orient='records')
    slim_rows = [
        {'type': record['taxonomy_label'], 'claim': record['claim']}
        for record in records
    ]
    return pd.DataFrame(slim_rows)


# NOTE(review): `train_df` is populated from the *test* split file. Every
# later "train" artefact therefore holds test claims -- confirm this is
# intended before relying on the names.
train_df = _load_claims("test_claims_quantemp.json")
val_df = _load_claims("val_claims_quantemp.json")

# The following code is inspired by the ProgramFC project. We have implemented a simplified version that generates prompts based on task type.
# sources: https://github.com/mbzuai-nlp/ProgramFC/blob/main/models/prompts.py
# https://aclanthology.org/2023.acl-long.386.pdf

# Few-shot preamble shared by every typed prompt. The example numbers are not
# consecutive; they are kept as-is so the prompt text stays stable.
# Percent signs are written as a single '%': none of these templates goes
# through '%'-formatting, so a doubled '%%' would reach the model literally.
main_prompt = '''The task is to break down a given claim into its constituent statements. Here are some examples to illustrate the logic:

Example 1:
Claim:
"A patent for coronavirus was granted in 2018 to the Pirbright Institute UK, founded by Bill and Melinda Gates."
Results: 
1) "A patent for coronavirus was granted in 2018."
2) "The patent was granted to the Pirbright Institute UK."
3) "The Pirbright Institute UK was founded by Bill and Melinda Gates."

Example 2:
Claim:
"A video that went viral in July 2023 authentically depicted an ad in Japan that read \"Stop Zelenskyy, Stop War.\"" 
Results: 
1) "A video showed an ad in Japan that read \"Stop Zelenskyy, Stop War.\""
2) "The video went viral in July 2023."
3) "The video was authentic."

Example 4:
Claim:
"Arnold Schwarzenegger, born in 1947 in Austria, served as Republican Governor of California after a career as a bodybuilder and actor."
Results: 
1) "Arnold Schwarzenegger was born in 1947 in Austria."
2) "Arnold Schwarzenegger served as Republican Governor of California."
3) "Arnold Schwarzenegger had a career as a bodybuilder and actor."
4) "Arnold Schwarzenegger's career as a bodybuilder and actor preceded his tenure as Governor of California."

Example 7:
Claim: 
"Switching to a plant-based diet can help reduce the risk of heart disease, diabetes, and cancer by up to 50%."
Results:
1) "Switching to a plant-based diet can help reduce the risk of heart disease by up to 50%."
2) "Switching to a plant-based diet can help reduce the risk of diabetes by up to 50%."
3) "Switching to a plant-based diet can help reduce the risk of cancer by up to 50%."

'''

def add_decomp_prompt(claim, claim_type):
    """Build the decomposition prompt for one claim.

    Args:
        claim: The claim text, inserted verbatim into the prompt.
        claim_type: Taxonomy label of the claim. ``"comparison"``,
            ``"interval"``, ``"statistical"`` and ``"temporal"`` each get
            ``main_prompt`` followed by type-specific examples and the
            instruction; any other value gets a short generic instruction
            without examples.

    Returns:
        The full prompt string to feed to the model.
    """
    if claim_type == "comparison":
        return main_prompt + f'''
        Example 13:
        Claim: "Pepsi is preferred to Coke in blind taste tests, despite Coke being regarded as the more successful brand."
        Results: 
        1) "Blind tests have been conducted comparing Pepsi and Coke."
        2) "Pepsi is preferred to Coke in blind taste tests."
        3) "Coke is regarded as the more successful brand."
        
        Example 10:
        Claim:
        "Studies have shown that the average global temperature has increased by 1.2 degrees Celsius since the pre-industrial era."
        Results:
        1) "Studies have shown that the average global temperature has increased."
        2) "The increase is by 1.2 degrees Celsius."
        3) "The increase is since the pre-industrial era."
        
        Example 11:
        Claim:
        "According to recent polls, more Americans support the legalization of marijuana than oppose it. This is a significant shift from previous years."
        Results:
        1) "There have been recent polls on the legalization of marijuana."
        2) "Recent polls show that the majority of Americans support the legalization of marijuana."
        3) "The majority of Americans did not support the legalization of marijuana in previous years."
        
        ---
        Now, break down the following comparison claim into its smallest components: 
        
        Claim: {claim} 
        Results:
        '''
    if claim_type == "interval":
        return main_prompt + f'''
        Example 15:
        Claim:
        "An image of a red sky in Beijing was taken on March 23, 2023, during a sandstorm."
        Results: 
        1) "An image of a red sky in Beijing was taken on March 23, 2023."
        2) "There was a sandstorm in Beijing on March 23, 2023."
        3) "The image was taken during a sandstorm." 
        
        Example 8:
        Claim:
        "The stock market crashed in 1929, leading to the Great Depression."
        Results:
        1) "The stock market crashed in 1929."
        2) "The stock market crash led to the Great Depression."
        3) "The Great Depression followed the stock market crash of 1929."

        ---
        Now, break down the following interval claim into its smallest components: 
        
        Claim: {claim} 
        Results:
        '''
    if claim_type == "statistical":
        return main_prompt + f'''
        
        Example 9:
        Claim:
        "Vaccines have been shown to reduce the risk of severe COVID-19 by 90%."
        Results:
        1) "Vaccines have been shown to reduce the risk of severe COVID-19."
        2) "The reduction in risk is by 90%."
        3) "The reduction in risk is for severe COVID-19."
        
        Example 6:
        Claim:
        "Police-recorded crimes against property in the EU increased in 2022: thefts rose by 17.9%, robberies by 9.7% and burglaries by 7.4% compared with the previous year."
        Results:
        1) "Police-recorded crimes against property in the EU increased in 2022."
        2) "Thefts rose by 17.9% compared to 2021."
        3) "Robberies rose by 9.7% compared to 2021."
        4) "Burglaries rose by 7.4% compared to 2021."
        
        Example 12:
        Claim:
        "President Bolsonaro is facing criticism for deforestation in the Amazon, which has increased by 25% since he took office."
        Results:
        1) "President Bolsonaro is facing criticism for deforestation in the Amazon."
        2) "Deforestation in the Amazon has increased by 25%."
        3) "The increase in deforestation is since President Bolsonaro took office."

        ---
        Now, break down the following statistical claim into its smallest components: 
        
        Claim: {claim} 
        Results:
        '''
    if claim_type == "temporal":
        return main_prompt + f'''
        Example 5:
        Claim:
        "In 2005, an estimated 1.5 million people from Alabama, Mississippi, and Louisiana fled their homes in the face of Hurricane Katrina."
        Results:
        1) "In 2005, Hurricane Katrina struck Alabama, Mississippi, and Louisiana."
        2) "At least 1.5 million fled their homes in the face of Hurricane Katrina."
        3) "An estimated 1.5 million people who fled their homes were from Alabama, Mississippi, and Louisiana."
        
        Example 3:
        Claim:
        "The 2022 Winter Olympics in Beijing were the first to feature a unified Korean team."
        Results: 
        1) "The 2022 Winter Olympics were held in Beijing."
        2) "The 2022 Winter Olympics featured a unified Korean team."
        3) "The 2022 unified Korean team was the first in Olympic history."
        
        ---
        Now, break down the following temporal claim into its smallest components: 
        
        Claim: {claim} 
        Results:
        '''
    # Unknown or missing claim type: no few-shot examples, generic instruction.
    return f"Decompose the following claim: {claim} "

def planner(df):
    """Add a ``prompt`` column holding the decomposition prompt for each row.

    Each row's ``claim`` and ``type`` values are passed to
    ``add_decomp_prompt``. The frame is modified in place and also returned.
    Uses ``progress_apply`` so the pass is shown with a tqdm progress bar.
    """
    def _prompt_for(row):
        return add_decomp_prompt(row['claim'], row['type'])

    df['prompt'] = df.progress_apply(_prompt_for, axis=1)
    return df

# Build the prompt column for both splits (train first, then validation).
train_df, val_df = planner(train_df), planner(val_df)

# Sanity check: print the complete prompt of the row labelled 0.
leading_prompts = train_df['prompt'].head()
print(leading_prompts[0])

100%|██████████| 2495/2495 [00:00<00:00, 36694.35it/s]
100%|██████████| 3084/3084 [00:00<00:00, 31465.39it/s]

The task is to break down a given claim into its constituent statements. Here are some examples to illustrate the logic:

Example 1:
Claim:
"A patent for coronavirus was granted in 2018 to the Pirbright Institute UK, founded by Bill and Melinda Gates."
Results: 
1) "A patent for coronavirus was granted in 2018."
2) "The patent was granted to the Pirbright Institute UK."
3) "The Pirbright Institute UK was founded by Bill and Melinda Gates."

Example 2:
Claim:
"A video that went viral in July 2023 authentically depicted an ad in Japan that read "Stop Zelenskyy, Stop War."" 
Results: 
1) "A video showed an ad in Japan that read "Stop Zelenskyy, Stop War.""
2) "The video went viral in July 2023."
3) "The video was authentic."

Example 4:
Claim:
"Arnold Schwarzenegger, born in 1947 in Austria, served as Republican Governor of California after a career as a bodybuilder and actor."
Results: 
1) "Arnold Schwarzenegger was born in 1947 in Austria."
2) "Arnold Schwarzenegger served as Republican




In [6]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# Both splits are tokenized with the same settings: pad every prompt to the
# longest one in its split and return PyTorch tensors.
# NOTE(review): `max_length` is passed together with `truncation=False`;
# presumably it has no effect in that combination -- confirm against the
# tokenizer documentation before relying on a 1024-token cap.
tokenizer_options = dict(padding=True, max_length=1024, truncation=False, return_tensors="pt")

train_prompts = train_df['prompt'].tolist()
train_tokenized = tokenizer(train_prompts, **tokenizer_options)

val_prompts = val_df['prompt'].tolist()
val_tokenized = tokenizer(val_prompts, **tokenizer_options)

# Decompose claims after tokenization
def decompose_claims(input_ids, attention_mask, claims=None):
    """Generate one decomposition per tokenized prompt, a row at a time.

    Uses the module-level ``model``, ``tokenizer`` and ``device``.

    Args:
        input_ids: Tensor of prompt token ids; its first dimension indexes
            the prompts.
        attention_mask: Attention mask sliced in step with ``input_ids``.
        claims: Optional sequence of the original claim texts, aligned with
            the rows of ``input_ids``. Used only for the progress log; when
            omitted, only the generated text is printed.

    Returns:
        A list with one decoded string (special tokens stripped) per row.

    Raises:
        ValueError: If ``claims`` is given but its length does not match the
            number of rows in ``input_ids``.
    """
    total = input_ids.size(0)
    if claims is not None:
        # Materialise to a plain list so lookup is positional, independent of
        # any pandas index labels on the caller's Series.
        claims = list(claims)
        if len(claims) != total:
            raise ValueError(
                f"claims has {len(claims)} entries but input_ids has {total} rows"
            )

    results = []
    with torch.no_grad():
        for i in tqdm(range(total), desc="Generating Decompositions"):
            outputs = model.generate(
                input_ids=input_ids[i:i+1].to(device),
                attention_mask=attention_mask[i:i+1].to(device),
                max_length=512,
                # NOTE(review): early_stopping presumably only matters for
                # beam search; kept so generation settings are unchanged.
                early_stopping=True,
                no_repeat_ngram_size=20,
            )
            decomposed_claim = tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Log the claim that belongs to *this* dataset, not always the
            # training frame (which is also shorter than the validation one).
            if claims is not None:
                print(f"Initial Claim {i+1}/{total}: {claims[i]}")
            print(f"Decomposed Claim {i+1}/{total}: {decomposed_claim}")
            results.append(decomposed_claim)
    return results

train_outputs = decompose_claims(
    train_tokenized.input_ids, train_tokenized.attention_mask, claims=train_df['claim']
)
val_outputs = decompose_claims(
    val_tokenized.input_ids, val_tokenized.attention_mask, claims=val_df['claim']
)

# decompose_claims already returns plain text, so no second decode is needed.
# The *_decompositions names are kept in case later cells refer to them.
train_decompositions = train_outputs
val_decompositions = val_outputs

# Save results to CSV
train_df['decomposition'] = train_decompositions
val_df['decomposition'] = val_decompositions
train_df.to_csv("train_decompositions.csv", index=False)
val_df.to_csv("val_decompositions.csv", index=False)

# Print sample output
print(train_df.head())
print(val_df.head())

Generating Decompositions:   0%|          | 1/2495 [00:18<12:52:48, 18.59s/it]

Initial Claim 1/2495: "The non-partisan Congressional Budget Office concluded ObamaCare will cost the U.S. more than 800,000 jobs."
Decomposed Claim 1/2495: <pad> 1) "The non-partisan Congressional Budget Office concluded ObamaCare will cost the U.S. more than 800,000 jobs." 2) "The non-partisan Congressional Budget Office concluded ObamaCare will cost the U.S. more than 800,000 jobs."</s>


Generating Decompositions:   0%|          | 2/2495 [00:32<10:59:48, 15.88s/it]

Initial Claim 2/2495: "More than 50 percent of immigrants from (El Salvador, Guatemala and Honduras) use at least one major welfare program once they get here."
Decomposed Claim 2/2495: <pad> 1) "More than 50 percent of immigrants from (El Salvador, Guatemala and Honduras) use at least one major welfare program once they get here."</s>


Generating Decompositions:   0%|          | 3/2495 [00:45<10:02:25, 14.50s/it]

Initial Claim 3/2495: UK government banned Covid vaccine for children age 5-11
Decomposed Claim 3/2495: <pad> 1) "UK government banned Covid vaccine for children age 5-11." 2) "Covid vaccine is a vaccine that protects against viruses."</s>


Generating Decompositions:   0%|          | 4/2495 [01:03<10:58:29, 15.86s/it]

Initial Claim 4/2495: "[In 2014-2015] coverage for the rotavirus vaccine exceeded the 95% target and the pneumococcal vaccine reached 91.5%."
Decomposed Claim 4/2495: <pad> 1) "[In 2014-2015] coverage for the rotavirus vaccine exceeded the 95% target." 2) "The pneumococcal vaccine reached 91.5%." 3) "The rotavirus vaccine exceeded the 95% target."</s>


Generating Decompositions:   0%|          | 5/2495 [01:26<12:42:36, 18.38s/it]

Initial Claim 5/2495: In September 2021, the U.K. government announced its intention to create a new criminal offense of pet abduction.
Decomposed Claim 5/2495: <pad> 1) "The U.K. government announced its intention to create a new criminal offense of pet abduction in September 2021." 2) "The U.K. government announced its intention to create a new criminal offense of pet abduction in September 2021." 3) "The U.K. government announced its intention to create a new criminal offense of pet abduction in September 2021."</s>


Generating Decompositions:   0%|          | 6/2495 [01:38<11:15:54, 16.29s/it]

Initial Claim 6/2495: Labour took £1.5 million from Just Stop Oil.
Decomposed Claim 6/2495: <pad> 1) "Just Stop Oil" raised £1.5 million for Labour. 2) "Labour took £1.5 million from Just Stop Oil."</s>


Generating Decompositions:   0%|          | 7/2495 [01:58<12:00:31, 17.38s/it]

Initial Claim 7/2495: McDonald's has announced that everyone who shares this link will receive 2 FREE McFamily box
Decomposed Claim 7/2495: <pad> 1) "McDonald's has announced that everyone who shares this link will receive 2 FREE McFamily box." 2) "Everyone who shares this link will receive 2 FREE McFamily box." 3) "Everyone who shares this link will receive 2 FREE McFamily box."</s>


Generating Decompositions:   0%|          | 8/2495 [02:10<10:58:31, 15.89s/it]

Initial Claim 8/2495: "NASA just announced a 100-foot-wide fissure-crack just opened up Yellowstone volcano in 24 hours.”
Decomposed Claim 8/2495: <pad> 1) "NASA just announced a 100-foot-wide fissure-crack just opened up Yellowstone volcano in 24 hours."</s>


Generating Decompositions:   0%|          | 9/2495 [02:21<9:54:35, 14.35s/it] 

Initial Claim 9/2495: Photo shows woman watching climate protest outside Ivanka Trump’s house in 2017
Decomposed Claim 9/2495: <pad> 1) "Photo shows woman watching climate protest outside Ivanka Trump’s house in 2017"</s>


Generating Decompositions:   0%|          | 10/2495 [02:36<10:03:16, 14.57s/it]

Initial Claim 10/2495: The 1974 comedy “Young Frankenstein” directly inspired the title for rock band Aerosmith’s song “Walk This Way.”
Decomposed Claim 10/2495: <pad> 1) "Young Frankenstein" is a 1974 comedy film. 2) "Walk This Way" is a song by Aerosmith. 3) "Young Frankenstein" is a film.</s>


Generating Decompositions:   0%|          | 11/2495 [02:54<10:46:41, 15.62s/it]

Initial Claim 11/2495: A total of 58 peer-reviewed papers published in the first half of 2017 conclude that global warming is a myth.
Decomposed Claim 11/2495: <pad> 1) "A total of 58 peer-reviewed papers published in the first half of 2017 conclude that global warming is a myth." 2) "A total of 58 peer-reviewed papers published in the first half of 2017 conclude that global warming is a myth."</s>


Generating Decompositions:   0%|          | 12/2495 [03:05<9:46:29, 14.17s/it] 

Initial Claim 12/2495: "Senator Clinton tried to spend $1 million on the Woodstock Concert Museum."
Decomposed Claim 12/2495: <pad> 1) "Senator Clinton tried to spend $1 million on the Woodstock Concert Museum."</s>


Generating Decompositions:   1%|          | 13/2495 [03:21<10:04:12, 14.61s/it]

Initial Claim 13/2495: "When you throw 23 million people off of health insurance -- people with cancer, people with heart disease, people with diabetes -- thousands of people will die. … This is study after study making this point."
Decomposed Claim 13/2495: <pad> 1) "When you throw 23 million people off of health insurance -- people with cancer, people with heart disease, people with diabetes -- thousands of people will die." 2) "This study after study making this point."</s>


Generating Decompositions:   1%|          | 14/2495 [03:58<14:50:01, 21.52s/it]

Initial Claim 14/2495: Says Arizona, Missouri and Texas residents have a two-pet limit, so the public must "surrender their third pet to the Humane Society."
Decomposed Claim 14/2495: <pad> 1) "Arizona, Missouri and Texas residents have a two-pet limit, so the public must surrender their third pet to the Humane Society." 2) "Arizona, Missouri and Texas residents have a two-pet limit, so the public must surrender their third pet to the Humane Society." 3) "Arizona, Missouri and Texas residents have a two-pet limit, so the public must surrender their third pet to the Humane Society."</s>


Generating Decompositions:   1%|          | 15/2495 [04:20<14:47:39, 21.48s/it]

Initial Claim 15/2495: "We have an unemployment (rate of) 1.5 in Boone County, and we can’t find enough workers."
Decomposed Claim 15/2495: <pad> 1) "We have an unemployment rate of 1.5 in Boone County." 2) "We can’t find enough workers."</s>


Generating Decompositions:   1%|          | 16/2495 [04:40<14:32:19, 21.11s/it]

Initial Claim 16/2495: "Computer models show Irma destroying New York City on Sept. 10."
Decomposed Claim 16/2495: <pad> 1) "Computer models show Irma destroying New York City on Sept. 10."</s>


Generating Decompositions:   1%|          | 17/2495 [05:01<14:31:49, 21.11s/it]

Initial Claim 17/2495: Antifa activists announced in August 2017 that they would protest the Sturgis Motorcycle Rally in 2018.
Decomposed Claim 17/2495: <pad> 1) "Antifa activists announced in August 2017 that they would protest the Sturgis Motorcycle Rally in 2018."</s>
