In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm
import pandas as pd
import torch


tqdm.pandas()
# Use cuda if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Using a generative model with custom prompts depending on claim type
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model.to(device)

cuda


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

In [2]:
from datasets import Dataset


train_df = pd.read_json("test_claims_quantemp.json")
val_df = pd.read_json("val_claims_quantemp.json")

train_df = pd.DataFrame([{'type': item['taxonomy_label'], 'claim': item['claim']} for item in train_df.to_dict(orient='records')])
val_df = pd.DataFrame([{'type': item['taxonomy_label'], 'claim': item['claim']} for item in val_df.to_dict(orient='records')])

# The following code is inspired by the ProgramFC project. We have implemented a simplified version that generates prompts based on task type.
# sources: https://github.com/mbzuai-nlp/ProgramFC/blob/main/models/prompts.py
# https://aclanthology.org/2023.acl-long.386.pdf

main_prompt = '''The task is to break down a given claim into its constituent statements. Here are some examples to illustrate the logic:

Example 1:
Claim:
"A patent for coronavirus was granted in 2018 to the Pirbright Institute UK, founded by Bill and Melinda Gates."
Results: 
1) "A patent for coronavirus was granted in 2018."
2) "The patent was granted to the Pirbright Institute UK."
3) "The Pirbright Institute UK was founded by Bill and Melinda Gates."

Example 2:
Claim:
"A video that went viral in July 2023 authentically depicted an ad in Japan that read \"Stop Zelenskyy, Stop War.\"" 
Results: 
1) "A video showed an ad in Japan that read \"Stop Zelenskyy, Stop War.\""
2) "The video went viral in July 2023."
3) "The video was authentic."

Example 3:
Claim:
"In the historic 2020 US election, Joe Biden defeated Donald Trump to become the 46th President of the United States."
Results:
1) "The 2020 US election was historic."
2) "Joe Biden defeated Donald Trump in the 2020 US election."
3) "Joe Biden became the 46th President of the United States."

Example 4:
Claim:
"Arnold Schwarzenegger, born in 1947 in Austria, served as Republican Governor of California after a career as a bodybuilder and actor."
Results: 
1) "Arnold Schwarzenegger was born in 1947 in Austria."
2) "Arnold Schwarzenegger served as Republican Governor of California."
3) "Arnold Schwarzenegger had a career as a bodybuilder and actor."
4) "Arnold Schwarzenegger's career as a bodybuilder and actor preceded his tenure as Governor of California."

Example 19:
Claim:
"According to a 2023 report, the number of billionaires in the world has increased by 25%% since 2020."
Results:
1) "A report was published in 2023 on the number of billionaires in the world."
2) "The number of billionaires in the world has increased between 2020 and 2023."
3) "The increase in the number of billionaires is by 25%%."


Example 17:
Claim:
"Japanese citizens boycotting goods made in USA over nuclear bombing in 1945"
Results:
1) "Japanese citizens are boycotting goods made in the USA."
2) "There was a nuclear bombing in 1945."
3) "The boycott is due to the nuclear bombing in 1945."

'''

def add_decomp_prompt(claim, claim_type):
    if claim_type == "comparison":
        claim =  main_prompt + f'''
Example 13:
Claim: "Pepsi is preferred to Coke in blind taste tests, despite Coke being regarded as the more successful brand."
Results: 
1) "Blind tests have been conducted comparing Pepsi and Coke."
2) "Pepsi is preferred to Coke in blind taste tests."
3) "Coke is regarded as the more successful brand."

Example 10:
Claim:
"Studies have shown that the average global temperature has increased by 1.2 degrees Celsius since the pre-industrial era."
Results:
1) "Studies have shown that the average global temperature has increased."
2) "The increase is by 1.2 degrees Celsius."
3) "The increase is since the pre-industrial era."

Example 11:
Claim:
"According to recent polls, more Americans support the legalization of marijuana than oppose it. This is a significant shift from previous years."
Results:
1) "Recent polls show that the majority of Americans support the legalization of marijuana."
2) "More Americans support the legalization of marijuana than oppose it."
3) "There has been a significant shift in public opinion from previous years."

---
Now, break down the following comparison claim into its smallest, factual components. Ensure you list all relevant components and avoid adding unrelated information:

Claim: {claim} 
Results:
        '''
    elif claim_type == "interval":
        claim =  main_prompt + f'''
Example 15:
Claim:
"An image of a red sky in Beijing was taken on March 23, 2023, during a sandstorm."
Results: 
1) "An image of a red sky in Beijing was taken on March 23, 2023."
2) "There was a sandstorm in Beijing on March 23, 2023."
3) "The image was taken during a sandstorm." 

Example 8:
Claim:
"The stock market crashed in 1929, leading to the Great Depression."
Results:
1) "The stock market crashed in 1929."
2) "The stock market crash led to the Great Depression."
3) "The Great Depression followed the stock market crash of 1929."

---
Now, break down the following interval claim into its smallest, factual components. Ensure you list all relevant components and avoid adding unrelated information:

Claim: {claim} 
Results:
'''  
    elif claim_type == "statistical":
        claim =  main_prompt + f'''

Example 6:
Claim:
"Police-recorded crimes against property in the EU increased in 2022: thefts rose by 17.9%%, robberies by 9.7%% and burglaries by 7.4%% compared with the previous year."
Results:
1) "Police-recorded crimes against property in the EU increased in 2022."
2) "Thefts rose by 17.9%% compared with the previous year"
3) "Robberies rose by 9.7%% compared with the previous year"
4) "Burglaries rose by 7.4%% ccompared with the previous year"

Example 12:
Claim:
"President Bolsonaro is facing criticism for deforestation in the Amazon, which has increased by 25%% since he took office."
Results:
1) "President Bolsonaro is facing criticism for deforestation in the Amazon."
2) "Deforestation in the Amazon has increased by 25%%."
3) "The increase in deforestation is since President Bolsonaro took office."

---
Now, break down the following statistical claim into its smallest, factual components. Ensure you list all relevant components and avoid adding unrelated information:

Claim: {claim} 
Results:
'''
    elif claim_type == "temporal":
        claim =  main_prompt + f'''
Example 5:
Claim:
"In 2005, an estimated 1.5 million people from Alabama, Mississippi, and Louisiana fled their homes in the face of Hurricane Katrina."
Results:
1) "In 2005, Hurricane Katrina struck Alabama, Mississippi, and Louisiana."
2) "At least 1.5 million fled their homes in the face of Hurricane Katrina."
3) "An estimated 1.5 million people who fled their homes were from Alabama, Mississippi, and Louisiana."

Example 3:
Claim:
"The 2022 Winter Olympics in Beijing were the first to feature a unified Korean team."
Results: 
1) "The 2022 Winter Olympics were held in Beijing."
2) "The 2022 Winter Olympics featured a unified Korean team."
3) "The 2022 unified Korean team was the first in Olympic history."

---
Now, break down the following temporal claim into its smallest, factual components. Ensure you list all relevant components and avoid adding unrelated information:

Claim: {claim} 
Results:
'''
    else:
        claim = f"Decompose the following claim: {claim} "
    return claim  

def planner(df):
    # Apply the prompt to each claim and save in separate column
    df['prompt'] = df.progress_apply(lambda x: add_decomp_prompt(x['claim'], x['type']), axis=1)
    return df

train_df = planner(train_df)
val_df = planner(val_df)

print(train_df['prompt'].head()[0])

100%|██████████| 2495/2495 [00:00<00:00, 71287.48it/s]
100%|██████████| 3084/3084 [00:00<00:00, 68504.91it/s]

The task is to break down a given claim into its constituent statements. Here are some examples to illustrate the logic:

Example 1:
Claim:
"A patent for coronavirus was granted in 2018 to the Pirbright Institute UK, founded by Bill and Melinda Gates."
Results: 
1) "A patent for coronavirus was granted in 2018."
2) "The patent was granted to the Pirbright Institute UK."
3) "The Pirbright Institute UK was founded by Bill and Melinda Gates."

Example 2:
Claim:
"A video that went viral in July 2023 authentically depicted an ad in Japan that read "Stop Zelenskyy, Stop War."" 
Results: 
1) "A video showed an ad in Japan that read "Stop Zelenskyy, Stop War.""
2) "The video went viral in July 2023."
3) "The video was authentic."

Example 3:
Claim:
"In the historic 2020 US election, Joe Biden defeated Donald Trump to become the 46th President of the United States."
Results:
1) "The 2020 US election was historic."
2) "Joe Biden defeated Donald Trump in the 2020 US election."
3) "Joe Biden becam




In [3]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from torch.cuda.amp import autocast

train_tokenized = tokenizer(train_df['prompt'].tolist(), padding=True, max_length=1024, truncation=False, return_tensors="pt")
val_tokenized = tokenizer(val_df['prompt'].tolist(), padding=True, max_length=1024, truncation=False, return_tensors="pt")

# Decompose claims after tokenization
def decompose_claims(input_ids, attention_mask, batch_size = 8):
    results = []
    with torch.no_grad():
        for i in tqdm(range(0, input_ids.size(0), batch_size), desc="Generating Decompositions"):
            batch_input_ids = input_ids[i:i+batch_size].pin_memory().to(device, non_blocking=True)
            batch_attention_mask = attention_mask[i:i+batch_size].pin_memory().to(device, non_blocking=True)
            outputs = model.generate(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask,
                max_length=512,
                no_repeat_ngram_size=10,
                repetition_penalty=2.5,
                early_stopping=True
            )
            for output in outputs:
                decomposed_claim = tokenizer.decode(output, skip_special_tokens=True)
                results.append(decomposed_claim)
    return results

train_outputs = decompose_claims(train_tokenized.input_ids, train_tokenized.attention_mask)
val_outputs = decompose_claims(val_tokenized.input_ids, val_tokenized.attention_mask)



Generating Decompositions: 100%|██████████| 312/312 [31:40<00:00,  6.09s/it]
Generating Decompositions: 100%|██████████| 386/386 [38:00<00:00,  5.91s/it]


In [4]:

# Process the data to remove claim numbers


# Save results to CSV
train_outputs = pd.DataFrame(train_outputs, columns=["decomposed_claim"])
val_outputs = pd.DataFrame(val_outputs, columns=["decomposed_claim"])
train_outputs.to_csv("train_decompositions.csv", index=False)
val_outputs.to_csv("val_decompositions.csv", index=False)

# Print sample output
print(train_df.head())
print(val_df.head())

          type                                              claim  \
0  statistical  "The non-partisan Congressional Budget Office ...   
1  statistical  "More than 50 percent of immigrants from (El S...   
2     temporal  UK government banned Covid vaccine for childre...   
3  statistical  "[In 2014-2015] coverage for the rotavirus vac...   
4     temporal  In September 2021, the U.K. government announc...   

                                              prompt  
0  The task is to break down a given claim into i...  
1  The task is to break down a given claim into i...  
2  The task is to break down a given claim into i...  
3  The task is to break down a given claim into i...  
4  The task is to break down a given claim into i...  
          type                                              claim  \
0     interval  Amit Shah said Narendra Modi sleeps for 24 hou...   
1     temporal  Video of show Pakistani players celebrating th...   
2  statistical  Says Dino Rossi "stripped" healt