In [43]:
from ollama import chat
import pandas as pd
import numpy as np
import ast
import json
from ollama import ChatResponse
from pydantic import BaseModel


In [13]:
sem_eval_root = 'SemEval_Task7'
sem_eval_test_root = 'SemEval_Task7_Test_Phase'

In [14]:
pairs_df = pd.read_csv(f'{sem_eval_root}/pairs.csv')
fact_checks_df = pd.read_csv(f'{sem_eval_root}/fact_checks.csv')
posts_df = pd.read_csv(f'{sem_eval_root}/posts.csv')

In [16]:
test_fact_checks_df = pd.read_csv(f'{sem_eval_test_root}/fact_checks.csv')
test_posts_df = pd.read_csv(f'{sem_eval_test_root}/posts.csv')


with open(f'{sem_eval_test_root}/crosslingual_predictions.json', 'r') as crosslingual_file:
  crosslingual_data = json.load(crosslingual_file)

crosslingual_prediction_post_id = list(map(lambda x: int(x), crosslingual_data))

In [17]:
def get_translated_claim(tuple_string):
  return ast.literal_eval(tuple_string)[1]

def get_translated_claim_lang(tuple_string):
  if isinstance(ast.literal_eval(tuple_string)[-1][0], str):
    return ast.literal_eval(tuple_string)[-1][0]
  else:
    return ast.literal_eval(tuple_string)[-1][0][0]

def get_translated_title(tuple_string):
  if pd.isnull(tuple_string):
    return ""
  return ast.literal_eval(tuple_string)[1]


def get_translated_title_lang(tuple_string):
  if pd.isnull(tuple_string):
    return ""
  if isinstance(ast.literal_eval(tuple_string)[-1][0], str):
    return ast.literal_eval(tuple_string)[-1][0]
  else:
    return ast.literal_eval(tuple_string)[-1][0][0]

def get_translated_ocr(tuple_string):
  if pd.isnull(tuple_string):
    return ""
  eval = ast.literal_eval(tuple_string)
  if not eval:
    return ""
  return eval[0][1]

def get_translated_ocr_lang(tuple_string):
  if pd.isnull(tuple_string):
    return ""
  eval = ast.literal_eval(tuple_string)
  if not eval:
    return ""
  return eval[0][-1][0][0]

def get_translated_text(tuple_string):
  if pd.isnull(tuple_string):
    return ""
  eval = ast.literal_eval(tuple_string)
  if not eval:
    return ""
  return eval[1]

def get_translated_text_lang(tuple_string):
  if pd.isnull(tuple_string):
    return ""
  eval = ast.literal_eval(tuple_string)
  if not eval:
    return ""
  return eval[-1][0][0]

In [18]:
claim_translated = fact_checks_df['claim'].apply(get_translated_claim)
claim_lang = fact_checks_df['claim'].apply(get_translated_claim_lang)


fact_checks_df['claim_translated'] = claim_translated
fact_checks_df['claim_lang'] = claim_lang

In [19]:
title_translated = fact_checks_df['title'].apply(get_translated_title)
title_lang = fact_checks_df['title'].apply(get_translated_title_lang)

fact_checks_df['title_translated'] = title_translated
fact_checks_df['title_lang'] = title_lang

final_fact_checks_df = fact_checks_df.drop(columns=['claim', 'title', 'instances'])
final_fact_checks_df.tail()

Unnamed: 0,fact_check_id,claim_translated,claim_lang,title_translated,title_lang
153738,205744,"🇫🇷 In France, the military and civilian police...",ara,This video is not for the French police's acti...,ara
153739,205745,👆This little beautiful girl was seen in Mangal...,eng,A child rescued from a group of beggars in Man...,tam
153740,205747,"📌 Italians and foreigners, men and women, chil...",ara,These photos of a queue in front of a food aid...,ara
153741,205749,🔵Confirmed... Tomorrow the free messages will ...,por,WhatsApp will charge 0.37 cents per message st...,por
153742,205750,🕋🗃 WE OPEN THE BLACK BOX OF BNDES.......\n✅ DU...,por,Post mixes true data with incorrect numbers ab...,por


In [20]:
claim_translated = test_fact_checks_df['claim'].apply(get_translated_claim)
claim_lang = test_fact_checks_df['claim'].apply(get_translated_claim_lang)


test_fact_checks_df['claim_translated'] = claim_translated
test_fact_checks_df['claim_lang'] = claim_lang


title_translated = test_fact_checks_df['title'].apply(get_translated_title)
title_lang = test_fact_checks_df['title'].apply(get_translated_title_lang)

test_fact_checks_df['title_translated'] = title_translated
test_fact_checks_df['title_lang'] = title_lang

final_test_fact_checks_df = test_fact_checks_df.drop(columns=['claim', 'title', 'instances'])
final_test_fact_checks_df.tail()

Unnamed: 0,fact_check_id,claim_translated,claim_lang,title_translated,title_lang
272442,372883,Claudia Sheinbaum wants to change the Constitu...,spa,France Media Agency ...,fra
272443,372884,Israeli Ambassador raises poster commemorating...,por,France Media Agency ...,fra
272444,372891,The viral video shows African protesting at Qu...,eng,Fact Check: Viral Video Does NOT Show African ...,eng
272445,372893,Do you get notifications when you take a scree...,tur,Do you get notifications when you take a scree...,tur
272446,372897,Electric cars pollute more and catch fire more...,spa,They neither pollute more nor catch fire more ...,spa


In [21]:
ocr_translated = posts_df['ocr'].apply(get_translated_ocr)
ocr_lang = posts_df['ocr'].apply(get_translated_ocr_lang)
posts_df["ocr_translated"]  = ocr_translated
posts_df["ocr_lang"]  = ocr_lang

text_translated = posts_df['text'].apply(get_translated_text)
text_lang = posts_df['text'].apply(get_translated_text_lang)
posts_df["text_translated"]  = text_translated
posts_df["text_lang"]  = text_lang

final_posts_df = posts_df.drop(columns=["instances", "ocr", "verdicts", "text"])
final_posts_df.head()

Unnamed: 0,post_id,ocr_translated,ocr_lang,text_translated,text_lang
0,0,! Brazen vaccination fake by Markus Söder! It'...,deu,,
1,1,!! WARNING !! A new thing circulating now. Peo...,eng,,
2,2,"""Actually, he's a damn sight better than any o...",eng,,
3,3,"""Australia 50 MILLION doses of ""vacuna"" retira...",fra,,
4,4,"""Blessed are those persecuted by me cause ""The...",spa,,


In [22]:
ocr_translated = test_posts_df['ocr'].apply(get_translated_ocr)
ocr_lang = test_posts_df['ocr'].apply(get_translated_ocr_lang)
test_posts_df["ocr_translated"]  = ocr_translated
test_posts_df["ocr_lang"]  = ocr_lang

text_translated = test_posts_df['text'].apply(get_translated_text)
text_lang = test_posts_df['text'].apply(get_translated_text_lang)
test_posts_df["text_translated"]  = text_translated
test_posts_df["text_lang"]  = text_lang

final_test_posts_df = posts_df.drop(columns=["instances", "ocr", "verdicts", "text"])
final_test_posts_df.tail()

Unnamed: 0,post_id,ocr_translated,ocr_lang,text_translated,text_lang
24426,28085,,,🧐 The president separated... the constitution ...,spa
24427,28087,bruising runny ed 1 e Contents of the pack and...,eng,🧐🧐🧐,eng
24428,28089,,,"🧬Robert Malone, inventor of mRNA technology, s...",fra
24429,28090,Number of Covid-19 Deaths 4500 4000 3500 3000 ...,eng,🧵While you are distracted by the invasion of R...,por
24430,28091,Popularity and foot If you respond in a profes...,kor,"🧿 The president is an avatar... ""President, if...",kor


In [23]:
joined_with_post = pd.merge(pairs_df, final_posts_df, on='post_id', how='inner')
joined_with_post.head()

Unnamed: 0,post_id,fact_check_id,ocr_translated,ocr_lang,text_translated,text_lang
0,2228,33,"WHY DO WE NEED A $4 TRILLION JOBS PLAN, WHEN A...",eng,,
1,2228,23568,"WHY DO WE NEED A $4 TRILLION JOBS PLAN, WHEN A...",eng,,
2,2228,194577,"WHY DO WE NEED A $4 TRILLION JOBS PLAN, WHEN A...",eng,,
3,2229,33,"WHY DO WE NEED A $4 TRILLION JOBS PLAN, WHEN A...",eng,,
4,2229,23568,"WHY DO WE NEED A $4 TRILLION JOBS PLAN, WHEN A...",eng,,


In [26]:
final_df = pd.merge(joined_with_post, final_fact_checks_df, on='fact_check_id', how='inner')
final_df.head()

Unnamed: 0,post_id,fact_check_id,ocr_translated,ocr_lang,text_translated,text_lang,claim_translated,claim_lang,title_translated,title_lang
0,2228,33,"WHY DO WE NEED A $4 TRILLION JOBS PLAN, WHEN A...",eng,,,"""$4 trillion jobs plan"" unnecessary because 20...",eng,Posts on Biden jobs plan falsely claim 2020 un...,eng
1,2228,23568,"WHY DO WE NEED A $4 TRILLION JOBS PLAN, WHEN A...",eng,,,America had the lowest unemployment rate in hi...,eng,Fact check: Unemployment rate hit historic hig...,eng
2,2228,194577,"WHY DO WE NEED A $4 TRILLION JOBS PLAN, WHEN A...",eng,,,“A year ago we had the lowest unemployment in ...,eng,Unemployment was the lowest ever a year ago? N...,eng
3,2229,33,"WHY DO WE NEED A $4 TRILLION JOBS PLAN, WHEN A...",eng,,,"""$4 trillion jobs plan"" unnecessary because 20...",eng,Posts on Biden jobs plan falsely claim 2020 un...,eng
4,2229,23568,"WHY DO WE NEED A $4 TRILLION JOBS PLAN, WHEN A...",eng,,,America had the lowest unemployment rate in hi...,eng,Fact check: Unemployment rate hit historic hig...,eng


In [27]:
ocr_text = final_df['ocr_translated'].apply(lambda x: f"OCR Text: '{x}'")
text = final_df['text_translated'].apply(lambda x: f"Social Media Caption: '{x}'")
claim = final_df['claim_translated'].apply(lambda x: f"Output: '{x}'")

In [29]:
concat_col = ocr_text.str.cat(text, sep=', ').str.cat(claim, sep='\n ')
examples = concat_col[:20].str.cat(sep="\n\n\n")

In [44]:
class ClaimGeneratorResponse(BaseModel):
    claim: str

In [None]:
post_id_generated_claim = {}

i = 0
for post_id in crosslingual_prediction_post_id:
  ocr = test_posts_df[test_posts_df['post_id'] == post_id]["ocr_translated"].iloc[0]
  if pd.isna(ocr):
    ocr = ""

  text = test_posts_df[test_posts_df['post_id'] == post_id]["text_translated"].iloc[0]
  if pd.isna(text):
    text = ""

    

  cur_claim_prompt  = f"""
Task: Generate a concise and accurate claim made by a social media post.

Input:

    OCR Text: {ocr}
    Social Media Caption: {text}

Output:
JSON

{{
  "claim": [The claim made in the social media post, based on both the image text and the caption]
}}

Guidelines:

    The claim should be stated objectively and avoid any subjective language or interpretation.
    If the post makes multiple claims, focus on the most prominent or overarching one but also make sure other claims are also mentioned.
    If the post does not make a clear claim, but rather expresses an opinion or sentiment, rephrase it as a statement about the author's perspective.
    Ensure the claim is grammatically correct and free of spelling errors.

Example:

    OCR Text: "Study shows 97% of scientists agree climate change is real."
    Social Media Caption: "The science is clear! We need to act on climate change now."
    Output:

JSON

{{
  "claim": "97% of scientists agree climate change is real." 
}}
"""
  response: ChatResponse = chat(model='llama3.2:3b', messages=[
  {
    'role': 'user',
    'content': cur_claim_prompt,
  },
], format=ClaimGeneratorResponse.model_json_schema(),
options={"temperature": 0.4})
  claim = ClaimGeneratorResponse.model_validate_json(response.message.content)
  post_id_generated_claim[post_id] = claim.claim 
  print(f"{i+1} out of {len(crosslingual_prediction_post_id)}")
  i+= 1
 

# Write the dictionary to a JSON file
with open("claims.json", "w") as f:
    json.dump(post_id_generated_claim, f)

1 out of 4000
2 out of 4000
3 out of 4000
4 out of 4000
