In [1]:
# %pip install --upgrade numpy pandas matplotlib sentence-transformers torch tqdm

import pandas as pd
import matplotlib.pyplot as plt
from typing import List, Tuple, Iterator
import ast
import numpy as np
# from sentence_transformers import SentenceTransformer
import torch
import gc
from tqdm import tqdm



In [2]:
import ast
import os

import pandas as pd

our_dataset_path = '/home/csgrads/syed0093/SemEval_Task7/Task_Data/'

posts_path = os.path.join(our_dataset_path, 'posts.csv')
fact_checks_path = os.path.join(our_dataset_path, 'fact_checks.csv')
fact_check_post_mapping_path = os.path.join(our_dataset_path, 'pairs.csv')

for path in [posts_path, fact_checks_path, fact_check_post_mapping_path]:
    if not os.path.isfile(path):
        raise FileNotFoundError(f"File not found: {path}")
    
parse_col = lambda s: ast.literal_eval(s.replace('\n', '\\n')) if s else s

df_fact_checks = pd.read_csv(fact_checks_path).fillna('').set_index('fact_check_id')
for col in ['claim', 'instances', 'title']:
    df_fact_checks[col] = df_fact_checks[col].apply(parse_col)


df_posts = pd.read_csv(posts_path).fillna('').set_index('post_id')
for col in ['instances', 'ocr', 'verdicts', 'text']:
    df_posts[col] = df_posts[col].apply(parse_col)


df_fact_check_post_mapping = pd.read_csv(fact_check_post_mapping_path) 

In [3]:
# # Save the filtered DataFrame to a new CSV file
# df_fact_checks.to_csv('processed_fact_checks.csv', index=False)
# df_fact_checks.head()


In [14]:
# df_posts.to_csv('processed_posts.csv', index=False)
# df_posts.head()


Unnamed: 0_level_0,instances,ocr,verdicts,text
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"[(1608571882.0, fb)]",[(! Dreister Impf-Fake von Markus Söder! Es is...,[False information],
1,"[(1586139153.0, fb)]",[(!! WARNING !! A new thing circulating now. P...,[False information],
2,"[(1610052141.0, fb), (1610072448.0, fb)]","[(""Actually, he's a damn sight better than any...",[Missing context],
3,"[(1645187790.0, ig)]","[(""Australia 50 MILLONES de dosis de ""vacuna"" ...",[False],
4,"[(1581697500.0, fb)]","[(""Bienaventurados los perseguidos por mi caus...",[],


In [15]:
# df_fact_check_post_mapping.to_csv('processed_pairs.csv', index=False)
# df_fact_check_post_mapping.head()

Unnamed: 0,post_id,fact_check_id
0,2228,33
1,2228,23568
2,2228,194577
3,2229,33
4,2229,23568


### TFIDF

In [3]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ast

def parse_text_tuple(text_str):
    """Parse the tuple string format and extract text content."""
    try:
        # Convert string representation of tuple to actual tuple
        data = ast.literal_eval(text_str)
        if isinstance(data, tuple) and len(data) >= 2:
            # Return both original and translated text if available
            return ' '.join([str(data[0]), str(data[1])])
        return str(data[0])
    except:
        return text_str

In [4]:
def parse_instances(instances_str):
    """Parse the instances string to extract URLs and timestamps."""
    try:
        data = ast.literal_eval(instances_str)
        return [item[1] if isinstance(item, tuple) and len(item) > 1 else str(item) 
                for item in data]
    except:
        return []

In [5]:
def preprocess_post(row):
    """Combine relevant text fields from a post."""
    texts = []
    
    # Process text field
    if pd.notna(row.get('text')):
        try:
            text_data = ast.literal_eval(row['text'])
            if isinstance(text_data, list):
                for item in text_data:
                    if isinstance(item, tuple) and len(item) > 0:
                        texts.append(str(item[0]))  # Original text
            else:
                texts.append(str(text_data))
        except:
            texts.append(str(row['text']))
    
    return ' '.join(texts)

In [6]:
def create_retrieval_system(fact_checks_df, posts_df, task_config, language):
    """Create and train the retrieval system for a specific language."""
    # Filter fact checks for the specified language
    valid_fact_check_ids = task_config['monolingual'][language]['fact_checks']
    fact_checks = fact_checks_df[fact_checks_df['fact_check_id'].isin(valid_fact_check_ids)]
    
    # Prepare fact check texts
    fact_check_texts = []
    for _, row in fact_checks.iterrows():
        texts = []
        if pd.notna(row['claim']):
            texts.append(parse_text_tuple(row['claim']))
        if pd.notna(row['title']):
            texts.append(parse_text_tuple(row['title']))
        fact_check_texts.append(' '.join(texts))
    
    # Create TF-IDF vectors for fact checks
    vectorizer = TfidfVectorizer(max_features=5000)
    fact_check_vectors = vectorizer.fit_transform(fact_check_texts)
    
    return vectorizer, fact_check_vectors, fact_checks['fact_check_id'].tolist()

In [7]:
def retrieve_fact_checks(post_text, vectorizer, fact_check_vectors, fact_check_ids, top_k=10):
    """Retrieve the most relevant fact checks for a given post."""
    post_vector = vectorizer.transform([post_text])
    similarities = cosine_similarity(post_vector, fact_check_vectors).flatten()
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    return [fact_check_ids[i] for i in top_indices]

In [8]:
def generate_predictions(posts_df, task_config, vectorizer, fact_check_vectors, 
                        fact_check_ids, language, split='posts_dev'):
    """Generate predictions for the development set."""
    predictions = {}
    valid_post_ids = task_config['monolingual'][language][split]
    
    for post_id in valid_post_ids:
        post = posts_df[posts_df['post_id'] == post_id].iloc[0]
        post_text = preprocess_post(post)
        retrieved_fact_checks = retrieve_fact_checks(
            post_text, vectorizer, fact_check_vectors, fact_check_ids
        )
        predictions[str(post_id)] = retrieved_fact_checks
    
    return predictions

In [None]:
def main():
    # Load data
    fact_checks = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/fact_checks.csv')
    posts = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/posts.csv')
    with open('tasks.json') as f:
        tasks = json.load(f)
    
    # Process for each language
    all_predictions = {}
    for language in tasks['monolingual'].keys():
        vectorizer, fact_check_vectors, fact_check_ids = create_retrieval_system(
            fact_checks, posts, tasks, language
        )
        
        predictions = generate_predictions(
            posts, tasks, vectorizer, fact_check_vectors, fact_check_ids, language
        )
        all_predictions.update(predictions)
    
    # Save predictions
    with open('monolingual_predictions_tfidf.json', 'w') as f:
        json.dump(all_predictions, f)

if __name__ == "__main__":
    main()

## 1. BERT

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import json
import ast
from tqdm import tqdm

class BERTRetriever:
    def __init__(self, model_name='xlm-roberta-base', max_length=512):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.max_length = max_length

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def get_embeddings(self, texts):
        embeddings = []
        
        for text in tqdm(texts, desc="Generating embeddings"):
            encoded = self.tokenizer(text, 
                                   padding=True, 
                                   truncation=True, 
                                   max_length=self.max_length, 
                                   return_tensors='pt')
            
            encoded = {k: v.to(self.device) for k, v in encoded.items()}
            
            with torch.no_grad():
                model_output = self.model(**encoded)
            
            sentence_embedding = self.mean_pooling(model_output, encoded['attention_mask'])
            embeddings.append(sentence_embedding.cpu().numpy()[0])
        
        return np.array(embeddings)

def parse_text_tuple(text_str):
    try:
        data = ast.literal_eval(text_str)
        if isinstance(data, tuple) and len(data) >= 2:
            return ' '.join([str(data[0]), str(data[1])])
        return str(data[0])
    except:
        return text_str

def create_retrieval_system(fact_checks_df, posts_df, task_config, language):
    retriever = BERTRetriever()
    
    # Filter fact checks
    valid_fact_check_ids = task_config['monolingual'][language]['fact_checks']
    fact_checks = fact_checks_df[fact_checks_df['fact_check_id'].isin(valid_fact_check_ids)]
    
    # Prepare fact check texts
    fact_check_texts = []
    for _, row in fact_checks.iterrows():
        texts = []
        if pd.notna(row['claim']):
            texts.append(parse_text_tuple(row['claim']))
        if pd.notna(row['title']):
            texts.append(parse_text_tuple(row['title']))
        fact_check_texts.append(' '.join(texts))
    
    # Generate embeddings
    fact_check_vectors = retriever.get_embeddings(fact_check_texts)
    
    return retriever, fact_check_vectors, fact_checks['fact_check_id'].tolist()

def retrieve_fact_checks(post_text, retriever, fact_check_vectors, fact_check_ids, top_k=10):
    post_vector = retriever.get_embeddings([post_text])
    similarities = cosine_similarity(post_vector, fact_check_vectors).flatten()
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    return [fact_check_ids[i] for i in top_indices]

def preprocess_post(row):
    texts = []
    if pd.notna(row.get('text')):
        try:
            text_data = ast.literal_eval(row['text'])
            if isinstance(text_data, list):
                for item in text_data:
                    if isinstance(item, tuple) and len(item) > 0:
                        texts.append(str(item[0]))
            else:
                texts.append(str(text_data))
        except:
            texts.append(str(row['text']))
    return ' '.join(texts)

def main():
    # Load data
    fact_checks = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/fact_checks.csv')
    posts = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/posts.csv')
    with open('tasks.json') as f:
        tasks = json.load(f)
    
    all_predictions = {}
    for language in tasks['monolingual'].keys():
        print(f"\nProcessing language: {language}")
        
        retriever, fact_check_vectors, fact_check_ids = create_retrieval_system(
            fact_checks, posts, tasks, language
        )
        
        valid_post_ids = tasks['monolingual'][language]['posts_dev']
        for post_id in tqdm(valid_post_ids, desc="Generating predictions"):
            post = posts[posts['post_id'] == post_id].iloc[0]
            post_text = preprocess_post(post)
            retrieved_fact_checks = retrieve_fact_checks(
                post_text, retriever, fact_check_vectors, fact_check_ids
            )
            all_predictions[str(post_id)] = retrieved_fact_checks
    
    # Save predictions
    with open('monolingual_predictions_bert_1.json', 'w') as f:
        json.dump(all_predictions, f)

if __name__ == "__main__":
    main()
    
# Runtime: 15 minutes

## BERT 2

In [None]:
class BERTRetriever:
    def __init__(self, model_name='xlm-roberta-base'):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)

    def get_embeddings(self, texts):
        embeddings = []
        batch_size = 8
        
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            encoded = self.tokenizer(batch, 
                                   padding=True, 
                                   truncation=True, 
                                   max_length=512, 
                                   return_tensors='pt')
            
            encoded = {k: v.to(self.device) for k, v in encoded.items()}
            
            with torch.no_grad():
                outputs = self.model(**encoded)
                # Use CLS token embedding
                batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                embeddings.extend(batch_embeddings)
        
        return np.array(embeddings)

def process_text(text_str):
    if pd.isna(text_str):
        return ""
    try:
        data = ast.literal_eval(text_str)
        if isinstance(data, tuple):
            return str(data[0])  # Use original text
        elif isinstance(data, list):
            return ' '.join(str(item[0]) if isinstance(item, tuple) else str(item) for item in data)
        return str(data)
    except:
        return str(text_str)

def main():
    print("Loading data...")
    fact_checks = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/fact_checks.csv')
    posts = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/posts.csv')
    pairs = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/pairs.csv')  # Load gold pairs
    with open('tasks.json') as f:
        tasks = json.load(f)
    
    retriever = BERTRetriever()
    all_predictions = {}
    
    for language in tasks['monolingual'].keys():
        print(f"\nProcessing {language}")
        
        # Filter fact checks for language
        valid_fact_checks = tasks['monolingual'][language]['fact_checks']
        language_fact_checks = fact_checks[fact_checks['fact_check_id'].isin(valid_fact_checks)]
        
        # Prepare fact check texts
        fact_check_texts = []
        for _, row in language_fact_checks.iterrows():
            text = process_text(row['claim']) + " " + process_text(row['title'])
            fact_check_texts.append(text)
        
        print("Generating fact check embeddings...")
        fact_check_vectors = retriever.get_embeddings(fact_check_texts)
        fact_check_ids = language_fact_checks['fact_check_id'].tolist()
        
        # Process dev posts
        dev_post_ids = tasks['monolingual'][language]['posts_dev']
        dev_posts = posts[posts['post_id'].isin(dev_post_ids)]
        
        print("Processing posts...")
        for _, post in tqdm(dev_posts.iterrows()):
            post_text = process_text(post['text'])
            post_vector = retriever.get_embeddings([post_text])
            
            # Calculate similarities and get top matches
            similarities = cosine_similarity(post_vector, fact_check_vectors).flatten()
            top_indices = np.argsort(similarities)[-10:][::-1]
            
            # Store predictions with correct ID type
            all_predictions[str(int(post['post_id']))] = [str(fact_check_ids[i]) for i in top_indices]
    
    print("\nSaving predictions...")
    with open('monolingual_predictions_bert_2.json', 'w') as f:
        json.dump(all_predictions, f)

if __name__ == "__main__":
    main()
    
# Runtime: 5 minutes

Loading data...

Processing fra
Generating fact check embeddings...
Processing posts...


188it [00:03, 52.05it/s]



Processing spa
Generating fact check embeddings...
Processing posts...


615it [00:21, 27.96it/s]



Processing eng
Generating fact check embeddings...
Processing posts...


478it [01:10,  6.83it/s]



Processing por
Generating fact check embeddings...
Processing posts...


302it [00:14, 20.67it/s]



Processing tha
Generating fact check embeddings...
Processing posts...


42it [00:00, 109.74it/s]



Processing deu
Generating fact check embeddings...
Processing posts...


83it [00:01, 55.61it/s]



Processing msa
Generating fact check embeddings...
Processing posts...


105it [00:02, 46.53it/s]



Processing ara
Generating fact check embeddings...
Processing posts...


78it [00:02, 28.79it/s]



Saving predictions...


### Infloat E5 multilingual large

In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import json
from tqdm import tqdm
import numpy as np

class E5Retriever:
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
        self.model = AutoModel.from_pretrained('intfloat/multilingual-e5-large').to(self.device)
    
    def average_pool(self, last_hidden_states, attention_mask):
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    
    def get_embeddings(self, texts, batch_size=8):
        embeddings = []
        
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            batch_dict = self.tokenizer(batch, max_length=512, padding=True, 
                                      truncation=True, return_tensors='pt')
            batch_dict = {k: v.to(self.device) for k, v in batch_dict.items()}
            
            with torch.no_grad():
                outputs = self.model(**batch_dict)
            emb = self.average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
            embeddings.extend(F.normalize(emb, p=2, dim=1).cpu().numpy())
            
        return np.array(embeddings)

def format_text(row, type='fact_check'):
    if type == 'fact_check':
        claim = row.get('claim', '')
        title = row.get('title', '')
        return f"passage: {claim} {title}"
    else:
        return f"query: {row.get('text', '')}"

def main():
    retriever = E5Retriever()
    fact_checks = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/fact_checks.csv')
    posts = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/posts.csv')
    
    with open('tasks.json') as f:
        tasks = json.load(f)
    
    predictions = {}
    
    for language in tasks['monolingual'].keys():
        valid_fact_checks = tasks['monolingual'][language]['fact_checks']
        language_fact_checks = fact_checks[fact_checks['fact_check_id'].isin(valid_fact_checks)]
        
        fact_check_texts = [format_text(row) for _, row in language_fact_checks.iterrows()]
        fact_check_vectors = retriever.get_embeddings(fact_check_texts)
        fact_check_ids = language_fact_checks['fact_check_id'].tolist()
        
        dev_post_ids = tasks['monolingual'][language]['posts_dev']
        dev_posts = posts[posts['post_id'].isin(dev_post_ids)]
        
        for _, post in tqdm(dev_posts.iterrows(), desc=f"Processing {language}"):
            post_text = format_text(post, type='post')
            post_vector = retriever.get_embeddings([post_text])
            
            scores = (post_vector @ fact_check_vectors.T)[0]
            top_indices = np.argsort(scores)[-10:][::-1]
            
            predictions[str(int(post['post_id']))] = [str(fact_check_ids[i]) for i in top_indices]
    
    with open('monolingual_predictions_e5_large.json', 'w') as f:
        json.dump(predictions, f)

if __name__ == "__main__":
    main()
    
# Runtime: 22 minutes

Processing fra: 188it [00:03, 60.03it/s]
Processing spa: 615it [00:09, 64.46it/s]
Processing eng: 478it [00:10, 46.29it/s]
Processing por: 302it [00:04, 63.50it/s]
Processing tha: 42it [00:00, 46.30it/s]
Processing deu: 83it [00:01, 56.72it/s]
Processing msa: 105it [00:01, 65.34it/s]
Processing ara: 78it [00:01, 62.10it/s]


### FASTTEXT

In [10]:
! pip install fasttext

Defaulting to user installation because normal site-packages is not writeable
Collecting fasttext
  Using cached fasttext-0.9.3-cp310-cp310-linux_x86_64.whl
Collecting pybind11>=2.2
  Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.3 pybind11-2.13.6


In [None]:
import fasttext
import fasttext.util
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
from sklearn.metrics.pairwise import cosine_similarity
import re
import os

class FastTextRetriever:
    def __init__(self, model_name='cc.en.300.bin'):
        """Initialize FastText retriever with specified model."""
        # Download model if not exists
        if not os.path.exists(model_name):
            fasttext.util.download_model('en', if_exists='ignore')
        self.model = fasttext.load_model(model_name)
    
    def preprocess_text(self, text):
        """Clean and preprocess text."""
        if pd.isna(text):
            return ""
        # Convert to string
        text = str(text)
        # Remove special characters
        text = re.sub(r'[^\w\s]', ' ', text)
        # Remove extra whitespace
        text = ' '.join(text.split())
        return text.lower()
    
    def get_embedding(self, text):
        """Get FastText embedding for a single text."""
        text = self.preprocess_text(text)
        if not text:
            return np.zeros(self.model.get_dimension())
        return self.model.get_sentence_vector(text)
    
    def get_embeddings(self, texts, batch_size=32):
        """Get FastText embeddings for a batch of texts."""
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            batch_embeddings = [self.get_embedding(text) for text in batch]
            embeddings.extend(batch_embeddings)
        return np.array(embeddings)

def process_fact_check(row):
    """Process fact check text from row."""
    try:
        claim = str(row.get('claim', ''))
        title = str(row.get('title', ''))
        return f"{claim} {title}"
    except:
        return ""

def process_post(row):
    """Process post text from row."""
    try:
        return str(row.get('text', ''))
    except:
        return ""

def main():
    print("Loading FastText retriever...")
    retriever = FastTextRetriever()
    
    print("Loading datasets...")
    fact_checks = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/fact_checks.csv')
    posts = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/posts.csv')
    
    with open('tasks.json') as f:
        tasks = json.load(f)
    
    all_predictions = {}
    
    for language in tasks['monolingual'].keys():
        print(f"\nProcessing {language}")
        
        # Filter fact checks for language
        valid_fact_checks = tasks['monolingual'][language]['fact_checks']
        language_fact_checks = fact_checks[fact_checks['fact_check_id'].isin(valid_fact_checks)]
        
        # Get fact check embeddings
        print("Generating fact check embeddings...")
        fact_check_texts = [process_fact_check(row) for _, row in language_fact_checks.iterrows()]
        fact_check_vectors = retriever.get_embeddings(fact_check_texts)
        fact_check_ids = language_fact_checks['fact_check_id'].tolist()
        
        # Process dev posts
        dev_post_ids = tasks['monolingual'][language]['posts_dev']
        dev_posts = posts[posts['post_id'].isin(dev_post_ids)]
        
        print("Processing posts...")
        for _, post in tqdm(dev_posts.iterrows()):
            post_text = process_post(post)
            post_vector = retriever.get_embeddings([post_text])
            
            # Calculate similarities and get top matches
            similarities = cosine_similarity(post_vector, fact_check_vectors).flatten()
            top_indices = np.argsort(similarities)[-10:][::-1]
            
            # Store predictions
            post_id = str(int(post['post_id']))
            predictions = [str(fact_check_ids[i]) for i in top_indices]
            all_predictions[post_id] = predictions
    
    print("\nSaving predictions...")
    os.makedirs('predictions', exist_ok=True)
    with open('monolingual_predictions_fasttext.json', 'w') as f:
        json.dump(all_predictions, f)

if __name__ == "__main__":
    main()
    
# Runtime: 2 minutes

Loading FastText retriever...
Loading datasets...

Processing fra
Generating fact check embeddings...
Processing posts...


188it [00:00, 215.06it/s]



Processing spa
Generating fact check embeddings...
Processing posts...


615it [00:08, 71.34it/s]



Processing eng
Generating fact check embeddings...
Processing posts...


478it [00:31, 15.37it/s]



Processing por
Generating fact check embeddings...
Processing posts...


302it [00:03, 85.91it/s]



Processing tha
Generating fact check embeddings...
Processing posts...


42it [00:00, 475.04it/s]



Processing deu
Generating fact check embeddings...
Processing posts...


83it [00:00, 181.65it/s]



Processing msa
Generating fact check embeddings...
Processing posts...


105it [00:00, 114.54it/s]



Processing ara
Generating fact check embeddings...
Processing posts...


78it [00:00, 93.17it/s]



Saving predictions...


### T5 Model

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
from sklearn.metrics.pairwise import cosine_similarity
import re
import os
from sentence_transformers import SentenceTransformer

class GTRRetriever:
    def __init__(self, model_name='sentence-transformers/gtr-t5-large'):
        """Initialize GTR-T5 retriever with specified model."""
        self.model = SentenceTransformer(model_name)
    
    def preprocess_text(self, text):
        """Clean and preprocess text."""
        if pd.isna(text):
            return ""
        # Convert to string
        text = str(text)
        # Remove special characters
        text = re.sub(r'[^\w\s]', ' ', text)
        # Remove extra whitespace
        text = ' '.join(text.split())
        return text.lower()
    
    def get_embedding(self, text):
        """Get GTR-T5 embedding for a single text."""
        text = self.preprocess_text(text)
        if not text:
            return np.zeros(768)  # GTR-T5-Large has 768 dimensions
        return self.model.encode(text)
    
    def get_embeddings(self, texts, batch_size=32):
        """Get GTR-T5 embeddings for a batch of texts."""
        # Clean texts
        processed_texts = [self.preprocess_text(text) for text in texts]
        # Replace empty texts with a space to avoid errors
        processed_texts = [text if text else " " for text in processed_texts]
        # Use model's built-in batching
        return self.model.encode(processed_texts, batch_size=batch_size)

def process_fact_check(row):
    """Process fact check text from row."""
    try:
        claim = str(row.get('claim', ''))
        title = str(row.get('title', ''))
        return f"{claim} {title}"
    except:
        return ""

def process_post(row):
    """Process post text from row."""
    try:
        return str(row.get('text', ''))
    except:
        return ""

def main():
    print("Loading GTR-T5 retriever...")
    retriever = GTRRetriever()
    
    print("Loading datasets...")
    fact_checks = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/fact_checks.csv')
    posts = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/posts.csv')
    
    with open('tasks.json') as f:
        tasks = json.load(f)
    
    all_predictions = {}
    
    for language in tasks['monolingual'].keys():
        print(f"\nProcessing {language}")
        
        # Filter fact checks for language
        valid_fact_checks = tasks['monolingual'][language]['fact_checks']
        language_fact_checks = fact_checks[fact_checks['fact_check_id'].isin(valid_fact_checks)]
        
        # Get fact check embeddings
        print("Generating fact check embeddings...")
        fact_check_texts = [process_fact_check(row) for _, row in language_fact_checks.iterrows()]
        fact_check_vectors = retriever.get_embeddings(fact_check_texts)
        fact_check_ids = language_fact_checks['fact_check_id'].tolist()
        
        # Process dev posts
        dev_post_ids = tasks['monolingual'][language]['posts_dev']
        dev_posts = posts[posts['post_id'].isin(dev_post_ids)]
        
        print("Processing posts...")
        for _, post in tqdm(dev_posts.iterrows()):
            post_text = process_post(post)
            post_vector = retriever.get_embeddings([post_text])
            
            # Calculate similarities and get top matches
            similarities = cosine_similarity(post_vector, fact_check_vectors).flatten()
            top_indices = np.argsort(similarities)[-10:][::-1]
            
            # Store predictions
            post_id = str(int(post['post_id']))
            predictions = [str(fact_check_ids[i]) for i in top_indices]
            all_predictions[post_id] = predictions
    
    print("\nSaving predictions...")
    os.makedirs('predictions', exist_ok=True)
    with open('predictions/monolingual_predictions_gtr.json', 'w') as f:
        json.dump(all_predictions, f)

if __name__ == "__main__":
    main()
    
# Runtime: 19 minutes

Loading GTR-T5 retriever...
Loading datasets...

Processing fra
Generating fact check embeddings...
Processing posts...


188it [00:06, 30.10it/s]



Processing spa
Generating fact check embeddings...
Processing posts...


615it [00:23, 25.78it/s]



Processing eng
Generating fact check embeddings...
Processing posts...


478it [01:19,  6.04it/s]



Processing por
Generating fact check embeddings...
Processing posts...


302it [00:14, 21.15it/s]



Processing tha
Generating fact check embeddings...
Processing posts...


42it [00:01, 38.31it/s]



Processing deu
Generating fact check embeddings...
Processing posts...


83it [00:02, 31.84it/s]



Processing msa
Generating fact check embeddings...
Processing posts...


105it [00:03, 30.79it/s]



Processing ara
Generating fact check embeddings...
Processing posts...


78it [00:02, 27.54it/s]



Saving predictions...


### distilBERT

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
from sklearn.metrics.pairwise import cosine_similarity
import re
import os
from sentence_transformers import SentenceTransformer

class DistilBERTRetriever:
    def __init__(self, model_name='distilbert-base-nli-stsb-mean-tokens'):
        """Initialize DistilBERT retriever with specified model."""
        self.model = SentenceTransformer(model_name)
    
    def preprocess_text(self, text):
        """Clean and preprocess text."""
        if pd.isna(text):
            return ""
        # Convert to string
        text = str(text)
        # Remove special characters
        text = re.sub(r'[^\w\s]', ' ', text)
        # Remove extra whitespace
        text = ' '.join(text.split())
        return text.lower()
    
    def get_embeddings(self, texts):
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state[:, 0, :].numpy()
    
    def get_embeddings(self, texts, batch_size=32):
        """Get DistilBERT embeddings for a batch of texts."""
        # Clean texts
        processed_texts = [self.preprocess_text(text) for text in texts]
        # Replace empty texts with a space to avoid errors
        processed_texts = [text if text else " " for text in processed_texts]
        # Use model's built-in batching with show_progress_bar
        return self.model.encode(processed_texts, batch_size=batch_size, show_progress_bar=True)

def process_fact_check(row):
    """Process fact check text from row."""
    try:
        claim = str(row.get('claim', ''))
        title = str(row.get('title', ''))
        return f"{claim} {title}"
    except:
        return ""

def process_post(row):
    """Process post text from row."""
    try:
        return str(row.get('text', ''))
    except:
        return ""

def main():
    print("Loading DistilBERT retriever...")
    retriever = DistilBERTRetriever()
    
    print("Loading datasets...")
    fact_checks = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/fact_checks.csv')
    posts = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/posts.csv')
    
    with open('tasks.json') as f:
        tasks = json.load(f)
    
    all_predictions = {}
    
    for language in tasks['monolingual'].keys():
        print(f"\nProcessing {language}")
        
        # Filter fact checks for language
        valid_fact_checks = tasks['monolingual'][language]['fact_checks']
        language_fact_checks = fact_checks[fact_checks['fact_check_id'].isin(valid_fact_checks)]
        
        # Get fact check embeddings
        print("Generating fact check embeddings...")
        fact_check_texts = [process_fact_check(row) for _, row in language_fact_checks.iterrows()]
        fact_check_vectors = retriever.get_embeddings(fact_check_texts)
        fact_check_ids = language_fact_checks['fact_check_id'].tolist()
        
        # Process dev posts
        dev_post_ids = tasks['monolingual'][language]['posts_dev']
        dev_posts = posts[posts['post_id'].isin(dev_post_ids)]
        
        print("Processing posts...")
        for _, post in tqdm(dev_posts.iterrows()):
            post_text = process_post(post)
            post_vector = retriever.get_embeddings([post_text])
            
            # Calculate similarities and get top matches
            similarities = cosine_similarity(post_vector, fact_check_vectors).flatten()
            top_indices = np.argsort(similarities)[-10:][::-1]
            
            # Store predictions
            post_id = str(int(post['post_id']))
            predictions = [str(fact_check_ids[i]) for i in top_indices]
            all_predictions[post_id] = predictions
    
    print("\nSaving predictions...")
    os.makedirs('predictions', exist_ok=True)
    with open('predictions/monolingual_predictions_distilBERT.json', 'w') as f:
        json.dump(all_predictions, f)

if __name__ == "__main__":
    main()
    
# Runtime: 4 minutes

In [None]:
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import CrossEncoder
import torch
import json
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os

class EnhancedRetriever:
    def __init__(self):
        # Bi-encoder for initial retrieval
        self.tokenizer = AutoTokenizer.from_pretrained('microsoft/mdeberta-v3-base')
        self.model = AutoModel.from_pretrained('microsoft/mdeberta-v3-base')
        # Cross-encoder for reranking
        self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    
    def get_embeddings(self, texts):
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state[:, 0, :].numpy()
    
    def rerank(self, query, candidates):
        pairs = [[query, candidate] for candidate in candidates]
        scores = self.cross_encoder.predict(pairs)
        return scores

def process_fact_check(row):
    """Process fact check text from row."""
    try:
        claim = str(row.get('claim', ''))
        title = str(row.get('title', ''))
        return f"{claim} {title}"
    except:
        return ""

def process_post(row):
    """Process post text from row."""
    try:
        return str(row.get('text', ''))
    except:
        return ""

def main():
    retriever = EnhancedRetriever()
    # ...existing code...
    # After getting top_indices:
    print("Loading datasets...")
    fact_checks = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/fact_checks.csv')
    posts = pd.read_csv('/home/csgrads/syed0093/SemEval_Task7/Task_Data/posts.csv')
    
    with open('tasks.json') as f:
        tasks = json.load(f)
    
    all_predictions = {}
    
    for language in tasks['monolingual'].keys():
        print(f"\nProcessing {language}")
        
        # Filter fact checks for language
        valid_fact_checks = tasks['monolingual'][language]['fact_checks']
        language_fact_checks = fact_checks[fact_checks['fact_check_id'].isin(valid_fact_checks)]
        
        # Get fact check embeddings
        print("Generating fact check embeddings...")
        fact_check_texts = [process_fact_check(row) for _, row in language_fact_checks.iterrows()]
        fact_check_vectors = retriever.get_embeddings(fact_check_texts)
        fact_check_ids = language_fact_checks['fact_check_id'].tolist()
        
        # Process dev posts
        dev_post_ids = tasks['monolingual'][language]['posts_dev']
        dev_posts = posts[posts['post_id'].isin(dev_post_ids)]
        
        print("Processing posts...")
        for _, post in tqdm(dev_posts.iterrows()):
            post_text = process_post(post)
            post_vector = retriever.get_embeddings([post_text])
            
            # Calculate similarities and get top matches
            similarities = cosine_similarity(post_vector, fact_check_vectors).flatten()
            top_indices = np.argsort(similarities)[-10:][::-1]
            
            # Store predictions
            post_id = str(int(post['post_id']))
            reranked_scores = retriever.rerank(post_text, [fact_checks.iloc[i] for i in top_indices])
            final_indices = top_indices[np.argsort(reranked_scores)[::-1]]
            predictions = [str(fact_check_ids[i]) for i in final_indices]
            all_predictions[post_id] = predictions
    
    print("\nSaving predictions...")
    os.makedirs('predictions', exist_ok=True)
    with open('predictions/monolingual_predictions_enhanced_retriever.json', 'w') as f:
        json.dump(all_predictions, f)

if __name__ == "__main__":
    main()



Loading datasets...


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Processing fra
Generating fact check embeddings...
