## Pre-Processing 

In [5]:
"""
This is just a quick script that is able to load the files. Just using pandas can be tricky because of the newline characters in the text data. Here it is handled via the `parse_col` method.
"""

import ast
import os
import re
import pandas as pd

our_dataset_path = '.'

posts_path = os.path.join(our_dataset_path, 'posts.csv')
fact_checks_path = os.path.join(our_dataset_path, 'fact_checks.csv')
fact_check_post_mapping_path = os.path.join(our_dataset_path, 'pairs.csv')

for path in [posts_path, fact_checks_path, fact_check_post_mapping_path]:
    assert os.path.isfile(path)

# We need to apply t = t.replace('\n', '\\n') for text fields before using `ast.literal_eval`.
# `ast.literal_eval` has problems when there are new lines in the text, e.g.:
# `ast.literal_eval('("\n")')` effectively tries to interpret the following code:

# ```
# ("
# ")
# ```

# This raises a SyntaxError exception. By escaping new lines we are able to force it to interpret it properly. There might
# be some other way to do this more systematically, but it is a workable fix for now.

parse_col = lambda s: ast.literal_eval(s.replace('\n', '\\n')) if s else s

df_fact_checks = pd.read_csv(fact_checks_path).fillna('').set_index('fact_check_id')
for col in ['claim', 'instances', 'title']:
    df_fact_checks[col] = df_fact_checks[col].apply(parse_col)


df_posts = pd.read_csv(posts_path).fillna('').set_index('post_id')
for col in ['instances', 'ocr', 'verdicts', 'text']:
    df_posts[col] = df_posts[col].apply(parse_col)


df_fact_check_post_mapping = pd.read_csv(fact_check_post_mapping_path) 

In [6]:
#load tasks.json file and from it extract monlingual  posts_train, posts_dev, posts_test, fact_checks_train, fact_checks_dev, posts_dev
import json
with open('tasks.json') as f:
    data_tasks = json.load(f)

data_tasks = data_tasks['monolingual']

In [7]:
#load tasks.json file and from it extract monlingual  posts_train, posts_dev, posts_test, fact_checks_train, fact_checks_dev, posts_dev
import json
with open('tasks.json') as f:
    data_tasks = json.load(f)

data_tasks = data_tasks['monolingual']

#make them dataframes
fact_checks_ = []
posts__train = []
posts__dev = []
for key , value in data_tasks.items():
    #append the values of the array to the list no the list itself
    fact_checks_.extend(value['fact_checks'])
    posts__train.extend(value['posts_train'])
    posts__dev.extend(value['posts_dev'])

In [8]:
len(fact_checks_), len(posts__train), len(posts__dev)

(153743, 17016, 1891)

In [9]:
with open('monolingual_predictions.json') as f:
    monolingual_predictions_file = json.load(f)



In [10]:
# df_fact_checks_ where fact_check_id is in fact_checks_ list . use fact_check_id as index
df_fact_checks_ = df_fact_checks[df_fact_checks.index.isin(fact_checks_)]
from sklearn.model_selection import train_test_split
posts__train, posts__validate = train_test_split(posts__train, test_size=0.2, random_state=42)
df_posts__train = df_posts[df_posts.index.isin(posts__train)]
df_posts__validate = df_posts[df_posts.index.isin(posts__validate)]
df_posts__dev = df_posts[df_posts.index.isin(posts__dev)]

#inner join df_posts__train on index and df_fact_check_post_mapping on post_id

In [11]:
df_posts__dev.head()

Unnamed: 0_level_0,instances,ocr,verdicts,text
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"[(1586139153.0, fb)]",[(!! WARNING !! A new thing circulating now. P...,[False information],
48,"[(1657229688.0, fb)]","[(""Un pueblo que elige corruptos, impostores, ...",[Partly false information],
61,"[(1630863318.0, fb)]",[(#Baerbock bei #1 Live im Radio „Wir müssen d...,[False information],
122,"[(1627342737.0, fb)]",[(07/21/2021: Lab Alert: Changes to CDC RT-PCR...,[False information],
124,"[(1605538417.0, fb)]",[(07:14 f Vejam que MARAVILHA está acontecendo...,[Missing context],


In [12]:
#print the ocr of the first post
df_posts__dev.iloc[235]['text'][1]

'#AsianPneumonia'

In [13]:
#add a colums data to df_posts__train, df_posts__validate, df_posts__dev 
#where if text!= '' then data = text else data = ocr. if text==" " and ocr==" " then remove the row
df_posts__train['text'] = df_posts__train.apply(lambda x: x['text'][1] if x['text'] != '' else '', axis=1)
df_posts__validate['text'] = df_posts__validate.apply(lambda x: x['text'][1] if x['text'] != '' else '', axis=1)
df_posts__dev['text'] = df_posts__dev.apply(lambda x: x['text'][1] if x['text'] != '' else '', axis=1)
df_posts__train['ocr'] = df_posts__train.apply(lambda x: x['ocr'][0][1] if len(x['ocr'])!=0 else '', axis=1)
df_posts__validate['ocr'] = df_posts__validate.apply(lambda x: x['ocr'][0][1] if len(x['ocr'])!=0 else '', axis=1)
df_posts__dev['ocr'] = df_posts__dev.apply(lambda x: x['ocr'][0][1] if len(x['ocr'])!=0 else '', axis=1)
df_posts__train['data'] = df_posts__train.apply(lambda x: x['text'] if x['text'] != '' else x['ocr'], axis=1)
df_posts__train = df_posts__train[df_posts__train['data']!='']
df_posts__validate['data'] = df_posts__validate.apply(lambda x: x['text'] if x['text'] != '' else x['ocr'], axis=1)
df_posts__validate = df_posts__validate[df_posts__validate['data']!='']
df_posts__dev['data'] = df_posts__dev.apply(lambda x: x['text'] if x['text'] != '' else x['ocr'], axis=1)
df_posts__dev = df_posts__dev[df_posts__dev['data']!='']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_posts__train['text'] = df_posts__train.apply(lambda x: x['text'][1] if x['text'] != '' else '', axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_posts__validate['text'] = df_posts__validate.apply(lambda x: x['text'][1] if x['text'] != '' else '', axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

In [14]:
import re
import string


simple_latin = string.ascii_lowercase + string.ascii_uppercase
dirty_chars = string.digits + string.punctuation


def is_clean_text(text: str) -> bool:
    """
    Simple text cleaning method.
    """
    dirty = (
        len(text) < 25                                               # Short text
        or
        0.5 < sum(char in dirty_chars for char in text) / len(text)  # More than 50% dirty chars                                            
    )
    return not dirty


url_regex = re.compile(
    r'(?:^|(?<![\w\/\.]))'
    r'(?:(?:https?:\/\/|ftp:\/\/|www\d{0,3}\.))'
    r'(?:\S+(?::\S*)?@)?' r'(?:'
    r'(?!(?:10|127)(?:\.\d{1,3}){3})'
    r'(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})'
    r'(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})'
    r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])'
    r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}'
    r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))'
    r'|'
    r'(?:(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)'
    r'(?:\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*'
    r'(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))' r'|' r'(?:(localhost))' r')'
    r'(?::\d{2,5})?'
    r'(?:\/[^\)\]\}\s]*)?',
    flags=re.IGNORECASE,
)


def remove_urls(text: str) -> str:
    return url_regex.sub('', text)


# Source: https://gist.github.com/Nikitha2309/15337f4f593c4a21fb0965804755c41d
emoji_regex = re.compile('['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002500-\U00002BEF'  # chinese char
        u'\U00002702-\U000027B0'
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        u'\U0001f926-\U0001f937'
        u'\U00010000-\U0010ffff'
        u'\u2640-\u2642'
        u'\u2600-\u2B55'
        u'\u200d'
        u'\u23cf'
        u'\u23e9'
        u'\u231a'
        u'\ufe0f'  # dingbats
        u'\u3030'
    ']+')


def remove_emojis(text: str) -> str:
    return emoji_regex.sub('', text)


sentence_stop_regex = re.compile('['
    u'\u002e' # full stop
    u'\u2026' # ellipsis
    u'\u061F' # arabic question mark
    u'\u06D4' # arabic full stop
    u'\u2022' # bullet point
    u'\u3002' # chinese period
    u'\u25CB' # white circle
    '\|'      # pipe
']+')


def replace_stops(text: str) -> str:
    """
    Replaces some characters that are being used to end sentences. Used for sentence segmentation with sliding windows.
    """
    return sentence_stop_regex.sub('.', text)


whitespace_regex = re.compile(r'\s+')


def replace_whitespaces(text: str) -> str:
    return whitespace_regex.sub(' ', text)


def clean_ocr(ocr: str) -> str:
    """
    Remove all lines that are shorter than 6 and have more than 50% `dirty_chars`.
    """
    return '\n'.join(
        line
        for line in ocr.split('\n')
        if len(line) > 5 and sum(char in dirty_chars for char in line) / len(line) < 0.5
    )


def clean_twitter_picture_links(text):
    """
    Replaces links to picture in twitter post only with 'pic'. 
    """
    return re.sub(r'pic.twitter.com/\S+', 'pic', text)


def clean_twitter_links(text):
    """
    Replaces twitter links with 't.co'.
    """
    return re.sub(r'\S+//t.co/\S+', 't.co', text)


def remove_elongation(text):
    """
    Replaces any occurrence of a string of consecutive identical non-space 
    characters (at least three in a row) with just one instance of that character.
    """
    text = re.sub(r'(\S+)\1{2,}', r'\1', text)
    return text

In [15]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = remove_urls(text)

    # Remove emojis
    text = remove_emojis(text)

    # Replace sentence stops with a period
    text = replace_stops(text)

    # Replace multiple whitespaces with a single space
    text = replace_whitespaces(text)

    # Remove lines shorter than 6 and with more than 50% dirty characters
    text = clean_ocr(text)

    # Replace Twitter picture links with 'pic'
    text = clean_twitter_picture_links(text)

    # Replace Twitter links with 't.co'
    text = clean_twitter_links(text)

    # Remove elongation (repeated characters)
    text = remove_elongation(text)

    return text


In [16]:
import re
import nltk
from nltk.stem import PorterStemmer
# def preprocess_text(text):
#     #lowercase
    
#     text = text.lower()
#     #remove special characters
#     text = re.sub(r'[^a-z\s]', '', text)
#     stemmer = PorterStemmer()  
#     text = ' '.join([stemmer.stem(word) for word in text.split()]) 
#     return text

df_posts__train['data'] = df_posts__train['data'].apply(preprocess_text)
df_posts__validate['data'] = df_posts__validate['data'].apply(preprocess_text)
df_posts__dev['data'] = df_posts__dev['data'].apply(preprocess_text)



In [17]:
df_posts__dev.head()

Unnamed: 0_level_0,instances,ocr,verdicts,text,data
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"[(1586139153.0, fb)]",!! WARNING !! A new thing circulating now. Peo...,[False information],,!! warning !! a new thing circulating now. peo...
48,"[(1657229688.0, fb)]","""A people who choose corrupt, impostors, thiev...",[Partly false information],,"""a people who choose corrupt, impostors, thiev..."
61,"[(1630863318.0, fb)]",#Baerbock at #1 Live im radio “We have to adva...,[False information],,#baerbock at #1 live im radio “we have to adva...
122,"[(1627342737.0, fb)]",07/21/2021: Lab Alert: Changes to CDC RT-PCR f...,[False information],,07/21/2021: lab alert: changes to cdc rt-pcr f...
124,"[(1605538417.0, fb)]",07:14 pm Look how WONDERFUL it is happening in...,[Missing context],,07:14 pm look how wonderful it is happening in...


In [18]:
df_fact_check_post_mapping = df_fact_check_post_mapping.set_index('post_id')
df_posts__train = df_posts__train.join(df_fact_check_post_mapping, how='inner')

#inner join df_posts__validate on index and df_fact_check_post_mapping on post_id
df_posts__validate = df_posts__validate.join(df_fact_check_post_mapping, how='inner')
# in column claim of df_fact_checks_, extract the second element of the list and replace the column claim with this element
df_fact_checks_["claim"] = df_fact_checks_["claim"].apply(lambda x: x[1])
df_fact_checks_.drop(columns=['instances', 'title'], inplace=True)


In [19]:
df_fact_checks_.iloc[12]

claim    !! He drinks imported mineral water, Evian boy...
Name: 12, dtype: object

In [20]:
df_posts__dev.drop(columns=['instances', 'ocr', 'verdicts', 'text'], inplace=True)
df_posts__dev['post_id'] = df_posts__dev.index
df_posts__dev.reset_index(drop=True, inplace=True)
df_posts__dev.head()

Unnamed: 0,data,post_id
0,!! warning !! a new thing circulating now. peo...,1
1,"""a people who choose corrupt, impostors, thiev...",48
2,#baerbock at #1 live im radio “we have to adva...,61
3,07/21/2021: lab alert: changes to cdc rt-pcr f...,122
4,07:14 pm look how wonderful it is happening in...,124


In [21]:
df_posts__train.drop(columns=['instances', 'verdicts','ocr','text'], inplace=True)
#add a column claim to df_posts__train from df_fact_checks_
df_posts__train = df_posts__train.join(df_fact_checks_, on='fact_check_id', how='inner')
#add a colmun post_id to df_posts__train from index
df_posts__train['post_id'] = df_posts__train.index
#reset index of df_posts__train
df_posts__train.reset_index(drop=True, inplace=True)
#make post od as first column
df_posts__train = df_posts__train[['post_id','data','fact_check_id', 'claim']]

# in column ocr of df_posts__train, extract the second element of the list and replace the column ocr with this element
df_posts__train.head()

Unnamed: 0,post_id,data,fact_check_id,claim
0,0,! brazen vaccination fake by markus söder! it'...,87108,Markus Söder faked his vaccination.
1,4,"""blessed are those persecuted by me cause ""the...",80729,The photo shows nuns arrested for participatin...
2,5,"""cigarette smoking does not cause cancer."" -ce...",33862,CDC said cigarette smoking doesn't cause cance...
3,6,"""eat what you want on easter, the sacrifice is...",50769,"The Uruguayan priest ""Gordo"" Verde said to ""ea..."
4,9,"""the need will die force people to bend!"" that...",150241,"Wolfgang Schäuble said: ""Necessity will force ..."


In [22]:
#do same for df_posts__validate
df_posts__validate.drop(columns=['instances', 'verdicts','ocr','text'], inplace=True)
df_posts__validate = df_posts__validate.join(df_fact_checks_, on='fact_check_id', how='inner')
df_posts__validate['post_id'] = df_posts__validate.index
df_posts__validate.reset_index(drop=True, inplace=True)
df_posts__validate = df_posts__validate[['post_id','data', 'fact_check_id', 'claim']]

df_posts__validate.head()

Unnamed: 0,post_id,data,fact_check_id,claim
0,2,"""actually, he's a damn sight better than any o...",93524,New Zealand opposition leader Judith Collins p...
1,7,"speech by pedro castillo, it was based on the ...",56968,Felipe VI said that Pedro Castillo's speech is...
2,8,"""we must be solidarity with those who least ha...",148668,"Wado de Pedro: ""We must be in Solidarity with ..."
3,15,"""environmentalists"" say fracking is evil this ...",153628,environmentalists hypocritical over stance on ...
4,17,"""out of work, everything will be forbidden wal...",52407,"In ""1984"", Orwell wrote that ""having fun, sing..."


In [23]:
df_fact_checks_['fact_check_id'] = df_fact_checks_.index
df_fact_checks_.reset_index(drop=True, inplace=True)
df_fact_checks_['claim'] = df_fact_checks_['claim'].apply(preprocess_text)

In [24]:
df_fact_checks_.head()

Unnamed: 0,claim,fact_check_id
0,are avocados good for you?,0
1,can animals have headaches?,1
2,can we help prevent alzheimer's with diet?,2
3,do any benefits of alcohol outweigh the risks?,3
4,does acupuncture work for headaches?,4


In [25]:
df_posts__train.head()

Unnamed: 0,post_id,data,fact_check_id,claim
0,0,! brazen vaccination fake by markus söder! it'...,87108,Markus Söder faked his vaccination.
1,4,"""blessed are those persecuted by me cause ""the...",80729,The photo shows nuns arrested for participatin...
2,5,"""cigarette smoking does not cause cancer."" -ce...",33862,CDC said cigarette smoking doesn't cause cance...
3,6,"""eat what you want on easter, the sacrifice is...",50769,"The Uruguayan priest ""Gordo"" Verde said to ""ea..."
4,9,"""the need will die force people to bend!"" that...",150241,"Wolfgang Schäuble said: ""Necessity will force ..."


In [26]:
df_posts__train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15996 entries, 0 to 15995
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   post_id        15996 non-null  int64 
 1   data           15996 non-null  object
 2   fact_check_id  15996 non-null  int64 
 3   claim          15996 non-null  object
dtypes: int64(2), object(2)
memory usage: 500.0+ KB


### BM25 API

In [59]:
import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.metrics import accuracy_score
from typing import List, Dict

class BM25Retriever:
    def __init__(self, k1: float = 1.2, b: float = 0.75):
        self.k1 = k1
        self.b = b
        self.bm25 = None
        
    def fit(self, df_fact_checks: pd.DataFrame):
        """Fit the BM25 model with fact-check claims."""
        self.fact_check_ids = df_fact_checks['fact_check_id'].tolist()
        tokenized_claims = [claim.split() for claim in df_fact_checks_['claim']]
        self.bm25 = BM25Okapi(tokenized_claims, k1=self.k1, b=self.b)

    def retrieve_top_k(self, post_data: str, k) -> List[int]:
        """Retrieve the top k fact_check_ids for a given post."""
        tokenized_post = post_data.split()
        scores = self.bm25.get_scores(tokenized_post)
        top_k_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
        return top_k_indices


class BM25Evaluator:
    def __init__(self, retriever: BM25Retriever):
        self.retriever = retriever

    def evaluate_success_at_10(self, df_posts__validate: pd.DataFrame) -> Dict[str, float]:
        """Evaluate the success@10 metric and return the average score and top 10 fact_check_ids."""
        success_at_10 = 0
        top_10_results = {}

        for _, row in df_posts__validate.iterrows():
            post_id = row['post_id']
            # correct_fact_id = row['fact_check_id']
            retrieved_fact_ids = self.retriever.retrieve_top_k(row['data'], k=10)

            # Check if the correct fact ID is in the top 10 retrieved results
            if row['fact_check_id'] in retrieved_fact_ids:
                success_at_10 += 1

        #     # Store the retrieved fact IDs for this post
            top_10_results[post_id] = retrieved_fact_ids

        # Calculate the average success@10 score
        avg_success_at_10 = success_at_10 / len(df_posts__validate)
        
        # return {'average_score': avg_success_at_10, 'top_10_results': top_10_results}
        return top_10_results , avg_success_at_10

# bm25_retriever = BM25Retriever(k1=1.2, b=0.75)
# bm25_retriever.fit(df_fact_checks_)
# with open('BM25API.pkl', 'wb') as f:
#     pickle.dump(bm25_retriever, f)
# Initialize retriever with BM25 param

In [2]:
#save model to file 
import pickle


bm25_retriever_loaded = pickle.load(open('BM25API.pkl', 'rb'))
# Initialize evaluator and evaluate success@10
evaluator = BM25Evaluator(bm25_retriever_loaded)
results ,top10 = evaluator.evaluate_success_at_10(df_posts__validate)
top10



0.573456674


### FAISS

In [3]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score
import os
import pickle

# Load your data
# df_fact_checks_ = pd.DataFrame(...) # Contains columns 'claim' and 'fact_check_id'
# df_posts__train = pd.DataFrame(...) # Contains columns 'post_id', 'data', and 'fact_check_id'

# Initialize the sentence encoder model (e.g., "all-MiniLM-L6-v2")
model = SentenceTransformer('all-MiniLM-L6-v2')

# Path to store the FAISS index file
index_file_path = 'fact_check_index.faiss'
id_to_fact_check = {}  # Dictionary to store the mapping from fact check index to fact_check_id

# Step 1: Check if the FAISS index already exists, otherwise create it
if not os.path.exists(index_file_path):
    # Encode claims and build the vector database
    claim_embeddings = model.encode(df_fact_checks_['claim'].tolist(), convert_to_tensor=False)
    claim_embeddings = np.array(claim_embeddings).astype('float32')
    
    # Step 2: Initialize FAISS index
    dimension = claim_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    
    # Add claim embeddings to index
    index.add(claim_embeddings)
    
    # Save fact_check_id metadata in id_to_fact_check dictionary
    id_to_fact_check = {i: fact_id for i, fact_id in enumerate(df_fact_checks_['fact_check_id'].tolist())}
    
    # Save the FAISS index and metadata
    faiss.write_index(index, index_file_path)
    with open('fact_check_id_mapping.pkl', 'wb') as f:
        pickle.dump(id_to_fact_check, f)
else:
    # Load the FAISS index and metadata if already created
    index = faiss.read_index(index_file_path)
    with open('fact_check_id_mapping.pkl', 'rb') as f:
        id_to_fact_check = pickle.load(f)

# Step 3: Prepare to evaluate using Success@10
top_10_results = {}  # Dictionary to store results for each post

# Encode each post as query
for _, post in df_posts__validate.iterrows():
    post_id = post['post_id']
    correct_fact_id = post['fact_check_id']
    
    # Get post embedding
    post_embedding = model.encode([post['data']], convert_to_tensor=False)
    post_embedding = np.array(post_embedding).astype('float32')
    
    # Step 4: Retrieve top 10 fact_check_ids for the post
    _, top_10_indices = index.search(post_embedding, k=10)
    top_10_fact_check_ids = [id_to_fact_check[idx] for idx in top_10_indices[0]]
    top_10_results[post_id] = top_10_fact_check_ids
    
    # Check if the correct fact_check_id is in the top 10 results
    top_10_results[post_id] = 1 if correct_fact_id in top_10_fact_check_ids else 0

# Step 5: Calculate the average score
average_score = np.mean(list(top_10_results.values()))

# Results
print("Average Success@10 Score:", average_score)
# print("Top 10 Fact Check IDs for each post:", top_10_results)

# Return the results
# average_score, top_10_results


Average Success@10 Score: 0.632335235


In [4]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import os
import pickle

# Load your data
# df_fact_checks_ = pd.DataFrame(...) # Contains columns 'claim' and 'fact_check_id'
# df_posts__validate = pd.DataFrame(...) # Contains columns 'post_id', 'data', and 'fact_check_id'

# Initialize the sentence encoder model (e.g., "all-mpnet-base-v2") and move to GPU
model = SentenceTransformer('all-mpnet-base-v2')
model.to('cuda')  # Move to GPU for faster encoding

# Path to store the FAISS index file
index_file_path = 'fact_check_index_mpnet.faiss'
id_to_fact_check = {}  # Dictionary to store the mapping from fact check index to fact_check_id

# Step 1: Check if the FAISS index already exists, otherwise create it
if not os.path.exists(index_file_path):
    # Encode claims and build the vector database
    claim_embeddings = model.encode(
        df_fact_checks_['claim'].tolist(), 
        convert_to_tensor=False,
        device='cuda'  # Use GPU for encoding
    )
    claim_embeddings = np.array(claim_embeddings).astype('float32')

    # Step 2: Initialize FAISS index on GPU
    dimension = claim_embeddings.shape[1]
    res = faiss.StandardGpuResources()  # Initialize GPU resources
    index_flat = faiss.IndexFlatL2(dimension)  # Base CPU index
    index = faiss.index_cpu_to_gpu(res, 0, index_flat)  # Move to GPU
    
    # Add claim embeddings to the FAISS index
    index.add(claim_embeddings)
    
    # Save fact_check_id metadata in id_to_fact_check dictionary
    id_to_fact_check = {i: fact_id for i, fact_id in enumerate(df_fact_checks_['fact_check_id'].tolist())}
    
    # Save the FAISS index and metadata
    faiss.write_index(faiss.index_gpu_to_cpu(index), index_file_path)  # Save as CPU index
    with open('fact_check_id_mapping_mpnet.pkl', 'wb') as f:
        pickle.dump(id_to_fact_check, f)
else:
    # Load the FAISS index and metadata if already created
    index = faiss.read_index(index_file_path)
    with open('fact_check_id_mapping_mpnet.pkl', 'rb') as f:
        id_to_fact_check = pickle.load(f)

    # Move the index back to GPU
    res = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(res, 0, index)

# Step 3: Prepare to evaluate using Success@10
top_10_results = {}  # Dictionary to store results for each post

# Encode each post as query
for _, post in df_posts__validate.iterrows():
    post_id = post['post_id']
    correct_fact_id = post['fact_check_id']
    
    # Get post embedding using GPU
    post_embedding = model.encode(
        [post['data']], 
        convert_to_tensor=False,
        device='cuda'  # Use GPU for encoding
    )
    post_embedding = np.array(post_embedding).astype('float32')
    
    # Step 4: Retrieve top 10 fact_check_ids for the post using FAISS GPU index
    _, top_10_indices = index.search(post_embedding, k=10)
    top_10_fact_check_ids = [id_to_fact_check[idx] for idx in top_10_indices[0]]
    top_10_results[post_id] = top_10_fact_check_ids
    
    # Check if the correct fact_check_id is in the top 10 results
    top_10_results[post_id] = 1 if correct_fact_id in top_10_fact_check_ids else 0

# Step 5: Calculate the average score
average_score = np.mean(list(top_10_results.values()))

# Results
print("Average Success@10 Score:", average_score)



Average Success@10 Score: 0.653462345


In [33]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score
import os
import pickle

# Load your data
# df_fact_checks_ = pd.DataFrame(...) # Contains columns 'claim' and 'fact_check_id'
# df_posts__train = pd.DataFrame(...) # Contains columns 'post_id', 'data', and 'fact_check_id'

# Initialize the sentence encoder model (e.g., "all-MiniLM-L6-v2")
model = SentenceTransformer('all-MiniLM-L6-v2')

# Path to store the FAISS index file
index_file_path = 'fact_check_index.faiss'
# id_to_fact_check = {}  # Dictionary to store the mapping from fact check index to fact_check_id
index = faiss.read_index(index_file_path)
with open('fact_check_id_mapping.pkl', 'rb') as f:
        id_to_fact_check = pickle.load(f)






In [40]:
query = "The fake news of red sky in china is floating on twitter"    
post_embedding = model.encode(query, convert_to_tensor=False)
post_embedding = np.array(post_embedding).astype('float32')

# Step 4: Retrieve top 10 documents for the query
distances, indices = index.search(post_embedding.reshape(1, -1), 10)

top_claims = [df_fact_checks_.iloc[idx]['claim'] for idx in indices[0]]
actual_claim = "images of red sky in china are real."
print("Actual Claim:", actual_claim)
print("Top 10 Retrieved Claims:", )
top_claims

Actual Claim: images of red sky in china are real.
Top 10 Retrieved Claims:


['images of red sky in china are real.',
 'the sky of china is red in a strange phenomenon',
 'picture of the sky in china turning red.',
 'panic at the airport in shanghai - and nobody reports about it? a video shows people in protective suits trying to hold back a crowd. a facebook post about it says: "the fake media don\'t show that!" (archived here) what the user means by "fakemedien", he clarifies in another post: "world, rtl, ntv".',
 '“so now the fake news @nytimes is tracing the coronavirus origins back to europe, not china."',
 'image of newspaper showing trump bombing south china sea',
 'the sky turned blood red in china',
 "news broadcaster admiring china's response to deadly flooding in henan",
 'a marine tsunami hits china during the past few hours in a live broadcast on cnn on the social networking site facebook',
 'the video shows recent flooding at an airport in china']

In [24]:
    #save the claim_embeddings
with open('_before_faiss_claim_embeddings_mpnet.pkl', 'wb') as f:
        pickle.dump(claim_embeddings, f)

## Dual Encoder 

In [1]:
import torch 
import torch.nn as nn
from transformers import AutoModel
from transformers import AutoTokenizer

# Dual Encoder Model with Separate Encoders
class DualEncoder(nn.Module):
    def __init__(self, query_model_name, doc_model_name=None):
        """
        query_model_name: Pre-trained model for query encoder.
        doc_model_name: Pre-trained model for document encoder (defaults to query_model_name if None).
        """
        super(DualEncoder, self).__init__()
        self.query_encoder = AutoModel.from_pretrained(query_model_name)
        self.doc_encoder = AutoModel.from_pretrained(doc_model_name or query_model_name)

    def encode_query(self, query_inputs):
        """
        Encode query using the query encoder.
        """
        query_embeddings = self.query_encoder(**query_inputs).last_hidden_state[:, 0, :]
        query_embeddings = nn.functional.normalize(query_embeddings, p=2, dim=1)
        return query_embeddings

    def encode_document(self, doc_inputs):
        """
        Encode document using the document encoder.
        """
        doc_embeddings = self.doc_encoder(**doc_inputs).last_hidden_state[:, 0, :]
        doc_embeddings = nn.functional.normalize(doc_embeddings, p=2, dim=1)
        return doc_embeddings

    def forward(self, query_inputs, doc_inputs):
        """
        Forward pass to encode both query and document.
        """
        query_embeddings = self.encode_query(query_inputs)
        doc_embeddings = self.encode_document(doc_inputs)
        return query_embeddings, doc_embeddings
        

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
device = "cuda"
# dataset = TripletDataset(training_data, tokenizer)

# Initialize model
model_name = "sentence-transformers/all-mpnet-base-v2"
model = DualEncoder(model_name)
model.load_state_dict(torch.load("best_dual_encoder_mpnet.pt"))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Initialize the model

model.to(device)


DualEncoder(
  (query_encoder): MPNetModel(
    (embeddings): MPNetEmbeddings(
      (word_embeddings): Embedding(30527, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): MPNetEncoder(
      (layer): ModuleList(
        (0-11): 12 x MPNetLayer(
          (attention): MPNetAttention(
            (attn): MPNetSelfAttention(
              (q): Linear(in_features=768, out_features=768, bias=True)
              (k): Linear(in_features=768, out_features=768, bias=True)
              (v): Linear(in_features=768, out_features=768, bias=True)
              (o): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )


In [29]:
from torch.utils.data import DataLoader
from tqdm import tqdm
def predict_with_sentences(model, posts, df_fact_checks_, tokenizer, batch_size=32, device='cuda', top_k=10):
    # Move model to the correct device
    model.to(device)
    model.eval()

    # Step 1: Compute fact-check embeddings in batches
    idx_to_fact_check_id = {idx: fact_check_id for idx, fact_check_id in enumerate(df_fact_checks_.index.tolist())}
    idx_to_fact_check_sentence = {idx: claim for idx, claim in enumerate(df_fact_checks_['claim'].tolist())}
    fact_check_texts = df_fact_checks_['claim'].tolist()
    all_facts_embeddings = []
    
    fact_check_loader = DataLoader(
        fact_check_texts,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=lambda x: tokenizer(x, padding=True, truncation=True, max_length=128, return_tensors="pt")
    )

    with torch.no_grad():
        for batch in tqdm(fact_check_loader, desc="Encoding Fact-Checks"):
            batch = {key: val.to(device) for key, val in batch.items()}
            embeddings = model.encode_document(batch).cpu()
            all_facts_embeddings.append(embeddings)
    
    all_facts_embeddings = torch.cat(all_facts_embeddings, dim=0)  # Shape: [num_facts, embedding_dim]
    
    # Normalize fact-check embeddings
    all_facts_embeddings = all_facts_embeddings / torch.norm(all_facts_embeddings, dim=1, keepdim=True)

    # Step 2: Compute post embeddings and retrieve top-k fact-checks
    top_k_results = {}
    post_loader = DataLoader(
        posts,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=lambda x: tokenizer(x, padding=True, truncation=True, max_length=128, return_tensors="pt")
    )

    with torch.no_grad():
        for i, batch in enumerate(tqdm(post_loader, desc="Predicting Posts")):
            batch = {key: val.to(device) for key, val in batch.items()}
            post_embeddings = model.encode_query(batch).cpu()

            # Normalize post embeddings
            post_embeddings = post_embeddings / torch.norm(post_embeddings, dim=1, keepdim=True)

            # Compute cosine similarity: [batch_size, num_facts]
            similarities = torch.matmul(post_embeddings, all_facts_embeddings.T)

            # Extract top-k indices and map to fact-check IDs and sentences
            for j, similarity_scores in enumerate(similarities):
                post_idx = i * batch_size + j  # Index of the current post in the input list
                top_k_indices = similarity_scores.topk(top_k).indices.tolist()
                top_k_fact_checks = [
                    {
                        "fact_check_id": idx_to_fact_check_id[idx],
                        "sentence": idx_to_fact_check_sentence[idx]
                    }
                    for idx in top_k_indices
                ]
                top_k_results[post_idx] = top_k_fact_checks

    return top_k_results


In [30]:
# Example inputs

posts = ["The fake news of red sky in china is floating on twitter"]  # Replace with actual post texts

# Predict top-k fact-checks for the given posts
predictions = predict_with_sentences(model, posts, df_fact_checks_, tokenizer, top_k=5)

# Output predictions in a nicely formatted way
for post_idx, top_k_fact_checks in predictions.items():
    print(f"Post {post_idx}:")
    for rank, fact in enumerate(top_k_fact_checks, start=1):
        print(f"  Rank {rank}:")
        print(f"    Fact-check ID: {fact['fact_check_id']}")
        print(f"    Sentence: {fact['sentence']}")
    print("-" * 50)  # Separator for readability


Encoding Fact-Checks: 100%|██████████| 4805/4805 [03:23<00:00, 23.60it/s]
Predicting Posts: 100%|██████████| 1/1 [00:00<00:00, 35.08it/s]

Post 0:
  Rank 1:
    Fact-check ID: 137588
    Sentence: the sky of china is red in a strange phenomenon
  Rank 2:
    Fact-check ID: 59015
    Sentence: images of red sky in china are real.
  Rank 3:
    Fact-check ID: 139338
    Sentence: picture of the sky in china turning red.
  Rank 4:
    Fact-check ID: 110349
    Sentence: the sky turned blood red in china
  Rank 5:
    Fact-check ID: 67110
    Sentence: water in a river in eastern china turned red in february 2020
--------------------------------------------------





## Pre-trained COlbert 

In [26]:
from transformers import AutoTokenizer,AutoModel, PreTrainedModel,PretrainedConfig
from typing import Dict
import torch

class ColBERTConfig(PretrainedConfig):
    model_type = "ColBERT"
    bert_model: str
    compression_dim: int = 768
    dropout: float = 0.0
    return_vecs: bool = False
    trainable: bool = True

class ColBERT(PreTrainedModel):
    """
    ColBERT model from: https://arxiv.org/pdf/2004.12832.pdf
    We use a dot-product instead of cosine per term (slightly better)
    """
    config_class = ColBERTConfig
    base_model_prefix = "bert_model"

    def __init__(self,
                 cfg) -> None:
        super().__init__(cfg)
        
        self.bert_model = AutoModel.from_pretrained(cfg.bert_model)

        for p in self.bert_model.parameters():
            p.requires_grad = cfg.trainable
        #if document_embeddings.pkl exists, load ii in self in self.document_embeddings
        if hasattr(self, 'document_embeddings'):
            self.document_vecs = torch.load('document_embeddings.pkl')
        

        self.compressor = torch.nn.Linear(self.bert_model.config.hidden_size, cfg.compression_dim)

    def forward(self,
                query: Dict[str, torch.LongTensor],
                document: Dict[str, torch.LongTensor],fact_check_id):

        query_vecs = self.forward_representation(query)
        # if documenT embedding of the fact_check_id exists, use it, else compute it
        if(self.document_vecs is not None and fact_check_id in self.document_vecs):
            document_vecs = self.document_vecs[fact_check_id]
        else:
            document_vecs = self.forward_representation(document)

        score = self.forward_aggregation(query_vecs,document_vecs,query["attention_mask"],document["attention_mask"])
        return score

    def forward_representation(self,
                               tokens,
                               sequence_type=None) -> torch.Tensor:
        
        vecs = self.bert_model(**tokens)[0] # assuming a distilbert model here
        vecs = self.compressor(vecs)
        # truncate the sequence to the maximum length the model was trained on
        # if encoding only, zero-out the mask values so we can compress storage
        if sequence_type == "doc_encode" or sequence_type == "query_encode": 
            vecs = vecs * tokens["tokens"]["mask"].unsqueeze(-1)

        return vecs

    def forward_aggregation(self,query_vecs, document_vecs,query_mask,document_mask):
        
        # create initial term-x-term scores (dot-product)
        score = torch.bmm(query_vecs, document_vecs.transpose(2,1))

        # mask out padding on the doc dimension (mask by -1000, because max should not select those, setting it to 0 might select them)
        exp_mask = document_mask.bool().unsqueeze(1).expand(-1,score.shape[1],-1)
        score[~exp_mask] = - 10000

        # max pooling over document dimension
        score = score.max(-1).values

        # mask out paddding query values
        score[~(query_mask.bool())] = 0

        # sum over query values
        score = score.sum(-1)

        return score

#
# init the model & tokenizer (using the distilbert tokenizer)
#
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") # honestly not sure if that is the best way to go, but it works :)
model = ColBERT.from_pretrained("sebastian-hofstaetter/colbert-distilbert-margin_mse-T2-msmarco")



In [None]:
#store document embeddings in a dictionary with key as post_id and value as the document embedding  store in a file
import pickle

document_embeddings = {}
def embed_docs_and_append(text, post_id,filename):
    document_input = tokenizer(text,truncation=True, max_length=512,return_tensors="pt")
    #truncate the input to 512 tokens
    document_vecs = model.forward_representation(document_input)

    with open(filename, 'ab') as f:
        pickle.dump({post_id:document_vecs}, f)

i=0
print(len(df_fact_checks_))
for _, claim in df_fact_checks_.iterrows():
    embed_docs_and_append(claim['claim'], claim['fact_check_id'],'document_embeddings.pkl')
    # print(i)
    # i+=1
import pickle
with open('document_embeddings.pkl', 'wb') as f:
    pickle.dump(document_embeddings, f)



In [25]:
# df_posts__validate_short = df_posts__validate[:2]

In [5]:
import pandas as pd
import torch
import os
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer

# nested for loop for every query and every document
answer_dict = {}
for _, query in df_posts__validate.iterrows():
    query_text = query['data']
    # print(query_text)
    query_input = tokenizer(query_text)
    query_input.input_ids += [103] * 8
    query_input.attention_mask += [1] * 8
    query_input["input_ids"] = torch.LongTensor(query_input.input_ids).unsqueeze(0)
    query_input["attention_mask"] = torch.LongTensor(query_input.attention_mask).unsqueeze(0)

    scores = []
    for _, document in df_fact_checks_.iterrows():
        
        document_text = document['claim']
        # print(document_text)
        document_input = tokenizer(document_text,return_tensors="pt")
        # get the score
        score = model.forward(query_input, document_input).squeeze(0)
        scores.append((document['fact_check_id'], float(score)))

        # print(score)
    #give dictionary of key query_id and mapping to a list of top 10 document_id
    top_10 = sorted(scores, key=lambda x: x[1], reverse=True)[:10]
    answer_dict[query['post_id']] = [fact_check_id for fact_check_id, _ in top_10]

save_path = 'answer_dict.json'
with open(save_path, 'w') as f:
    json.dump(answer_dict, f)

# check sucess @10 for the validation set
def success_at_10(answer_dict, df_posts__validate):
    success = 0
    for query_id, top_10 in answer_dict.items():
            if df_posts__validate.loc["post_id"]['fact_check_id'] in top_10:
                success += 1
    return success/len(df_posts__validate)

ans=success_at_10(answer_dict, df_posts__validate)
print(ans)


0.64234534
