In [24]:
# Packages: pandas for tabular data, random for sampling and shuffling
import pandas as pd
import random

In [25]:
def _parse_topic_words(raw_top_words):
    """Turn one ``Top_Words`` cell into a list of bare words.

    The cell is split on ``', '`` and everything from the first ``'('`` of
    each entry onward (the score suffix) is discarded. Surrounding whitespace
    is stripped so both ``"word(0.1)"`` and ``"word (0.1)"`` yield ``"word"``.
    Non-string cells (e.g. NaN from a blank CSV field) yield an empty list.
    """
    if not isinstance(raw_top_words, str):
        return []
    words = []
    for entry in raw_top_words.split(', '):
        word = entry.split('(')[0].strip()
        if word:
            words.append(word)
    return words


def generate_word_intrusion(df, num_real_words=4, seed=None):
    """Build one word-intrusion set per row (topic) of ``df``.

    Each set holds the first ``num_real_words`` words of the topic plus one
    "intruder" word drawn from a different topic, in shuffled order.

    Args:
        df: DataFrame with ``Topic_ID``, ``Topic_Name`` and ``Top_Words``
            columns. ``Top_Words`` is expected to be a ``', '``-separated
            string of ``word(score)`` entries.
        num_real_words: Maximum number of words taken from the topic itself.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible; when ``None`` the global
            ``random`` module state is used, as before.

    Returns:
        DataFrame with columns ``Topic_ID``, ``Topic_Name``,
        ``Words_with_Intruder_Shuffled`` (list of words) and ``Intruder``.
        Topics for which no valid set can be built are skipped with a printed
        message. The columns are present even when no set was produced.
    """
    rng = random if seed is None else random.Random(seed)
    result_columns = ['Topic_ID', 'Topic_Name', 'Words_with_Intruder_Shuffled', 'Intruder']
    all_intrusion_sets = []

    for _, topic_row in df.iterrows():
        current_topic_id = topic_row["Topic_ID"]
        current_topic_name = topic_row["Topic_Name"]

        current_topic_words = _parse_topic_words(topic_row["Top_Words"])
        top_words_from_current_topic = current_topic_words[:max(num_real_words, 0)]

        other_topics_df = df[df['Topic_ID'] != current_topic_id]
        if other_topics_df.empty:
            print(f"No other topics found to select an intruder for Topic ID: {current_topic_id}. Skipping.")
            continue

        # A word that also belongs to the current topic is not an intruder:
        # picking it would either duplicate a displayed word or present a
        # genuine topic word as the "wrong" answer. Filter those out of every
        # other topic's word list before sampling.
        words_to_exclude = set(current_topic_words)
        candidate_lists = []
        for raw_top_words in other_topics_df["Top_Words"].tolist():
            candidates = [
                word for word in _parse_topic_words(raw_top_words)
                if word not in words_to_exclude
            ]
            if candidates:
                candidate_lists.append(candidates)

        if not candidate_lists:
            print(f"No valid intruder word found for Topic ID: {current_topic_id}. Skipping.")
            continue

        # Same two-stage draw as before: pick another topic, then one of its words.
        intruder_word = rng.choice(rng.choice(candidate_lists))

        all_words_for_set = top_words_from_current_topic + [intruder_word]
        rng.shuffle(all_words_for_set)

        all_intrusion_sets.append({
            'Topic_ID': current_topic_id,
            'Topic_Name': current_topic_name,
            'Words_with_Intruder_Shuffled': all_words_for_set,
            'Intruder': intruder_word
        })

    return pd.DataFrame(all_intrusion_sets, columns=result_columns)

In [28]:
# Document intrusion: build per-topic document sets that contain one intruder
def generate_document_intrusion(document_probabilities_df, topic_statistics_df, num_relevant_documents=4,
                                intruder_pool_size=20, seed=None):
    """Build one document-intrusion set per topic probability column.

    For every ``Topic_<id>`` column of ``document_probabilities_df`` the
    ``num_relevant_documents`` highest-scoring documents are combined with one
    "intruder" sampled from the lowest-scoring remaining documents, and the
    combined list is shuffled.

    Args:
        document_probabilities_df: DataFrame with a ``Document`` column and
            one ``Topic_<id>`` column per topic holding that topic's score
            for each document.
        topic_statistics_df: DataFrame with ``Topic_ID`` and ``Topic_Name``
            columns, used to look up the topic name.
        num_relevant_documents: Number of top-scoring documents per set.
        intruder_pool_size: Number of lowest-scoring documents the intruder
            is sampled from (default 20, the value previously hard-coded).
        seed: Optional seed. When given, shuffling and intruder sampling are
            reproducible; when ``None`` the global ``random`` state and
            pandas' default sampling are used, as before.

    Returns:
        DataFrame with columns ``Topic_ID``, ``Topic_Name``,
        ``Documents_with_Intruder_Shuffled`` (list of documents) and
        ``Intruder``. Topics with no document left to act as intruder are
        skipped with a printed message. ``Topic_Name`` is ``None`` when the
        topic ID is absent from ``topic_statistics_df``.
    """
    rng = random if seed is None else random.Random(seed)
    result_columns = ['Topic_ID', 'Topic_Name', 'Documents_with_Intruder_Shuffled', 'Intruder']
    all_intrusion_sets = []

    topic_probability_columns = [
        col for col in document_probabilities_df.columns
        if isinstance(col, str) and col.startswith("Topic_")
    ]

    for topic_column_name in topic_probability_columns:
        try:
            current_topic_id = int(topic_column_name.split('_')[1])
        except ValueError:
            # A column such as "Topic_Name" matches the prefix but carries no
            # numeric topic ID, so it is not a probability column.
            continue

        matching_names = topic_statistics_df.loc[
            topic_statistics_df["Topic_ID"] == current_topic_id, "Topic_Name"].values
        current_topic_name = matching_names[0] if len(matching_names) else None

        most_relevant_documents = document_probabilities_df.nlargest(
            num_relevant_documents, topic_column_name
        )[["Document", topic_column_name]]
        list_of_relevant_documents = most_relevant_documents["Document"].tolist()

        # The intruder must not be one of the relevant documents. On small
        # inputs the lowest-scoring pool overlaps the top-scoring rows, and
        # duplicated document text can appear under a different index, so
        # exclude both by row label and by document value before pooling.
        already_used = (
            document_probabilities_df["Document"].isin(list_of_relevant_documents)
            | document_probabilities_df.index.isin(most_relevant_documents.index)
        ).to_numpy()
        remaining_documents = document_probabilities_df[~already_used]

        intruder_candidates_pool = remaining_documents.nsmallest(intruder_pool_size, topic_column_name)
        if intruder_candidates_pool.empty:
            print(f"No intruder candidates left for Topic ID: {current_topic_id}. Skipping.")
            continue

        # Derive a fresh per-topic sampling seed so a fixed ``seed`` does not
        # pick the same pool position for every topic.
        sample_state = None if seed is None else rng.randrange(2 ** 32)
        chosen_intruder_document = intruder_candidates_pool.sample(
            1, random_state=sample_state)["Document"].values[0]

        all_documents_for_set = list_of_relevant_documents + [chosen_intruder_document]
        rng.shuffle(all_documents_for_set)

        all_intrusion_sets.append({
            'Topic_ID': current_topic_id,
            'Topic_Name': current_topic_name,
            'Documents_with_Intruder_Shuffled': all_documents_for_set,
            'Intruder': chosen_intruder_document
        })

    return pd.DataFrame(all_intrusion_sets, columns=result_columns)

In [29]:
# Load the two CSV inputs consumed by the intrusion-set generators below:
# the per-topic statistics table and the per-document topic probabilities.
topics_df = pd.read_csv(filepath_or_buffer="csv/aita_topics_stats.csv")
topic_probs_df = pd.read_csv(filepath_or_buffer="csv/aita_full_post_topic_probabilities.csv")

In [30]:
# Generate the word-intrusion and document-intrusion tables from the loaded data.
word_intrusion_df = generate_word_intrusion(df=topics_df)
doc_intrusion_df = generate_document_intrusion(
    document_probabilities_df=topic_probs_df,
    topic_statistics_df=topics_df,
)