In [2]:
# imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Load Data

In [3]:
# All competition files live in the same Kaggle input directory; keeping the
# directory in one constant means it only has to be changed in one place.
DATA_DIR = '/kaggle/input/ioai-contest-3'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
extra = pd.read_csv(f'{DATA_DIR}/extra_df.csv')
sab = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')  # sample submission file

In [5]:
# Remove fully duplicated rows so the same row cannot be retrieved twice, then
# rebuild the index so row labels and row positions agree again (the retrieval
# helpers below address rows by position via .iloc).
extra = extra.drop_duplicates().reset_index(drop=True)

In [5]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_context(model, df, extra, topn=1):
    """Attach the `topn` most similar rows of `extra` to every question in `df`.

    Each question is encoded with a "query: " prefix and each fact with a
    "passage: " prefix. Rows of `extra` are ranked per question by cosine
    similarity and the best `topn` are kept, best first.

    Parameters
    ----------
    model : object
        Encoder exposing ``encode(texts, convert_to_numpy=..., normalize_embeddings=...)``
        and returning one embedding row per input text.
    df : pandas.DataFrame
        Must contain a 'questions' column of strings. It is modified in place.
    extra : pandas.DataFrame
        Must contain the columns 'fact', 'source', 'category', 'url',
        'author' and 'title'. Rows are addressed by position, not by label.
    topn : int, default 1
        Number of nearest facts to keep per question; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with these columns added: 'fact_list',
        'source_list', 'category_list', 'url_list', 'author_list',
        'title_list' (each a list of length <= `topn` per row) and
        'similarity' (the similarity of the single best match).

    Raises
    ------
    ValueError
        If `topn` is smaller than 1.
    """
    # Fail early and clearly: with topn == 0 there is no best match to report,
    # and a negative value would silently slice "all but the last |topn|" rows.
    if topn < 1:
        raise ValueError(f"topn must be >= 1, got {topn}")

    questions = ["query: " + q for q in df['questions'].tolist()]
    passages = ["passage: " + t for t in extra['fact'].tolist()]

    question_embeds = model.encode(questions, convert_to_numpy=True, normalize_embeddings=True)
    passage_embeds = model.encode(passages, convert_to_numpy=True, normalize_embeddings=True)

    sim_matrix = cosine_similarity(question_embeds, passage_embeds)

    # Negate so that argsort's ascending order yields the most similar first.
    topn_indices = np.argsort(-sim_matrix, axis=1)[:, :topn]

    # Source column in `extra` -> output column added to `df` (order preserved).
    field_to_column = {
        'fact': 'fact_list',
        'source': 'source_list',
        'category': 'category_list',
        'url': 'url_list',
        'author': 'author_list',
        'title': 'title_list',
    }
    collected = {column: [] for column in field_to_column.values()}
    similarities = []

    for i in range(len(df)):
        indices = topn_indices[i]
        # A single positional lookup per question, shared by every field.
        nearest = extra.iloc[indices]
        for field, column in field_to_column.items():
            collected[column].append(nearest[field].tolist())
        # indices[0] is the best-ranked passage for this question.
        similarities.append(sim_matrix[i, indices[0]])

    # Add the new columns to the dataframe
    for column, values in collected.items():
        df[column] = values
    df['similarity'] = similarities

    return df


def get_questions(model, train, test, topn=1):
    """Attach the `topn` most similar training questions (and answers) to each test question.

    Test questions are encoded with a "query: " prefix and training questions
    with a "passage: " prefix, then ranked by cosine similarity. Training
    questions whose text is identical to the test question are skipped, so a
    question never retrieves itself (or an exact duplicate) together with its
    answer. This also holds when `train` and `test` are the same frame or when
    `train` contains repeated questions.

    Parameters
    ----------
    model : object
        Encoder exposing ``encode(texts, convert_to_numpy=..., normalize_embeddings=...)``.
    train : pandas.DataFrame
        Must contain 'questions' and 'answer' columns.
    test : pandas.DataFrame
        Must contain a 'questions' column. It is modified in place.
    topn : int, default 1
        Maximum number of neighbours to keep per test question.

    Returns
    -------
    pandas.DataFrame
        The same `test` object with two list-valued columns added:
        'near_questions' and 'near_answers' (aligned, most similar first).
        A row holds fewer than `topn` entries only when `train` does not
        contain enough non-identical questions.
    """
    # Plain Python lists make the per-row lookups below cheap and positional.
    train_questions = train['questions'].tolist()
    train_answers = train['answer'].tolist()
    test_questions = test['questions'].tolist()

    # Prepare queries and passages
    questions = ["query: " + q for q in test_questions]
    passages = ["passage: " + t for t in train_questions]

    question_embeds = model.encode(questions, convert_to_numpy=True, normalize_embeddings=True)
    passage_embeds = model.encode(passages, convert_to_numpy=True, normalize_embeddings=True)

    sim_matrix = cosine_similarity(question_embeds, passage_embeds)

    # Full ranking (most similar first); we may have to skip past several
    # identical-text candidates before `topn` usable neighbours are found.
    ranked_indices = np.argsort(-sim_matrix, axis=1)

    questions_contexts = []
    targets_contexts = []

    for i, query_text in enumerate(test_questions):
        picked = []
        for j in ranked_indices[i]:
            if len(picked) >= topn:
                break
            # Exclude exact text matches wherever they appear in the ranking,
            # not only when they happen to be ranked first.
            if train_questions[j] != query_text:
                picked.append(j)

        questions_contexts.append([train_questions[j] for j in picked])
        targets_contexts.append([train_answers[j] for j in picked])

    test['near_questions'] = questions_contexts
    test['near_answers'] = targets_contexts

    return test


def get_chunks(model, df, extra, topn=1):
    """Attach the `topn` most similar text chunks from `extra` to each question in `df`.

    Questions get a "query: " prefix and chunks a "passage: " prefix before
    encoding; chunks are ranked per question by cosine similarity. The result
    is stored as a list per row in a new 'chunks' column of `df`, which is
    modified in place and returned.
    """
    query_texts = ["query: " + text for text in df['questions'].tolist()]
    chunk_texts = extra['chunks'].tolist()
    passage_texts = ["passage: " + text for text in chunk_texts]

    query_vectors = model.encode(query_texts, convert_to_numpy=True, normalize_embeddings=True)
    passage_vectors = model.encode(passage_texts, convert_to_numpy=True, normalize_embeddings=True)

    scores = cosine_similarity(query_vectors, passage_vectors)

    # Sorting the negated scores ascending puts the best matches first.
    best_positions = np.argsort(-scores, axis=1)[:, :topn]

    df['chunks'] = [
        [chunk_texts[pos] for pos in best_positions[row]]
        for row in range(len(df))
    ]

    return df

In [6]:
# Sentence encoder used by the retrieval helpers, which prepend the
# "query: " / "passage: " prefixes to their inputs themselves.
model = SentenceTransformer('intfloat/e5-large-v2')

# get_context adds its columns to the frame it is given and returns that same
# frame, so train_with_extra / test_with_extra are `train` / `test` themselves.
train_with_extra = get_context(model, train, extra, topn=3)
test_with_extra = get_context(model, test, extra, topn=3)

# Alternative enrichments, currently disabled (`url_chunks` is not defined in
# this notebook):
# train_with_extra = get_questions(model, train_with_extra, train_with_extra, topn=3)
# test_with_extra = get_questions(model, train_with_extra, test_with_extra, topn=3)
# train_with_chunks = get_chunks(model, train, url_chunks, topn=3)
# test_with_chunks = get_chunks(model, test, url_chunks, topn=3)

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

Batches:   0%|          | 0/62 [00:00<?, ?it/s]

Batches:   0%|          | 0/1500 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Batches:   0%|          | 0/1500 [00:00<?, ?it/s]

In [8]:
# Persist the frames enriched by get_context. The *_with_chunks variables are
# only created by code that is commented out, so referencing them here would
# raise NameError on a fresh run.
train_with_extra.to_csv('train_with_extra.csv', index=False)
test_with_extra.to_csv('test_with_extra.csv', index=False)