In [None]:
# CODE FOR TRACK 1

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load the train and dev splits and stack them into a single dataframe;
# rows that have no model response are discarded
df1, df2 = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('train_responses.csv', 'dev_responses.csv')
)
df = pd.concat([df1, df2], ignore_index=True).dropna(subset=['model_response'])

# All stored user prompts, in dataframe order
questions = df['user_prompt'].tolist()

# We define a function `response` that computes the similarity between a user prompt and the stored prompts,
# retrieves the most similar stored prompt and returns it
def response(user_prompt):
    """Return the stored prompt that is most similar to ``user_prompt``.

    The stored prompts (module-level ``questions``) and the query are
    vectorised together with TF-IDF (English stop words removed), so the IDF
    weights are fitted on the stored prompts plus the query. The query vector
    is then compared by cosine similarity against the stored prompts only.

    Parameters
    ----------
    user_prompt : str
        The prompt to look up.

    Returns
    -------
    str
        The text of the most similar stored prompt, or ``''`` when there are
        no stored prompts.
    """
    if not questions:
        return ''

    # Work on a local copy: the shared ``questions`` list is never mutated, so
    # it cannot be reordered (``list.remove`` deletes the *first* equal item)
    # or left with a stray query if vectorisation raises.
    corpus = questions + [user_prompt]

    tfidf = TfidfVectorizer(stop_words='english').fit_transform(corpus)

    # The last row is the query. Comparing it against ``tfidf[:-1]`` excludes
    # the query itself explicitly instead of assuming it always ranks first.
    similarities = cosine_similarity(tfidf[-1], tfidf[:-1])
    best_idx = similarities.argmax()

    return questions[best_idx]

# We create a dataframe with the user prompts from the test dataset
df_inputs = pd.read_csv('test_prompts.csv', sep=',')
inputs = df_inputs.user_prompt.tolist()

# For each test prompt we retrieve the most similar stored prompt with the 'response' function and
# store, keyed by the test conversation id, the conversation id of the retrieved row.
# Ids and prompts are iterated together so that repeated test prompts keep their own conversation id
# (looking the id up by prompt text would always return the first match).
d = {}
for q_id, prompt in zip(df_inputs['conversation_id'], inputs):
    question = response(prompt)
    filtered_df = df[df['user_prompt'] == question]
    if not filtered_df.empty:
        # Take the id straight from the matched row; searching df again by response text
        # could return a different conversation that happens to share the same response.
        d[str(q_id)] = str(filtered_df['conversation_id'].iloc[0])
    else:
        d[str(q_id)] = "I have not found the answer!"

# We create the dataset with the conversation ids of the test prompts and the retrieved ones
df_sol = pd.DataFrame(list(d.items()), columns=['conversation_id', 'response_id'])
df_sol.to_csv('track_1_test.csv', index=False)

In [None]:
# CODE FOR TRACK 2

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from multiprocessing import cpu_count
import pandas as pd

# We create a unique dataframe with both the data from the train and the dev datasets.
# Rows without a model response are dropped and the index is then renumbered 0..n-1:
# the row index is used further down as the integer Doc2Vec document tag, and gensim
# allocates a vector for every integer up to the largest tag, so gaps left by dropna
# would produce untrained tags that do not correspond to any row of df.
df1 = pd.read_csv('train_responses.csv', sep=',')
df2 = pd.read_csv('dev_responses.csv', sep=',')
df = (
    pd.concat([df1, df2], ignore_index=True)
    .dropna(subset=['model_response'])
    .reset_index(drop=True)
)
questions = df.user_prompt.tolist()

# Training corpus for Doc2Vec: one TaggedDocument per stored prompt, lower-cased,
# split on whitespace and tagged with the row's index label in df
corpus = [
    TaggedDocument(words=prompt_text.lower().split(), tags=[row_label])
    for row_label, prompt_text in zip(df.index, df['user_prompt'])
]

# dm=0 with dbow_words=1: per the gensim docs this selects PV-DBOW and also
# trains word vectors alongside the document vectors
d2v_model = Doc2Vec(
    vector_size=100,
    workers=cpu_count(),
    epochs=100,
    dm=0,
    dbow_words=1,
)

# Build the vocabulary from the corpus, then train on it for the configured number of epochs
d2v_model.build_vocab(corpus)
d2v_model.train(
    corpus,
    total_examples=d2v_model.corpus_count,
    epochs=d2v_model.epochs,
)

# We create a dataframe with the user prompts from the test dataset
df_inputs = pd.read_csv('test_prompts.csv', sep=',')
inputs = df_inputs.user_prompt.tolist()

# For each test prompt we infer its Doc2Vec vector, retrieve the most similar stored prompt among the
# train and dev prompts, and store, keyed by the test conversation id, the conversation id of the
# retrieved row.
# Ids and prompts are iterated together so that repeated test prompts keep their own conversation id
# (looking the id up by prompt text would always return the first match).
d = {}
for q_id, prompt in zip(df_inputs['conversation_id'], inputs):
    prompt_vector = d2v_model.infer_vector(prompt.lower().split())
    sims = d2v_model.dv.most_similar([prompt_vector])

    # Document tags are df index labels. Take the best-ranked tag that is an actual row of df:
    # with integer tags the model can hold vectors for labels that have no row in df (e.g. rows
    # removed before training), and df.loc on such a label would raise KeyError.
    best_tag = next((tag for tag, _ in sims if tag in df.index), None)

    if best_tag is not None:
        # Read the id straight from the retrieved row instead of searching df again by prompt
        # and response text, which could land on a different conversation.
        d[str(q_id)] = str(df.loc[best_tag, 'conversation_id'])
    else:
        d[str(q_id)] = "I have not found the answer!"

# We create the dataset with the conversation ids of the test prompts and the retrieved ones
df_sol = pd.DataFrame(list(d.items()), columns=['conversation_id', 'response_id'])
df_sol.to_csv('track_2_test.csv', index=False)