# Task 1, Method 2: Searching for Similar Claims by Doc2Vec Vectorization

In [None]:
import pandas as pd
import numpy as np
import nltk
import json
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

## Prepare Data

In [None]:
def _read_json(path):
    """Parse the UTF-8 encoded JSON file at *path* and return its content."""
    with open(path, 'r', encoding='utf8') as handle:
        return json.load(handle)


# Raw claim files parsed into Python objects: training claims, development
# claims and the unlabelled test claims.
tran_data = _read_json("data_raw/train-claims.json")
deva_data = _read_json("data_raw/dev-claims.json")
test_data = _read_json("data_raw/test-claims-unlabelled.json")

# Pre-processed tables: one per claim split, plus the evidence table.
tran_df = pd.read_csv("data_processed/tran_df_t1.csv")
deva_df = pd.read_csv("data_processed/deva_df_t1.csv")
test_df = pd.read_csv("data_processed/test_df_t1.csv")
evdn_df = pd.read_csv("data_processed/evdn_full_df_t1.csv")

# Texts and ids of every table as plain lists; within one table the two lists
# are index-aligned because they come from the same rows.
tran_text, tran_id = list(tran_df["claim"]), list(tran_df["claim_index"])
deva_text, deva_id = list(deva_df["claim"]), list(deva_df["claim_index"])
test_text, test_id = list(test_df["claim"]), list(test_df["claim_index"])
evdn_text, evdn_id = list(evdn_df["evidence"]), list(evdn_df["evdn_index"])

# Training claims followed by development claims (texts and ids in the same
# order), used as the candidate pool when predicting for the test set.
tran_deva_text = tran_text + deva_text
tran_deva_id = tran_id + deva_id

# Doc2Vec corpus: every evidence passage followed by the test, development
# and training claims, each lower-cased and split into word tokens.
sentences = evdn_text + test_text + deva_text + tran_text
tokenized_sent = [word_tokenize(sentence.lower()) for sentence in sentences]
# Every document is tagged with its position in `sentences`.
tagged_data = [
    TaggedDocument(tokens, [position])
    for position, tokens in enumerate(tokenized_sent)
]

## Train / Load Doc2Vec Model

In [None]:
# Training step, kept commented out for reference: it fits a Doc2Vec model on
# `tagged_data` and saves it as "doc2vec.model". Uncomment to (re)create the
# saved model that is loaded below.
#model = Doc2Vec(tagged_data, vector_size = 512, window = 2, min_count = 1, epochs = 100, workers = 1)
#model.save("doc2vec.model")

# Load the previously saved model from the working directory; `model` is the
# global used by cosine() to infer document vectors.
model = Doc2Vec.load('doc2vec.model')

In [None]:
def cosine(s1, s2):
    """Return the cosine similarity of the Doc2Vec embeddings of two texts.

    Both texts are lower-cased before tokenization so that inference sees the
    same token forms as the corpus prepared in this notebook, which is built
    with ``word_tokenize(s.lower())``. Without this, capitalised words would
    not match the lower-cased tokens of that corpus.

    Args:
        s1: First text (string).
        s2: Second text (string).

    Returns:
        The cosine similarity between the two inferred vectors, as a scalar.
    """
    # The global `model` infers one vector per text; reshape to (1, n) because
    # cosine_similarity expects 2-D inputs.
    vector_1 = model.infer_vector(word_tokenize(s1.lower())).reshape(1, -1)
    vector_2 = model.infer_vector(word_tokenize(s2.lower())).reshape(1, -1)
    # The result is a 1x1 matrix; return its single entry.
    return cosine_similarity(vector_1, vector_2)[0][0]

## Evidence Retrieval in Development Dataset

### Find the most similar claim in the training dataset by cosine similarity of doc2vec embeddings

In [None]:
# pred[i] will hold the claim_index of the training claim that scores the
# highest cosine similarity with the i-th development claim.
pred = []

for i, deva_claim in enumerate(deva_text):
    # Progress indicator: each iteration scores one development claim against
    # every training claim, and cosine() infers both vectors on every call.
    print(i)
    # Only training claims are candidates here, so pair the texts with
    # `tran_id` directly instead of indexing the combined train+dev id list.
    evdn_pred = [
        [tran_claim_id, cosine(deva_claim, tran_claim)]
        for tran_claim_id, tran_claim in zip(tran_id, tran_text)
    ]
    # max() returns the first best-scoring pair, which is the same element a
    # stable descending sort would put first, without sorting everything.
    most_match_claim = max(evdn_pred, key=lambda x: x[1])[0]
    pred.append(most_match_claim)

### Use the evidence of most similar claim

In [None]:
# Two index-aligned lists describing the training claims:
#   index_list[k]    - second dash-separated field of the k-th claim key, as int
#   evidence_list[k] - the "evidences" entry stored for that claim
index_list = [int(claim_key.split("-")[1]) for claim_key in tran_data]
evidence_list = [claim_info["evidences"] for claim_info in tran_data.values()]

In [None]:
# Integer part of every development claim key (second dash-separated field).
deva_data_index_list = [
    int(claim_key.split("-")[1]) for claim_key in deva_data.keys()
]

# For each development claim, copy the evidence ids of its most similar
# training claim, reduced to the integer after the dash.
evdn_pred_list = []
for i in range(len(deva_data)):
    # Position of the predicted claim inside the lookup lists.
    matched_position = index_list.index(pred[i])
    evdn_pred_list.append(
        [
            int(evidence_key.split("-")[1])
            for evidence_key in evidence_list[matched_position]
        ]
    )

In [None]:
# Per claim: the predicted evidence ids joined by "," and the matching
# evidence texts joined by " ".
# NOTE(review): texts are fetched by list position (evdn_text[id]), which
# presumably relies on evdn_df being ordered by evidence id - confirm.
evdn_pred_list_new = []
evdn_pred_text = []
for evidence_ids in evdn_pred_list:
    evdn_pred_list_new.append(
        ",".join(str(evidence_id) for evidence_id in evidence_ids)
    )
    evdn_pred_text.append(
        " ".join(str(evdn_text[evidence_id]) for evidence_id in evidence_ids)
    )

# One [ids, text] row per claim.
result_list = [
    [joined_ids, joined_text]
    for joined_ids, joined_text in zip(evdn_pred_list_new, evdn_pred_text)
]

In [None]:
# Store the development-set predictions as a two-column table.
result_dataframe = pd.DataFrame(result_list)
result_dataframe.columns = ["evidence_id", "evidence_text"]
# Write to a development-specific file: the test section of this notebook
# writes "evdn_pred/test_evdn_pred_doc2vec.csv", so reusing that path here
# would let the test run overwrite the development predictions. The "deva_"
# prefix follows the naming of the processed input tables.
result_dataframe.to_csv("evdn_pred/deva_evdn_pred_doc2vec.csv")

## Evidence Retrieval in Testing Dataset

### Find the most similar claim in the training and development datasets by cosine similarity of doc2vec embeddings

In [None]:
# pred[i] will hold the claim_index of the labelled claim (training or
# development) that scores the highest cosine similarity with the i-th test
# claim.
pred = []

for test_claim in test_text:
    # Score the test claim against every labelled claim, keeping each score
    # next to the id of the claim it belongs to.
    evdn_pred = [
        [candidate_id, cosine(test_claim, candidate_claim)]
        for candidate_id, candidate_claim in zip(tran_deva_id, tran_deva_text)
    ]
    # max() returns the first best-scoring pair, which is the same element a
    # stable descending sort would put first, without sorting everything.
    most_match_claim = max(evdn_pred, key=lambda x: x[1])[0]
    pred.append(most_match_claim)

### Use the evidence of most similar claim

In [None]:
# Index-aligned lookup lists over the training claims followed by the
# development claims:
#   index_list[k]    - second dash-separated field of the claim key, as int
#   evidence_list[k] - the "evidences" entry stored for that claim
index_list = []
evidence_list = []
for labelled_claims in (tran_data, deva_data):
    for claim_key, claim_info in labelled_claims.items():
        index_list.append(int(claim_key.split("-")[1]))
        evidence_list.append(claim_info["evidences"])

In [None]:
# Integer part of every test claim key (second dash-separated field).
test_data_index_list = [int(test_key.split("-")[1]) for test_key in test_data.keys()]

# For each test claim, take over the evidence ids of its most similar labelled
# claim, keeping only the integer after the dash.
evdn_pred_list = []
for i in range(len(test_data)):
    # Evidence keys stored for the claim predicted for the i-th test claim.
    source_evidences = evidence_list[index_list.index(pred[i])]
    evdn_pred_list.append([int(tag.split("-")[1]) for tag in source_evidences])

In [None]:
# Per claim: the predicted evidence ids as one comma-separated string and the
# corresponding evidence texts as one space-separated string.
# NOTE(review): texts are fetched by list position (evdn_text[id]), which
# presumably relies on evdn_df being ordered by evidence id - confirm.
evdn_pred_list_new = []
evdn_pred_text = []
for predicted_ids in evdn_pred_list:
    id_strings = [str(evidence_id) for evidence_id in predicted_ids]
    text_strings = [str(evdn_text[evidence_id]) for evidence_id in predicted_ids]
    evdn_pred_list_new.append(",".join(id_strings))
    evdn_pred_text.append(" ".join(text_strings))

# One [ids, text] row per claim, in claim order.
result_list = [
    [evdn_pred_list_new[row], evdn_pred_text[row]]
    for row in range(len(evdn_pred_text))
]

In [None]:
# Turn the [ids, text] rows into a two-column table, label the columns, and
# write the test-set predictions to disk.
result_dataframe = pd.DataFrame(result_list).set_axis(
    ["evidence_id", "evidence_text"], axis=1
)
result_dataframe.to_csv("evdn_pred/test_evdn_pred_doc2vec.csv")