# Task 1, Method 1: Retrieving Related Evidence by TF-IDF Vectorization

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

## Prepare Data

In [None]:
# Load the pre-processed claim splits and the full evidence corpus.
test_df = pd.read_csv("data_processed/test_df_t1.csv")
tran_df = pd.read_csv("data_processed/tran_df_t1.csv")
deva_df = pd.read_csv("data_processed/deva_df_t1.csv")
evdn_df = pd.read_csv("data_processed/evdn_full_df_t1.csv")

# Text and identifier columns of each frame, as plain Python lists.
test_text, test_id = list(test_df["claim"]), list(test_df["claim_index"])
tran_text, tran_id = list(tran_df["claim"]), list(tran_df["claim_index"])
deva_text, deva_id = list(deva_df["claim"]), list(deva_df["claim_index"])
evdn_text, evdn_id = list(evdn_df["evidence"]), list(evdn_df["evdn_index"])

# Cumulative end offsets for the layout: evidence, then test, then deva,
# then tran.  This is the order in which the texts are concatenated before
# vectorization, so each [start, end) pair below addresses one group's rows.
evdn_end = len(evdn_text)
test_end = evdn_end + len(test_text)
deva_end = test_end + len(deva_text)
tran_end = deva_end + len(tran_text)

evdn_range = [0, evdn_end]
test_range = [evdn_end, test_end]
deva_range = [test_end, deva_end]
tran_range = [deva_end, tran_end]

## Prepare TF-IDF Vectors

In [None]:
# Stack every text into one corpus.  The order (evidence, test, deva, tran)
# must match the offsets stored in evdn_range / test_range / deva_range /
# tran_range, because matrix rows are later addressed through those ranges.
sentences = [*evdn_text, *test_text, *deva_text, *tran_text]

# Count bigrams only (ngram_range=(2, 2)); the vocabulary is learned from the
# whole corpus, claims included.
count_vect = CountVectorizer(ngram_range=(2, 2))
bi_gram_counts = count_vect.fit_transform(sentences)

# Re-weight the raw bigram counts with TF-IDF; row i of the result is the
# vector for sentences[i].
tfidf_transformer = TfidfTransformer().fit(bi_gram_counts)
bi_gram_tfidf = tfidf_transformer.transform(bi_gram_counts)

## Evidence Retrieval in Development Dataset

In [None]:
# For every development claim, keep the six evidence rows whose bigram TF-IDF
# vectors are most cosine-similar to the claim's vector.
#
# evdn_pred_list[k] is a list of [evidence_row, similarity] pairs for the
# k-th development claim, ordered from most to least similar.
top_k = 6

# The evidence slice is the same for every claim, so take it once.
evdn_tfidf = bi_gram_tfidf[evdn_range[0]:evdn_range[1]]

evdn_pred_list = []
for i in range(deva_range[0], deva_range[1]):
    # Progress: position of the claim inside the development split.
    print(i - deva_range[0])

    # One vectorised call scores the claim against all evidence rows at once,
    # instead of one cosine_similarity call per (claim, evidence) pair.
    scores = cosine_similarity(bi_gram_tfidf[i], evdn_tfidf)[0]

    # A stable sort on the negated scores gives descending similarity while
    # keeping tied rows in ascending evidence order, which is the same tie
    # order as sorted(..., reverse=True) over the rows in index order.
    best = np.argsort(-scores, kind="stable")[:top_k]

    evdn_pred_list.append([[evdn_range[0] + int(j), scores[j]] for j in best])

# Flatten each claim's ranked [evidence_row, similarity] pairs into two
# strings: the comma-separated evidence row numbers and the space-joined
# evidence texts, both in ranking order.
evdn_pred_list_new = [
    ",".join(str(ranked[0]) for ranked in claim_hits)
    for claim_hits in evdn_pred_list
]
evdn_pred_text = [
    " ".join(str(evdn_text[ranked[0]]) for ranked in claim_hits)
    for claim_hits in evdn_pred_list
]

# One [ids, text] row per claim, ready to be turned into a DataFrame.
result_list = [
    [joined_ids, joined_text]
    for joined_ids, joined_text in zip(evdn_pred_list_new, evdn_pred_text)
]

In [None]:
import os

# Write the development-set retrieval results to CSV: one row per claim with
# the retrieved evidence ids and the concatenated evidence text.  The default
# DataFrame index is written as the first column.
output_path = "evdn_pred/deva_evdn_pred_tfidf.csv"

# to_csv does not create missing directories, so make sure the target exists
# rather than failing after the retrieval has already been computed.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Naming the columns in the constructor also works when result_list is empty;
# assigning .columns afterwards raises a length mismatch in that case.
result_dataframe = pd.DataFrame(result_list, columns=["evidence_id", "evidence_text"])
result_dataframe.to_csv(output_path)

## Evidence Retrieval in Testing Dataset

In [None]:
# For every test claim, keep the six evidence rows whose bigram TF-IDF
# vectors are most cosine-similar to the claim's vector.
#
# evdn_pred_list[k] is a list of [evidence_row, similarity] pairs for the
# k-th test claim, ordered from most to least similar.
top_k = 6

# The evidence slice is the same for every claim, so take it once.
evdn_tfidf = bi_gram_tfidf[evdn_range[0]:evdn_range[1]]

evdn_pred_list = []
for i in range(test_range[0], test_range[1]):
    # Progress: position of the claim inside the test split.
    print(i - test_range[0])

    # One vectorised call scores the claim against all evidence rows at once,
    # instead of one cosine_similarity call per (claim, evidence) pair.
    scores = cosine_similarity(bi_gram_tfidf[i], evdn_tfidf)[0]

    # A stable sort on the negated scores gives descending similarity while
    # keeping tied rows in ascending evidence order, which is the same tie
    # order as sorted(..., reverse=True) over the rows in index order.
    best = np.argsort(-scores, kind="stable")[:top_k]

    evdn_pred_list.append([[evdn_range[0] + int(j), scores[j]] for j in best])

# Flatten each claim's ranked [evidence_row, similarity] pairs into two
# strings: the comma-separated evidence row numbers and the space-joined
# evidence texts, both in ranking order.
evdn_pred_list_new = [
    ",".join(str(ranked[0]) for ranked in claim_hits)
    for claim_hits in evdn_pred_list
]
evdn_pred_text = [
    " ".join(str(evdn_text[ranked[0]]) for ranked in claim_hits)
    for claim_hits in evdn_pred_list
]

# One [ids, text] row per claim, ready to be turned into a DataFrame.
result_list = [
    [joined_ids, joined_text]
    for joined_ids, joined_text in zip(evdn_pred_list_new, evdn_pred_text)
]

In [None]:
import os

# Write the test-set retrieval results to CSV: one row per claim with the
# retrieved evidence ids and the concatenated evidence text.  The default
# DataFrame index is written as the first column.
output_path = "evdn_pred/test_evdn_pred_tfidf.csv"

# to_csv does not create missing directories, so make sure the target exists
# rather than failing after the retrieval has already been computed.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Naming the columns in the constructor also works when result_list is empty;
# assigning .columns afterwards raises a length mismatch in that case.
result_dataframe = pd.DataFrame(result_list, columns=["evidence_id", "evidence_text"])
result_dataframe.to_csv(output_path)