# In [None]:
import pandas as pd
import torch
from infersent_model import InferSent
import nltk
import numpy as np
# Fetch the NLTK 'punkt' data when this module is loaded (presumably needed by
# the tokenizer used inside InferSent -- TODO confirm).
nltk.download('punkt')

# Loading Infersent sentence encoder
# The encoder is created once at module level; the functions below use the
# resulting `infersent` object.
# Hyper-parameters handed to the InferSent constructor.
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}
infersent = InferSent(params_model)
# Restore the weights stored in encoder/infersent1.pkl.
infersent.load_state_dict(torch.load('encoder/infersent1.pkl'))
# Point the encoder at the GloVe vectors file, then build an initial
# vocabulary with K=100 (presumably the K most frequent words -- TODO
# confirm). measure_similarity_infersent() extends it later via update_vocab().
infersent.set_w2v_path('GloVe/glove.840B.300d.txt')
infersent.build_vocab_k_words(K=100)

def parse_claim_stance(path="data/IBM_Claim_Stance/claim_stance_dataset_v1.csv"):
    """Load the claim-stance CSV and keep only the columns used downstream.

    The number of rows read is printed to stdout.

    Args:
        path: Location of the comma-separated dataset (anything
            ``pandas.read_csv`` accepts). Defaults to the path that used to
            be hard-coded, so existing zero-argument calls are unaffected.

    Returns:
        A DataFrame with the columns ``split``,
        ``claims.claimCorrectedText``, ``claims.stance`` and ``topicText``.

    Raises:
        KeyError: If one of the expected columns is missing from the file.
    """
    df = pd.read_csv(path, header=0, sep=',')
    print(df.shape[0])

    wanted_columns = [
        'split',
        'claims.claimCorrectedText',
        'claims.stance',
        'topicText',
    ]
    # Hand back an explicit copy: callers add columns to the result, and
    # doing that on a bare column selection can trigger pandas'
    # SettingWithCopyWarning.
    return df[wanted_columns].copy()


def measure_similarity_infersent():
    """Score every claim against every principled argument (CoPA).

    The claim-stance dataset is loaded, the InferSent vocabulary is extended
    with the words of the 'train' split claims and of all principled
    arguments, and every claim (all splits) is compared with every principled
    argument by cosine similarity of their InferSent embeddings.

    Returns:
        The claim DataFrame with an added ``similarity`` column. Each cell
        holds a list with one cosine score per principled argument, in the
        order the arguments appear in ``PA_list.pkl``.
    """
    dataframe = parse_claim_stance()

    # Gathering sentences for the vocabulary (the claims in the
    # 'train split' and all the principled arguments).
    is_train = dataframe['split'] == 'train'
    train_claims = dataframe.loc[is_train, 'claims.claimCorrectedText']

    # NOTE(security): unpickling executes arbitrary code; only load this
    # file from a trusted source.
    copas = list(pd.read_pickle(r'data/principle_argument_CoPA/PA_list.pkl'))

    infersent.update_vocab(train_claims.values.tolist() + copas)

    # encode() takes a *list* of sentences and returns one embedding per
    # sentence. Passing a bare string makes it iterate over the characters,
    # so indexing the result with [0] would yield the embedding of the first
    # character only. Encode whole lists instead (this also lets the encoder
    # batch the work); empty lists are skipped because there is nothing to
    # encode.
    copa_embeddings = infersent.encode(copas) if copas else []
    claim_texts = dataframe['claims.claimCorrectedText'].tolist()
    claim_embeddings = infersent.encode(claim_texts) if claim_texts else []

    similarity_scores = [
        [cosine(claim_vec, copa_vec) for copa_vec in copa_embeddings]
        for claim_vec in claim_embeddings
    ]

    # Build the Series on the frame's own index so that every score list
    # lands on its row even if the index is not a plain 0..n-1 range.
    dataframe['similarity'] = pd.Series(similarity_scores, index=dataframe.index)
    return dataframe


def cosine(u, v):
    """Return the cosine similarity between two vectors.

    Args:
        u: First vector (array-like).
        v: Second vector (array-like), same length as ``u``.

    Returns:
        ``dot(u, v) / (norm(u) * norm(v))``. When either vector has zero
        norm the ratio is undefined; ``0.0`` is returned in that case
        instead of ``nan``.
    """
    denominator = np.linalg.norm(u) * np.linalg.norm(v)
    if denominator == 0:
        # Avoid a 0/0 division (nan plus a RuntimeWarning) for zero vectors.
        return 0.0
    return np.dot(u, v) / denominator


# Script entry point: compute the similarity scores when the file is run
# directly. The returned DataFrame is not used here.
if __name__ == '__main__':
    measure_similarity_infersent()