In [1]:
# python -m spacy download en_core_web_sm
import numpy as np
import pandas as pd
import spacy
import pickle
import torch
from sentence_transformers import SentenceTransformer, util

nlp = spacy.load('en_core_web_sm')
# from nltk import sent_tokenize
nlp.add_pipe('sentencizer')
nlp.disable_pipe("tagger")
nlp.disable_pipe("ner")
nlp.disable_pipe("attribute_ruler")
nlp.disable_pipe("lemmatizer")

model = SentenceTransformer('all-MiniLM-L6-v2')


def save_to_pkl(path, torch_embs):
    with open(path, 'wb') as pkl:
        pickle.dump(torch_embs, pkl)


def save_to_npy(path, npy_embs):
    np.save(path, npy_embs)


def load_from_pkl(path):
    with open(path, 'rb') as pkl:
        embs = pickle.load(pkl)
    return embs


def load_from_npy(path):
    return np.load(path)


# sents = sent_tokenize(df_train.iloc[0]['document']) # nltk misses a lot of sentences if space does not occur after punctuation
def sent_splitter(inp_df):
    claim_id = 1
    doc_id = 1
    df_sents = pd.DataFrame()
    for ind, row in inp_df.iterrows():
        print(ind)
        doc = nlp(row['document'])
        for ind_sent, sentence in enumerate(doc.sents):
            temp = row.copy()
            temp['claim_id'] = claim_id
            temp['doc_id'] = doc_id
            temp['document'] = sentence.text
            new_df = pd.DataFrame([temp])
            df_sents = pd.concat([df_sents, new_df], axis=0, ignore_index=True)
            doc_id += 1
        claim_id += 1
    df_sents.reset_index(drop=True, inplace=True)
    # del df_val_sents_100['Unnamed: 0']
    # len(df)
    # print(df_sents.head())
    return df_sents


def generate_embeddings(df_sents, embedding_type='tensor', save_emb_name="val"):
    claims_l = df_sents['claim'].to_list()
    print(len(claims_l))
    docs_l = df_sents['document'].to_list()
    print(len(docs_l))
    if embedding_type == 'tensor':
        claim_embeddings = model.encode(claims_l, convert_to_tensor=True)
        doc_embeddings = model.encode(docs_l, convert_to_tensor=True)
        save_to_pkl(save_emb_name + '_claim_text_sbert_embedding.pickle', claim_embeddings)
        save_to_pkl(save_emb_name + '_doc_evid_text_sbert_embedding.pickle', doc_embeddings)

    else:  # npy
        claim_embeddings = model.encode(claims_l)
        doc_embeddings = model.encode(docs_l)
        save_to_npy(save_emb_name + '_claim_text_sbert_embedding.npy', claim_embeddings)
        save_to_npy(save_emb_name + '_doc_evid_text_sbert_embedding.npy', doc_embeddings)

    return claim_embeddings, doc_embeddings


def load_embeddings(embedding_type='tensor', load_emb_name="val"):
    if embedding_type == 'tensor':
        claim_embeddings = load_from_pkl(load_emb_name + '_claim_text_sbert_embedding.pickle')
        doc_embeddings = load_from_pkl(load_emb_name + '_doc_evid_text_sbert_embedding.pickle')
    else:
        claim_embeddings = load_from_npy(load_emb_name + '_claim_text_sbert_embedding.npy')
        doc_embeddings = load_from_npy(load_emb_name + '_doc_evid_text_sbert_embedding.npy')
    return claim_embeddings, doc_embeddings


def calc_sim(claim, doc_embs, encode_claim=False,
             embedding_type='tensor'):  # Can pass a single claim embedding loaded from npy/pkl or single claim text
    if encode_claim == True:
        if embedding_type == 'tensor':
            claim_emb = model.encode(claim, convert_to_tensor=True)
        else:
            claim_emb = model.encode(claim)
    else:
        claim_emb = claim

    cosine_scores = util.cos_sim(claim_emb, doc_embs)
    # sorted_res = sorted(range(len(cosine_scores[0].tolist())), key=lambda k: cosine_scores[k], reverse=True)[:TOP_K]
    # sem_sim = util.semantic_search(claim_embeddings[45], doc_embeddings, top_k=TOP_K)
    top_results = torch.topk(cosine_scores[0], k=TOP_K)
    return top_results.indices.tolist()


# GENERATE_SENTS = False
# GENERATE_EMBS = False
# SENTS_FILE_PATH = "df_val_sents.csv"
# TOP_K = 25

# if __name__ == "__main__":

#     if GENERATE_SENTS:
#         # df_train = pd.read_csv("train.csv", delimiter='\t', encoding='utf-8')
#         # print(len(df_train))
#         df_val = pd.read_csv("val.csv", delimiter='\t', encoding='utf-8')
#         print(len(df_val))
#         df_sent = sent_splitter(df_val)
#         df_sent.to_csv(SENTS_FILE_PATH)
#     else:
#         df_sent = pd.read_csv(SENTS_FILE_PATH)

#     if GENERATE_EMBS:
#         claim_embeddings, doc_embeddings = generate_embeddings(df_sent, embedding_type='npy', save_emb_name="val")
#     else:
#         claim_embeddings, doc_embeddings = load_embeddings(embedding_type='tensor', load_emb_name="val")

#     sorted_ind = calc_sim(claim_embeddings[0], doc_embeddings,
#                           encode_claim=False)  # Can pass a single claim embedding loaded from npy/pkl or single claim text
#     ranked_evids_df = df_sent[sorted_ind]
#     ranked_evids_df.to_csv("ranked_evids.csv")


  from .autonotebook import tqdm as notebook_tqdm


ValueError: [E003] Not a valid pipeline component. Expected callable, but got 'sentencizer' (name: 'None').[E004] If you meant to add a built-in component, use `create_pipe`: `nlp.add_pipe(nlp.create_pipe('sentencizer'))`

In [4]:
print(spacy.__version__)

2.3.1
