In [1]:
import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [2]:
# Read the question/article dataset from CSV; the path is named so it is easy to spot and change.
QNS_ANSWERS_CSV = 'data/updated_qns_answers.csv'
qns_answers = pd.read_csv(QNS_ANSWERS_CSV)

In [3]:
# Work on a deep copy so that columns added below never touch the raw `qns_answers` frame.
semantic_matching_dataset = qns_answers.copy(deep=True)

# Preview the first five rows.
semantic_matching_dataset.head(n=5)

Unnamed: 0,question,points,article,nlp_analysis,readability_score,preprocessed_question,id,url,article_text,preprocessed_text,cluster
0,did the people of gibraltar vote to remain pa...,58,Gibraltar,"([('the united kingdom', 'GPE'), ('2002', 'DAT...",62.68,people gibraltar vote remain part united kingd...,15222,https://simple.wikipedia.org/wiki/Gibraltar,Gibraltar is an Overseas Territory of the Unit...,gibraltar overseas territory united kingdom me...,192.0
1,which country uses the franc as its official ...,55,Currency,"([], [(' ', 'dep', 'which'), ('which', 'det', ...",53.88,country us franc official currency,2140,https://simple.wikipedia.org/wiki/Currency,Currency is the unit of money used by the peop...,currency unit money used people country union ...,127.0
2,which of these old communist parties no longe...,52,List of communist parties,"([('communist', 'NORP'), ('today', 'DATE')], [...",69.79,old communist party longer exists today,4402,https://simple.wikipedia.org/wiki/List%20of%20...,There are a number of communist parties around...,number communist party around world world hist...,121.0
3,a patient has a terminal illness and wants to ...,65,Medical ethics,"([], [('a', 'det', 'patient'), ('patient', 'ns...",66.23,patient terminal illness want end life family ...,13938,https://simple.wikipedia.org/wiki/Medical%20et...,Medical ethics is the set of ethical rules tha...,medical ethic set ethical rule doctor follow i...,117.0
4,"according to plato, what are the three types o...",55,The Republic,"([('three', 'CARDINAL')], [('according', 'prep...",71.14,according plato three type people society made,13148,https://simple.wikipedia.org/wiki/The%20Republic,The Republic is a book by Plato. It was finish...,republic book plato finished bc asks question ...,117.0


In [4]:
# Load the spaCy 'en_core_web_md' pipeline with the parser, tagger and NER components
# disabled. The visible cells only read `doc.vector` from the documents it produces.
nlp = spacy.load('en_core_web_md', disable=['parser', 'tagger', 'ner'])

def get_spacy_vectors(texts, batch_size=100):
    """Return the spaCy document vector of every text, stacked into one NumPy array.

    Args:
        texts: Iterable of texts streamed through the module-level ``nlp`` pipeline.
        batch_size: Batch size forwarded to ``nlp.pipe``.

    Returns:
        ``np.ndarray`` built from each document's ``doc.vector``, in input order.
        An empty input yields an empty 1-D array.
    """
    return np.array([doc.vector for doc in nlp.pipe(texts, batch_size=batch_size)])

# Ensure all text data is string and replace np.nan values with empty strings
semantic_matching_dataset['preprocessed_question'] = semantic_matching_dataset['preprocessed_question'].fillna('').astype(str)
semantic_matching_dataset['preprocessed_text'] = semantic_matching_dataset['preprocessed_text'].fillna('').astype(str)

# Make the cell re-runnable: a `cluster_document` column left by a previous run would make
# the merge below emit `cluster_document_x` / `cluster_document_y` and break the later lookup.
semantic_matching_dataset = semantic_matching_dataset.drop(columns=['cluster_document'], errors='ignore')

# Aggregate the preprocessed text of articles by their cluster to create a single document per cluster
cluster_documents = (
    semantic_matching_dataset
    .groupby('cluster')['preprocessed_text']
    .apply(lambda texts: ' '.join(texts))
    .reset_index(name='cluster_document')
)

# Merge the aggregated cluster document back into the main DataFrame
semantic_matching_dataset = pd.merge(semantic_matching_dataset, cluster_documents, on='cluster', how='left')

# groupby skips NaN keys by default, so any row whose `cluster` is NaN comes back from the
# left merge without a document. Replace those with '' so only strings reach spaCy.
semantic_matching_dataset['cluster_document'] = semantic_matching_dataset['cluster_document'].fillna('')

# Generate SpaCy vectors for questions
questions = semantic_matching_dataset['preprocessed_question'].tolist()
question_vectors_spacy = get_spacy_vectors(questions)

# Generate SpaCy vectors for the corresponding cluster documents of each question.
# Every question in a cluster shares the same aggregated document, so embed each distinct
# document once and fan the result back out to the rows; `doc_codes[i]` is the position of
# row i's document in `unique_cluster_docs`, which keeps the vectors aligned with the questions.
doc_codes, unique_cluster_docs = pd.factorize(semantic_matching_dataset['cluster_document'])
cluster_document_vectors_spacy = get_spacy_vectors(unique_cluster_docs.tolist())[doc_codes]



In [None]:

# Cosine similarity between each question vector and the vector of its own cluster document.
# Only row i of the questions is ever compared with row i of the cluster documents, so the
# score is computed row-wise rather than by building the full N x N similarity matrix and
# reading its diagonal (which needs quadratic time and memory in the number of questions).
pair_dots = np.einsum('ij,ij->i', question_vectors_spacy, cluster_document_vectors_spacy)
pair_norms = (
    np.linalg.norm(question_vectors_spacy, axis=1)
    * np.linalg.norm(cluster_document_vectors_spacy, axis=1)
)
# A pair containing an all-zero vector gets similarity 0, which is what sklearn's
# cosine_similarity returns for zero vectors as well.
cosine_sim_scores = np.divide(pair_dots, pair_norms, out=np.zeros_like(pair_dots), where=pair_norms > 0)
semantic_matching_dataset['cosine_sim_with_cluster_spacy'] = cosine_sim_scores

# The later summary, plot and filter cells read a column named 'cosine_sim_spacy', which no
# visible cell creates; expose the same scores under that name so those cells can run.
# NOTE(review): this assumes those cells are meant to use the question-vs-cluster score — confirm.
semantic_matching_dataset['cosine_sim_spacy'] = cosine_sim_scores

# Now, semantic_matching_dataset contains a 'cosine_sim_with_cluster_spacy' column with the cosine similarity of each question to its associated cluster's aggregated content


In [None]:
# Preview the first five rows, now including the spaCy similarity column.
semantic_matching_dataset.head(n=5)

We can explore further by finding which cluster has the highest cosine similarity for each question, in order to refine the clustering model.

In [None]:
# Load the 'all-mpnet-base-v2' sentence-transformers model used for the SBERT embeddings below.
# NOTE(review): `SentenceTransformer` is not imported in any visible cell — confirm that
# `from sentence_transformers import SentenceTransformer` exists elsewhere, otherwise a
# fresh-kernel run raises NameError here.
sbert_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

def encode_texts_in_batches(texts, model, batch_size=128):
    """Encode ``texts`` with ``model`` in consecutive slices and return one NumPy array.

    Args:
        texts: Sliceable sequence of texts (supports ``len`` and slicing).
        model: Object whose ``encode(batch, convert_to_tensor=True)`` result offers
            ``.cpu().numpy()``.
        batch_size: Number of texts handed to ``model.encode`` per call.

    Returns:
        ``np.ndarray`` with one row per input text, in input order. An empty input
        yields an empty 1-D array.
    """
    chunks = (texts[start:start + batch_size] for start in range(0, len(texts), batch_size))
    # Each chunk is encoded as a tensor, moved to the CPU and converted to NumPy before
    # its rows are collected.
    embedded_rows = [
        row
        for chunk in chunks
        for row in model.encode(chunk, convert_to_tensor=True).cpu().numpy()
    ]
    return np.array(embedded_rows)

# Raw question and article texts for SBERT. The article body lives in the 'article_text'
# column of this frame (there is no 'text' column), and missing values are replaced with ''
# so that only strings are passed to the encoder.
sbert_question_texts = semantic_matching_dataset['question'].fillna('').astype(str).tolist()
sbert_article_texts = semantic_matching_dataset['article_text'].fillna('').astype(str).tolist()

question_vectors_sbert = encode_texts_in_batches(sbert_question_texts, sbert_model)
article_vectors_sbert = encode_texts_in_batches(sbert_article_texts, sbert_model)

# Row-wise cosine similarity between each question and its own article, computed in one
# vectorised pass instead of one sklearn call per pair. Pairs containing an all-zero
# vector get similarity 0, as sklearn's cosine_similarity would return.
sbert_dots = np.einsum('ij,ij->i', question_vectors_sbert, article_vectors_sbert)
sbert_norms = (
    np.linalg.norm(question_vectors_sbert, axis=1)
    * np.linalg.norm(article_vectors_sbert, axis=1)
)
cosine_sim_scores = np.divide(sbert_dots, sbert_norms, out=np.zeros_like(sbert_dots), where=sbert_norms > 0)
semantic_matching_dataset['cosine_sim_sbert'] = cosine_sim_scores


In [None]:
# The locally cached TF Hub modules may need to be deleted before the USE model loads, e.g.:
# !rm -rf /var/folders/j3/2xy_ffxd6jq618phh49pk1b00000gn/T/tfhub_modules/*
# (the path above is machine-specific; adjust it to the local TF Hub cache directory)

# Load Universal Sentence Encoder v4 from TF Hub.
# NOTE(review): `hub` is not imported in any visible cell — confirm that
# `import tensorflow_hub as hub` exists elsewhere, otherwise this raises NameError.
use_model = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')


def batch_get_use_vector(texts, batch_size=32):
    """Embed ``texts`` with the module-level ``use_model`` in consecutive slices.

    Args:
        texts: Sliceable sequence of texts (supports ``len`` and slicing).
        batch_size: Number of texts passed to ``use_model`` per call.

    Returns:
        ``np.ndarray`` with the per-batch outputs stacked vertically, one row per text
        in input order.

    Raises:
        ValueError: From ``np.vstack`` when ``texts`` is empty, since there is nothing to stack.
    """
    batch_starts = range(0, len(texts), batch_size)
    embedded_batches = [
        use_model(texts[start:start + batch_size]).numpy()
        for start in batch_starts
    ]
    return np.vstack(embedded_batches)

# Raw question and article texts for USE. The article body lives in the 'article_text'
# column of this frame (there is no 'text' column), and missing values are replaced with ''
# so that only strings are passed to the encoder.
questions = semantic_matching_dataset['question'].fillna('').astype(str).tolist()
articles = semantic_matching_dataset['article_text'].fillna('').astype(str).tolist()

# Embed once and keep the dense arrays for the similarity computation below.
question_vectors_use = batch_get_use_vector(questions)
article_vectors_use = batch_get_use_vector(articles)

# Store each embedding as a per-row list, as before.
semantic_matching_dataset['question_vector_use'] = list(map(list, question_vectors_use))
semantic_matching_dataset['article_vector_use'] = list(map(list, article_vectors_use))

# Row-wise cosine similarity between each question and its own article, computed directly
# on the arrays instead of one sklearn call per pair of list-valued cells. Pairs containing
# an all-zero vector get similarity 0, as sklearn's cosine_similarity would return.
use_dots = np.einsum('ij,ij->i', question_vectors_use, article_vectors_use)
use_norms = (
    np.linalg.norm(question_vectors_use, axis=1)
    * np.linalg.norm(article_vectors_use, axis=1)
)
cosine_sims = np.divide(use_dots, use_norms, out=np.zeros_like(use_dots), where=use_norms > 0)

semantic_matching_dataset['cosine_sim_use'] = cosine_sims

In [None]:
# Preview the first five rows with the SBERT and USE columns added.
semantic_matching_dataset.head(n=5)

In [None]:
# Mean cosine similarity per embedding model; all three are computed before anything is printed.
# NOTE(review): 'cosine_sim_spacy' must already exist as a column — the visible spaCy cell
# writes 'cosine_sim_with_cluster_spacy'; confirm which column is intended here.
mean_sim_spacy, mean_sim_sbert, mean_sim_use = (
    np.mean(semantic_matching_dataset[sim_column])
    for sim_column in ('cosine_sim_spacy', 'cosine_sim_sbert', 'cosine_sim_use')
)

print(f"Mean Cosine Similarity (SpaCy): {mean_sim_spacy}")
print(f"Mean Cosine Similarity (SBERT): {mean_sim_sbert}")
print(f"Mean Cosine Similarity (USE): {mean_sim_use}")

The spaCy embedding model clearly outperforms the SBERT and USE models on mean cosine similarity.

In [None]:
# Scatter the question points against each model's cosine similarity, one series per model.
# NOTE(review): `plt` is not imported in any visible cell — confirm that
# `import matplotlib.pyplot as plt` exists elsewhere.
for sim_column, model_label in (
    ('cosine_sim_spacy', 'SpaCy'),
    ('cosine_sim_sbert', 'SBERT'),
    ('cosine_sim_use', 'USE'),
):
    plt.scatter(
        semantic_matching_dataset['points'],
        semantic_matching_dataset[sim_column],
        label=model_label,
        alpha=0.5,
    )

plt.xlabel('Points')
plt.ylabel('Cosine Similarity')
plt.title('Correlation between Points and Cosine Similarity')
plt.legend()
plt.show()

It seems that the spaCy model captures a good portion of the lower-scored questions. Let's explore the question-article pairs that scored a near-perfect similarity (close to 1.00), as well as those at the opposite end.

In [None]:
# Rows whose spaCy cosine similarity is at least 0.9.
high_similarity_mask = semantic_matching_dataset['cosine_sim_spacy'].ge(0.9)
semantic_matching_dataset[high_similarity_mask]


Direct Question-Answer Relation: Each question directly relates to the topic of the article. The keywords from the questions are explicitly present in the text of the articles, leading to a higher similarity score.

Specificity: The questions are specific, and the articles contain detailed information that directly answers these questions. This specificity likely results in a higher concentration of relevant terms and less noise, which boosts similarity scores.

Relevance of Content: The content of the questions closely aligns with the main subject of the articles. For example, a question about the environment directly relates to an article titled "Environment," and a question about MRI accuracy directly relates to an article about "Magnetic resonance imaging."

High Readability: These question-article pairs also have high readability scores.

In [None]:
# Rows whose spaCy cosine similarity is zero or negative.
non_positive_similarity_mask = semantic_matching_dataset['cosine_sim_spacy'].le(0.0)
semantic_matching_dataset[non_positive_similarity_mask]

Mismatch of Context: The questions are about specific events or people, while the articles titled with years likely cover a wide range of events that happened in that year. The spaCy embeddings might not find a strong contextual match between the question's focus and the broad content of the article.

Lack of Specificity in Articles: Articles that are simply titled with a year might not have a strong thematic focus, leading to a diluted set of vectors that don't match well with the more focused vectors generated from the questions.

Semantic Ambiguity: Years by themselves do not carry specific semantic information. Without additional context, the model may struggle to link a question about a specific event or individual to an article that broadly covers everything related to that year.

Named Entity Recognition (NER) Challenges: If the model is heavily weighing named entities, such as specific names and dates, it may not correctly associate the relevance of the year to the specific event or individual in the question.

Temporal Relevance: Questions that ask about a particular date or event may not align well with the content of an article that summarizes an entire year. The temporal focus is too broad in the article to match the specificity of the question.

Given that we are not able to modify the training and test datasets, fine-tuning the spaCy model on this data would be quite challenging. We could instead look for alternative models that are specifically designed for chatbots (the most common use case for question answering).

In [None]:
# Ensure the TF-IDF transformation for articles is performed.
# `qns_answers` is the raw frame: the NaN -> '' cleanup earlier in the notebook was applied
# to the `semantic_matching_dataset` copy only, so clean the column here as well — the
# vectorizer rejects np.nan documents.
# NOTE(review): `tfidf_vectorizer` is not defined in any visible cell — confirm it is
# fitted before this cell runs.
tfidf_matrix_articles_individual = tfidf_vectorizer.transform(
    qns_answers['preprocessed_text'].fillna('').astype(str)
)
