In [36]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
import numpy as np
import spacy
nlp = spacy.load("en_core_web_md")  # Load the medium ("md") English pipeline

In [2]:
# Load the processed corpus (a JSON Lines file, read with lines=True) into `df`.
# NOTE: this is an absolute, machine-specific path; point it at your local copy of the data.
input_file_path = '/Users/thebekhruz/Desktop/100Days-Of-Code/100-Days-of-NLP-Odyssey/data/processed/data_with_contextual_relevance_and_doc_embddings.jsonl'

try:
    df = pd.read_json(input_file_path, lines=True)
except Exception as e:
    # Say which file failed, then re-raise: swallowing the error would leave `df`
    # undefined and make every later cell fail with an unrelated NameError.
    print(f'Could not load {input_file_path}: {e}')
    raise
else:
    print('Data Frame loaded as df')


Data Frame loaded as df


### $\text{Score} = \text{Semantic Similarity} + \lambda \times \text{Contextual Relevance}$


In [3]:

# Universal preprocessing step (search() applies it to the query before encoding)
def preprocess_text(text):
    """Lemmatise and lowercase `text`, discarding stop words and punctuation.

    The text is run through the global spaCy pipeline `nlp`. The lowercased
    lemma of every token that is neither a stop word nor punctuation is kept,
    and the kept lemmas are returned joined by single spaces.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return ' '.join(kept_lemmas)


In [69]:
# Load the large ("lg") English pipeline. This rebinds the global `nlp`, so
# preprocess_text() also uses this pipeline once this cell has run.
nlp = spacy.load("en_core_web_lg")
def extract_named_entities_spacy(text, allowed_labels=('DATE', 'LOC', 'PERSON', 'ORG', 'MISC')):
    """Return the labels of the named entities spaCy finds in `text`.

    Parameters
    ----------
    text : str
        Text to analyse with the global spaCy pipeline `nlp`.
    allowed_labels : iterable of str, optional
        Entity labels to keep. The default is the original hard-coded list,
        so existing callers see no change.

    Returns
    -------
    list of str
        One label per kept entity, in document order (duplicates preserved).

    Notes
    -----
    NOTE(review): spaCy's English pipelines are documented as using
    OntoNotes-style labels, where place names are usually tagged GPE or FAC
    and there is no MISC label. That may be why location queries return an
    empty list with the default allow-list; pass e.g.
    ``allowed_labels=('GPE', 'FAC', 'LOC')`` to keep them. Confirm against
    ``nlp.get_pipe('ner').labels``.
    """
    allowed = frozenset(allowed_labels)
    return [ent.label_ for ent in nlp(text).ents if ent.label_ in allowed]

# Example usage: print the entity labels spaCy finds in a location query
# Alternative query:
# query = "What is Queen Victoria Road in High Wycombe known for?"
query = "What historical events took place at Queen Victoria Road in High Wycombe?"

named_entities = extract_named_entities_spacy(query)
# NOTE(review): the recorded output is [] — presumably the place names receive labels
# outside the allow-list in extract_named_entities_spacy (e.g. GPE/FAC); confirm via doc.ents
print(named_entities)
# search(query, model, k=5, weight=100)



[]


In [82]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
# Checkpoint shared by the tokenizer and the token-classification model; keeping the id
# in one place guarantees both are always loaded from the same checkpoint.
NER_CHECKPOINT = "dbmdz/bert-large-cased-finetuned-conll03-english"

tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
bert_model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [83]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Lazily-built NER pipeline shared by every call; re-running this cell resets it,
# which is needed after reloading `bert_model` or `tokenizer`.
_ner_pipeline = None


def _get_ner_pipeline():
    """Return the shared NER pipeline, building it on first use."""
    global _ner_pipeline
    if _ner_pipeline is None:
        _ner_pipeline = pipeline("ner", model=bert_model, tokenizer=tokenizer, aggregation_strategy="simple")
    return _ner_pipeline


def extract_named_entities_bert(text):
    """Return the entity-group labels the BERT NER pipeline finds in `text`.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    list of str
        The 'entity_group' value of each aggregated entity, in the order the
        pipeline reports them (duplicates preserved, e.g. ['LOC', 'LOC']).
        Only the labels are returned, not the entity text.

    Notes
    -----
    The pipeline is built from the global `bert_model` and `tokenizer` once and
    then reused, instead of being rebuilt on every call.
    """
    # aggregation_strategy="simple" yields aggregated entities carrying
    # 'word' and 'entity_group' keys
    ner_results = _get_ner_pipeline()(text)
    return [entity['entity_group'] for entity in ner_results]

# Example usage: print the entity groups the BERT NER pipeline finds in the query
query = "What historical events took place at Queen Victoria Road in High Wycombe?"
# Despite its name, this holds only the entity-group labels (no spans)
named_entities_with_spans = extract_named_entities_bert(query)
print(named_entities_with_spans)


['LOC', 'LOC']


In [22]:
# # Given entities look for documents that have those entities and then extract total_tfidf_scores
# def get_tf_idf_score(doc_index, named_entities):
#     # Search in mentions if document has the given entities type
#     for i in range(len(df.iloc[doc_index]['mentions'])):
#         if df.iloc[doc_index]['mentions'][i]['ne_span'] in named_entities:
#             return df.iloc[doc_index]['mentions'][i]['total_tfidf_score']

# print(get_tf_idf_score(1, ['DATE', 'PERSON']))

None


In [25]:
# Get tfidf score by entity type
def get_tf_idf_score(doc_index, named_entities_types):
    """Sum the TF-IDF scores of one document's mentions that match the given entity types.

    Parameters
    ----------
    doc_index : int
        Positional row index into the global `df` (used with ``.iloc``).
    named_entities_types : container of str
        Entity-type labels to keep; a mention counts when its 'ne_type' is in it.

    Returns
    -------
    number
        Sum of 'total_tfidf_score' over the matching mentions; 0 when nothing
        matches or the document has no mentions.
    """
    # Select the column first so a single cell is read rather than the whole row
    mentions = df['mentions'].iloc[doc_index]

    # A row without mentions may hold None/NaN instead of a list
    if mentions is None or (isinstance(mentions, float) and mentions != mentions):
        return 0

    return sum(
        mention['total_tfidf_score']
        for mention in mentions
        if mention['ne_type'] in named_entities_types
    )

# Smoke test: summed TF-IDF score of document 1's ORG and PER mentions.
# NOTE(review): the recorded output is a printed line rather than this call's return value,
# so it appears to come from an earlier version of the function that printed its result.
get_tf_idf_score(1, ['ORG', 'PER'])
    

Total tfidf score for document 1: 4.3981177834


In [84]:
# Get tfidf score by entity type
def get_tfidf_score(doc_index, named_entities_types):
    """Sum the TF-IDF scores of one document's mentions that match the given entity types.

    Parameters
    ----------
    doc_index : int
        Positional row index into the global `df` (used with ``.iloc``).
    named_entities_types : container of str
        Entity-type labels to keep; a mention counts when its 'ne_type' is in it.

    Returns
    -------
    number
        Sum of 'total_tfidf_score' over the matching mentions; 0 when nothing
        matches or the document has no mentions.
    """
    # Select the column first so a single cell is read rather than the whole row
    mentions = df['mentions'].iloc[doc_index]

    # A row without mentions may hold None/NaN instead of a list
    if mentions is None or (isinstance(mentions, float) and mentions != mentions):
        return 0

    return sum(
        mention['total_tfidf_score']
        for mention in mentions
        if mention['ne_type'] in named_entities_types
    )

def search(query, model, k=5, weight=10):
    """Print the k documents most similar to `query`, re-ranked with entity-type TF-IDF.

    The k documents with the highest cosine similarity to the query embedding
    are selected first. Each is then scored as
    ``semantic similarity + weight * TF-IDF sum`` where the TF-IDF sum comes
    from get_tfidf_score() for the entity types found in the query, and the
    candidates are printed in descending order of that score.

    Parameters
    ----------
    query : str
        Raw user query. Entity types are extracted from the raw text; the text
        is then passed through preprocess_text() before encoding.
    model : object
        Encoder exposing ``encode(list_of_str)``; its output is compared with
        the vectors stored in ``df['text_embedding']``.
    k : int, optional
        Number of candidates to retrieve and print. A value <= 0 prints nothing.
    weight : number, optional
        Multiplier applied to the TF-IDF sum before it is added to the similarity.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # `argsort(...)[-0:]` would select every document, so handle k <= 0 explicitly
    if k <= 0:
        return

    # Extract named entity types from the raw query, before preprocessing alters it
    named_entity_types = extract_named_entities_bert(query)

    # Preprocess and encode the query
    query_embedding = model.encode([preprocess_text(query)])

    # Semantic similarity between the query and every document
    document_embeddings = np.array(df['text_embedding'].tolist())
    semantic_similarity = cosine_similarity(query_embedding, document_embeddings).flatten()

    # Candidates: the k highest similarities, best first
    top_k_indices = np.argsort(semantic_similarity)[::-1][:k]

    # Combine similarity with the weighted TF-IDF of matching entity types
    final_scores = [
        (index, semantic_similarity[index] + weight * get_tfidf_score(index, named_entity_types))
        for index in top_k_indices
    ]

    # Sort documents by their final score in descending order
    final_scores.sort(key=lambda scored: scored[1], reverse=True)

    # Print the re-ranked documents
    for rank, (index, final_score) in enumerate(final_scores, start=1):
        print(f"Document Rank {rank}:")
        print(df.iloc[index]['text'])
        print(f"Final score: {final_score}")
        print("---")



### Testing

In [85]:
# Sentence encoder passed to search() to embed the query.
# NOTE(review): the "-dot-" checkpoints are documented as tuned for dot-product scoring,
# whereas search() ranks with cosine_similarity — confirm the pairing, and that
# df['text_embedding'] was produced with this same model.
model = SentenceTransformer("msmarco-distilbert-dot-v5")


In [None]:
# Organisation-centred query (this cell has no execution count or recorded output)
query = "How many employees does Google have?"
search(query, model, k=5, weight=100)

In [86]:
# General query about a location.
# NOTE(review): the recorded output is headed "Document N:" while search() prints
# "Document Rank N:", so it appears to predate the current search() — re-run to refresh.
query = "What is Queen Victoria Road in High Wycombe known for?"
search(query, model, k=5, weight=100)

Document 1:
High Wycombe Police Station and Town Hall Queen Victoria Road, High Wycombe. late 1935
Police Station and Town Hall viewed side on and commemorative bridge over River Wye on western side
Final score: 440.007296244417
---
Document 2:
High Wycombe Police Station, Queen Victoria Road, High Wycombe about 1935
New Police Station High Wycombe
Final score: 433.2550713145417
---
Document 3:
High Wycombe Police Station, in Queen Victoria Road, High Wycombe. Oct 1935
New Police Station High Wycombe viewed from opposite side of the road
Final score: 418.3740381641876
---
Document 4:
Queen Victoria Rd under construction, High Wycombe, about 1901
Construction of Queen Victoria Rd north of R. Wye
Final score: 329.20085051290584
---
Document 5:
New Police Station in Queen Victoria Road, High Wycombe in 1937
New Police Station High Wycombe
Final score: 228.8259715284566
---


In [87]:
# Specific event or item query:
query = "Information about the time capsule found in High Wycombe."
search(query, model, k=3, weight=100)
# The model does not perform well for specific events when only semantic similarity,
# and not contextual relevance, is considered.

Document 1:
High Wycombe Police Station, Queen Victoria Road, High Wycombe about 1935
New Police Station High Wycombe
Final score: 433.1705213127112
---
Document 2:
View of S. side of Wycombe Abbey School, High Wycombe, date unknown
View of Wycombe Abbey School showing cloisters on S. side
Final score: 73.59741819301809
---
Document 3:
Postcard showing a view of the entrance hall Wycombe Abbey School, High Wycombe. date unknown
Interior of Wycombe Abbey School, entrance hall with American Plaque, Gothic style window and ceiling, table chairs and carpet, picture/hanging over fireplace
Final score: 28.803922330703692
---


In [88]:
# Cultural or historical site query: asks where a named place is located
query = "Where is the Reference Library located in High Wycombe?"
search(query, model, k=3, weight=100)



Document 1:
Reference Library door, Queen Victoria Rd, High Wycombe. about 1992
Corridor entrance to Reference Library
Final score: 269.64036306412777
---
Document 2:
Reference Library, Queen Victoria Rd, High Wycombe. 1993 to 1994
Reference Library
Final score: 179.83889007235686
---
Document 3:
Two Library staff in the Reference Library, Queen Victoria Rd, High Wycombe. about 1992
Reference Library
Final score: 152.23178482965233
---


In [89]:
# Broad historical or cultural query: names only the town, not a specific site
query = "Historical landmarks in High Wycombe."
search(query, model, k=3, weight=100)



Document 1:
High Wycombe Police Station, Queen Victoria Road, High Wycombe about 1935
New Police Station High Wycombe
Final score: 433.20799750655937
---
Document 2:
Looking N, a view of the front of Wycombe Museum (formerly Castle Hill House), with two benches against the wall. Priory Ave, High Wycombe. early 1990's
Front of Wycombe Museum, formerly Castle Hill House, with two benches against the wall The drive runs along the right.
Final score: 397.08873355325073
---
Document 3:
Looking SE, a view of the N front of Wycombe Abbey from the drive, Abbey Grounds, High Wycombe. Circa 1895
The North front of Wycombe Abbey, from the drive
Final score: 95.95599441603447
---


In [66]:
# Event-centred query about a specific road.
# NOTE(review): the recorded scores (~0.85) show no TF-IDF contribution despite weight=100,
# and this cell's execution count (66) is lower than that of the cell defining search() (84),
# so the output likely comes from an earlier version — re-run to confirm.
query = "Are there any annual cultural festivities taking place at Queen Victoria Road, High Wycombe?"
search(query, model, k=5, weight=100)


Document 1:
Many guests dining at an Official reception shortly after the opening of the new Town Hall, Queen Victoria Rd, High Wycombe. Nov 1904
Interior Town Hall
Final score: 0.8590923183311419
---
Document 2:
Opening ceremony of the Library, Queen Victoria Rd. High Wycombe, 25 June 1932
Outside new Library
Final score: 0.8535580246911669
---
Document 3:
High Wycombe Police Station and Town Hall Queen Victoria Road, High Wycombe. late 1935
Police Station and Town Hall viewed side on and commemorative bridge over River Wye on western side
Final score: 0.8445607071278922
---
Document 4:
High Wycombe Police Station, Queen Victoria Road, High Wycombe about 1935
New Police Station High Wycombe
Final score: 0.8426387084822465
---
Document 5:
Queen Victoria Rd under construction, High Wycombe, about 1901
Construction of Queen Victoria Rd north of R. Wye
Final score: 0.8400235361005131
---
