## Generating Document Embeddings

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import spacy
nlp = spacy.load("en_core_web_sm")  # Load the small English model

In [2]:
input_file_path = '/Users/thebekhruz/Desktop/100Days-Of-Code/100-Days-of-NLP-Odyssey/data/processed/data_with_contextual_relevance.jsonl'

# Read the JSON Lines file (one JSON record per line) into `df`,
# the DataFrame that every later cell works on.
try:
    df = pd.read_json(input_file_path, lines=True)
    print('Data Frame loaded as df')
except Exception as e:
    # Show the message, then re-raise: swallowing the error here would leave
    # `df` undefined and make every later cell fail with an unrelated NameError.
    print(e)
    raise

Data Frame loaded as df


In [3]:
# Check the data for missing values:
# isna() flags every cell, and value_counts() tallies the distinct row patterns.
# The data is in the desired format if the only pattern listed is False for all columns.
df.isna().value_counts()

IAID   text   mentions
False  False  False       1983
Name: count, dtype: int64

In [4]:
def preprocess_text(text):
    """Normalise a text: lemmatise, lowercase, and drop stop words and punctuation.

    The text is run through the module-level spaCy pipeline ``nlp``. A token is
    kept only if it is not a stop word, not punctuation and not pure
    whitespace. spaCy can emit separate whitespace tokens (for example for
    newlines or runs of spaces); they are skipped so that they do not leak
    into the joined result.

    Args:
        text: The raw string to clean.

    Returns:
        The lowercased lemmas of the kept tokens, joined by single spaces.
        An empty string is returned when no token is kept.
    """
    doc = nlp(text)
    tokens = [
        token.lemma_.lower()
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return ' '.join(tokens)

# Clean every entry of the 'text' column and keep the result in a new
# 'preprocessed_text' column (the original 'text' column is left untouched).
df['preprocessed_text'] = df['text'].map(preprocess_text)


In [5]:
# Initialize the SentenceTransformer model used for all embeddings in this notebook.
# The checkpoint loaded here is "msmarco-distilbert-dot-v5" (not 'all-mpnet-base-v2').
# NOTE(review): the "-dot-" in the checkpoint name suggests it was tuned for
# dot-product scoring, while search() ranks with cosine similarity — confirm
# against the model card that this combination is intended.
model = SentenceTransformer("msmarco-distilbert-dot-v5")


In [6]:
# Encode the preprocessed text with the SentenceTransformer held in `model`.
# The whole column is passed to a single encode() call so the library can batch
# the texts, instead of running one separate forward pass per row.
# list(...) splits the returned 2-D array into one 1-D vector per row, so each
# cell of 'text_embedding' holds the embedding of its own document.
df['text_embedding'] = list(model.encode(df['preprocessed_text'].tolist()))

In [8]:
# Preview the first rows to confirm the 'preprocessed_text' and 'text_embedding' columns exist.
df.head()

Unnamed: 0,IAID,text,mentions,preprocessed_text,text_embedding
0,a7bb9917-95ff-3f55-a640-4c5afcec25f2,View towards SE of junction of Queen Victoria ...,"[{'ne_span': 'Queen Victoria Road', 'ne_start'...",view se junction queen victoria road high st e...,"[-0.036429767, 0.43713483, 0.050612718, -0.088..."
1,c29a7b77-7c46-3b85-88fe-05c8f4b2e384,"Front page of Bucks Free Press, Time capsule f...","[{'ne_span': 'Bucks Free Press', 'ne_start': 1...",page bucks free press time capsule clock house...,"[0.24973764, 0.45752427, 0.47631216, -0.098070..."
2,196c11e6-f7b6-392f-ae41-28653345087c,"High Wycombe Police Station, in Queen Victoria...","[{'ne_span': 'High Wycombe Police Station', 'n...",high wycombe police station queen victoria roa...,"[-0.021879869, 0.22485399, 0.26114935, 0.11417..."
3,7a5aace6-2398-3dcf-8843-37ff6ccea875,"Reference Library door, Queen Victoria Rd, Hig...","[{'ne_span': 'Reference Library', 'ne_start': ...",reference library door queen victoria rd high ...,"[0.1924246, 0.40368715, 0.1478213, -0.11580495..."
4,c66c4715-c03a-3aab-964b-e733f3ff1cf4,"Terrace of brick and flint cottages, Beech Rd,...","[{'ne_span': 'Beech Rd', 'ne_start': 37, 'ne_e...",terrace brick flint cottage beech rd wycombe m...,"[0.05957327, 0.42521647, 0.3429314, -0.1636258..."


In [28]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(query, model, k=5):
    """Print the k documents of the global ``df`` most similar to ``query``.

    The query goes through the same ``preprocess_text`` step as the documents,
    is embedded with ``model``, and is compared against every vector stored in
    ``df['text_embedding']`` using cosine similarity.

    Args:
        query: Free-text search string.
        model: Object whose ``encode`` method turns a list of strings into
            embeddings; it should be the model that produced
            ``df['text_embedding']`` so the vectors are comparable.
        k: Maximum number of results to print. Values of 0 or below print
            nothing; values larger than the number of documents print all of
            them.

    Returns:
        None. Each result is printed as the original text followed by its
        similarity score, best match first.
    """
    # Preprocess the query the same way the documents were preprocessed
    query = preprocess_text(query)

    # Encode the query (a one-element list yields a 2-D array with a single row)
    query_embedding = model.encode([query])

    # Stack the per-row vectors into one matrix and score the query against each row
    document_embeddings = np.array(df['text_embedding'].tolist())
    similarity_scores = cosine_similarity(query_embedding, document_embeddings).flatten()

    # Order best-first, then truncate. Reversing before slicing avoids the
    # `[-k:]` pitfall where k == 0 becomes `[-0:]` and selects every document.
    top_k_indices = np.argsort(similarity_scores)[::-1][:max(k, 0)]

    # Retrieve and print the corresponding texts
    for index in top_k_indices:
        most_similar_text = df.iloc[index]['text']
        print(f"Similar text: {most_similar_text}\nSimilarity score: {similarity_scores[index]}\n---")



In [36]:
# Prepare the data for saving: the intermediate 'preprocessed_text' column is
# not part of the saved output.
# drop(..., errors='ignore') keeps this cell safe to re-run; `del df[...]`
# raises KeyError on a second run because the column is already gone.
df = df.drop(columns=['preprocessed_text'], errors='ignore')
df.head()


Unnamed: 0,IAID,text,mentions,text_embedding
0,a7bb9917-95ff-3f55-a640-4c5afcec25f2,View towards SE of junction of Queen Victoria ...,"[{'ne_span': 'Queen Victoria Road', 'ne_start'...","[-0.036429767, 0.43713483, 0.050612718, -0.088..."
1,c29a7b77-7c46-3b85-88fe-05c8f4b2e384,"Front page of Bucks Free Press, Time capsule f...","[{'ne_span': 'Bucks Free Press', 'ne_start': 1...","[0.24973764, 0.45752427, 0.47631216, -0.098070..."
2,196c11e6-f7b6-392f-ae41-28653345087c,"High Wycombe Police Station, in Queen Victoria...","[{'ne_span': 'High Wycombe Police Station', 'n...","[-0.021879869, 0.22485399, 0.26114935, 0.11417..."
3,7a5aace6-2398-3dcf-8843-37ff6ccea875,"Reference Library door, Queen Victoria Rd, Hig...","[{'ne_span': 'Reference Library', 'ne_start': ...","[0.1924246, 0.40368715, 0.1478213, -0.11580495..."
4,c66c4715-c03a-3aab-964b-e733f3ff1cf4,"Terrace of brick and flint cottages, Beech Rd,...","[{'ne_span': 'Beech Rd', 'ne_start': 37, 'ne_e...","[0.05957327, 0.42521647, 0.3429314, -0.1636258..."


In [37]:
# Save the DataFrame as JSON Lines: orient='records' together with lines=True
# writes one JSON object per row.
# NOTE(review): the file name spells "embddings" (sic); it is left unchanged
# because other code may load the file by this exact name — confirm before renaming.
# NOTE(review): this is an absolute, user-specific path; consider deriving it
# from a configurable data directory.
output_file_path = '/Users/thebekhruz/Desktop/100Days-Of-Code/100-Days-of-NLP-Odyssey/data/processed/data_with_contextual_relevance_and_doc_embddings.jsonl'
df.to_json(output_file_path, orient='records', lines=True)
print(f'Data saved to {output_file_path}')

Data saved to /Users/thebekhruz/Desktop/100Days-Of-Code/100-Days-of-NLP-Odyssey/data/processed/data_with_contextual_relevance_and_doc_embddings.jsonl


## Semantic Search, putting everything together.

In [None]:
# Load data 



##### Testing

In [29]:
# General query about a location.
# k is not passed, so search() falls back to its default of 5 results.
query = "What is Queen Victoria Road in High Wycombe known for?"
search(query, model)

Similar text: Queen Victoria Rd under construction, High Wycombe, about 1901
Construction of Queen Victoria Rd north of R. Wye
Similarity score: 0.9143281579017639
---
Similar text: High Wycombe Police Station, Queen Victoria Road, High Wycombe about 1935
New Police Station High Wycombe
Similarity score: 0.908988893032074
---
Similar text: New Police Station in Queen Victoria Road, High Wycombe in 1937
New Police Station High Wycombe
Similarity score: 0.8999471068382263
---
Similar text: High Wycombe Police Station, in Queen Victoria Road, High Wycombe. Oct 1935
New Police Station High Wycombe viewed from opposite side of the road
Similarity score: 0.8994211554527283
---
Similar text: High Wycombe Police Station and Town Hall Queen Victoria Road, High Wycombe. late 1935
Police Station and Town Hall viewed side on and commemorative bridge over River Wye on western side
Similarity score: 0.8928785920143127
---


In [11]:
# Specific event or item query (top 3 results):
query = "Information about the time capsule found in High Wycombe."
search(query, model, k=3)

# The model does not perform well on specific events when ranking by semantic
# similarity alone, without taking contextual relevance into account.

Similar text: Postcard showing a view of the entrance hall Wycombe Abbey School, High Wycombe. date unknown
Interior of Wycombe Abbey School, entrance hall with American Plaque, Gothic style window and ceiling, table chairs and carpet, picture/hanging over fireplace
Similarity score: 0.8265578746795654
---
Similar text: View of S. side of Wycombe Abbey School, High Wycombe, date unknown
View of Wycombe Abbey School showing cloisters on S. side
Similarity score: 0.8248274326324463
---
Similar text: High Wycombe Police Station, Queen Victoria Road, High Wycombe about 1935
New Police Station High Wycombe
Similarity score: 0.8244389295578003
---


In [12]:
# Cultural or historical site query: ask where a named place is (top 3 results).
query = "Where is the Reference Library located in High Wycombe?"
search(query, model, k=3)



Similar text: Two Library staff in the Reference Library, Queen Victoria Rd, High Wycombe. about 1992
Reference Library
Similarity score: 0.9056596159934998
---
Similar text: Reference Library, Queen Victoria Rd, High Wycombe. 1993 to 1994
Reference Library
Similarity score: 0.8937884569168091
---
Similar text: Reference Library door, Queen Victoria Rd, High Wycombe. about 1992
Corridor entrance to Reference Library
Similarity score: 0.8874964714050293
---


In [13]:
# Broad historical or cultural query: no single named entity to match (top 3 results).
query = "Historical landmarks in High Wycombe."
search(query, model, k=3)



Similar text: Looking SE, a view of the N front of Wycombe Abbey from the drive, Abbey Grounds, High Wycombe. Circa 1895
The North front of Wycombe Abbey, from the drive
Similarity score: 0.862769603729248
---
Similar text: High Wycombe Police Station, Queen Victoria Road, High Wycombe about 1935
New Police Station High Wycombe
Similarity score: 0.8619149923324585
---
Similar text: Looking N, a view of the front of Wycombe Museum (formerly Castle Hill House), with two benches against the wall. Priory Ave, High Wycombe. early 1990's
Front of Wycombe Museum, formerly Castle Hill House, with two benches against the wall The drive runs along the right.
Similarity score: 0.8610037565231323
---


### Implementing the formula.