### $\text{Score} = \text{Semantic Similarity} + \lambda \times \text{Contextual Relevance}$

## Goal:
* This notebook calculates the *Contextual Relevance* term of the score, using TF-IDF over the named entities mentioned in each document.


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
# Module-level spaCy pipeline; preprocess_text() calls it on every text it processes.
nlp = spacy.load("en_core_web_sm")  # Load the small English model

In [None]:
# Read the raw JSON Lines file (one JSON record per line) and preview the first 10 rows
input_file_path = '/Users/thebekhruz/Desktop/100Days-Of-Code/100-Days-of-NLP-Odyssey/data/raw/data_formatted_date.jsonl'
df = pd.read_json(path_or_buf=input_file_path, lines=True)
df.head(n=10)

In [None]:
# The 'relations' column carries no meaningful information, so remove it in place
df.drop(columns=['relations'], inplace=True)

In [None]:
# Drop rows whose 'mentions' value is missing (NaN/None) and report which case applied
has_missing_mentions = df['mentions'].isna().any()
if has_missing_mentions:
    df = df.dropna(subset=['mentions'])
    print('NaN mentions rows where removed.')
else:
    print('There are no empty mentions dictionaries in the database')

In [None]:
def extract_key_from_dict_list(dict_list, key):
    """Collect the values stored under ``key`` in a list of dictionaries.

    Elements that are not dictionaries, or that do not contain ``key``, are
    skipped. Any input that is not a ``list`` yields an empty list.
    """
    if not isinstance(dict_list, list):
        return []

    values = []
    for item in dict_list:
        if isinstance(item, dict) and key in item:
            values.append(item[key])
    return values

In [None]:

def apply_extraction_to_column(df, column_name, key, new_column_name):
    """Add ``new_column_name`` holding the ``key`` values found in each cell of ``column_name``.

    Every cell of ``df[column_name]`` is passed to ``extract_key_from_dict_list``
    together with ``key``. The DataFrame is modified in place and also returned.
    """
    def _values_for_cell(cell):
        return extract_key_from_dict_list(cell, key)

    extracted = df[column_name].apply(_values_for_cell)
    df[new_column_name] = extracted
    return df


# Pull every mention's 'ne_span' value into a new 'extracted_entities' column
df = apply_extraction_to_column(
    df,
    column_name='mentions',
    key='ne_span',
    new_column_name='extracted_entities',
)
df.head()


## TF-IDF analysis


In [None]:
def preprocess_text(text):
    """Normalise ``text`` into a space-separated string of lower-cased lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``; tokens
    flagged as stop words or punctuation are dropped, and the lemma of every
    remaining token is lower-cased.
    """
    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        kept_lemmas.append(token.lemma_.lower())
    return ' '.join(kept_lemmas)

<!-- * If you want to calculate the TF-IDF scores for all the words in your preprocessed text, then you should use preprocessed_text.
* If you are specifically interested in the TF-IDF scores for the named entities (i.e., ne_span) only, then you should use preprocessed_entities. -->

$\text{Score} = \text{Semantic Similarity} + \lambda \times \text{Contextual Relevance (TF-IDF)}$
#### Significance of Named Entities Within Documents:
* To prioritize named entities in documents, focus on *preprocessed_entities* for TF-IDF calculations. This emphasizes entity importance independently of surrounding text.
#### Semantic Similarity and Contextual Relevance:
* TF-IDF fine-tunes the $Score$ by giving more importance to specific named entities. Using *preprocessed_entities* provides a relevance score focused on the entities, without diluting it with the surrounding text.


In [None]:
# Preprocess every extracted entity string, producing one list of cleaned entities per row.
# Uncomment the next line if the full document text should be preprocessed too
# (i.e. if contextual information is considered important):
# df['preprocessed_text'] = df['text'].apply(preprocess_text)
df['preprocessed_entities'] = df['extracted_entities'].apply(
    lambda entities: list(map(preprocess_text, entities))
)


In [None]:
# Flatten the per-row entity lists into a single list of every preprocessed entity.
# A nested comprehension is linear; sum(lists, []) re-copies the accumulated
# list on every addition and becomes quadratic as the number of rows grows.
all_entities = [
    entity
    for row_entities in df['preprocessed_entities']
    for entity in row_entities
]

# One string with every entity separated by a space (a corpus-wide "document" of entities)
entities_text = ' '.join(all_entities)

# One "document" per row: that row's preprocessed entities joined by spaces
entities_documents = [' '.join(entities) for entities in df['preprocessed_entities']]

# Fit TF-IDF on the per-row entity documents
entity_vectorizer = TfidfVectorizer()
entity_tfidf_matrix = entity_vectorizer.fit_transform(entities_documents)

# Dense score table: rows labelled by IAID, one column per vocabulary term,
# so a score can be looked up by document ID and word.
tfidf_df = pd.DataFrame(
    entity_tfidf_matrix.toarray(),
    index=df['IAID'],
    columns=entity_vectorizer.get_feature_names_out(),
)

# Print the shape of the TF-IDF matrix
print(f'The shape of the TF-IDF matrix is: {entity_tfidf_matrix.shape}')


In [None]:
def get_tfidf_score(word, vectorizer, tfidf_df, iaid):
    """Return the TF-IDF score of ``word`` in the document identified by ``iaid``.

    Args:
        word: Term to look up.
        vectorizer: Fitted vectorizer exposing ``vocabulary_`` (term -> column position).
        tfidf_df: Score table indexed by IAID whose column labels are the
            vectorizer's feature names.
        iaid: Index label of the document row in ``tfidf_df``.

    Returns:
        The value at ``tfidf_df.loc[iaid, word]`` when ``word`` is in the
        vocabulary, otherwise ``0``.
    """
    if word not in vectorizer.vocabulary_:
        # Words the vectorizer never saw contribute nothing
        return 0
    # The feature name at a term's vocabulary position is the term itself, so the
    # column can be addressed by ``word`` directly instead of rebuilding the whole
    # feature-name array on every call.
    return tfidf_df.loc[iaid, word]

def add_tfidf_scores_to_mentions(row, vectorizer, tfidf_df):
    """Annotate each mention of ``row`` with the summed TF-IDF score of its ``ne_span`` words.

    For every mention dictionary in ``row['mentions']`` the ``ne_span`` text is
    lower-cased and split on whitespace; the scores of the resulting words in
    the document ``row['IAID']`` are summed and stored in the mention under
    ``'total_tfidf_score'``. Mentions are modified in place; nothing is returned.

    Entries that are not dictionaries, or whose ``ne_span`` is missing or not a
    string, are skipped instead of raising, matching the tolerance of
    ``extract_key_from_dict_list``.

    Args:
        row: DataFrame row providing ``'mentions'`` and ``'IAID'``.
        vectorizer: Fitted vectorizer forwarded to ``get_tfidf_score``.
        tfidf_df: Score table forwarded to ``get_tfidf_score``.
    """
    mentions = row['mentions']
    iaid = row['IAID']  # Use IAID to reference the document in tfidf_df
    for mention in mentions:
        if not isinstance(mention, dict):
            continue
        span = mention.get('ne_span')
        if not isinstance(span, str):
            continue
        words = span.lower().split()
        # NOTE: words are only lower-cased and whitespace-split here; apply
        # `preprocess_text` instead if they must match the preprocessed vocabulary.
        total_score = sum(get_tfidf_score(word, vectorizer, tfidf_df, iaid) for word in words)
        mention['total_tfidf_score'] = total_score

# Annotate every mention in place. The apply() result is discarded because
# add_tfidf_scores_to_mentions works by side effect and returns None.
df.apply(lambda row: add_tfidf_scores_to_mentions(row, entity_vectorizer, tfidf_df), axis=1)

# Preview the first mention of the first remaining row. Positional .iloc is used
# because rows may have been dropped earlier, so index label 0 is not guaranteed to exist.
df['mentions'].iloc[0][0]



### Prepare data for saving

In [None]:
# Remove the helper columns that are no longer needed, to decrease the size of the saved data.
# Add 'preprocessed_text' to the tuple if that column was created.
for helper_column in ('preprocessed_entities', 'extracted_entities'):
    del df[helper_column]

In [None]:
# Write the enriched DataFrame to 'data_with_contextual_relevance.jsonl' as JSON Lines (one record per line)
output_file_path = '/Users/thebekhruz/Desktop/100Days-Of-Code/100-Days-of-NLP-Odyssey/data/processed/data_with_contextual_relevance.jsonl'
df.to_json(path_or_buf=output_file_path, orient='records', lines=True)
print(f'Data saved to {output_file_path}')