In [4]:
from pathlib import Path

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer

In [5]:
# Load the dataset. Three columns carry metadata (date, date number, ILI rate);
# every other column is treated as a query column.
df = pd.read_csv('../../data/smooth_df.csv')

date_column = 'Date'
date_number_column = 'Date Number'
ili_rate_column = 'ILI Rate'

# Build the exclusion set once instead of re-creating a list for every column.
non_query_columns = {date_column, date_number_column, ili_rate_column}
query_columns = [col for col in df.columns if col not in non_query_columns]

# Go through the constant rather than a repeated literal so the column name
# is defined in exactly one place.
df[date_column] = pd.to_datetime(df[date_column], format='%Y-%m-%d')

print(df.shape)

(6933, 17203)


In [6]:
def get_model_info(model_name):
    """Print basic facts about a sentence-embedding model and its tokenizer.

    Loads the model with ``SentenceTransformer`` and the tokenizer with
    ``AutoTokenizer``, then prints the sentence embedding dimension, the
    vocabulary size, and whether the token ``'nhs'`` is in the vocabulary.

    Args:
        model_name: Model identifier. A bare name such as
            ``'all-MiniLM-L12-v2'`` is looked up under the
            ``sentence-transformers/`` namespace when loading the tokenizer;
            a name that already contains ``/`` is used unchanged.

    Returns:
        None. Everything is written to stdout.
    """
    print("Model: ", model_name)

    model = SentenceTransformer(model_name)
    print("Sentence Embedding Dimension: ", model.get_sentence_embedding_dimension())

    # Only bare names get the namespace; prefixing an already-qualified id
    # (or a path) would yield a name that cannot be resolved.
    if '/' in model_name:
        tokenizer_name = model_name
    else:
        tokenizer_name = f'sentence-transformers/{model_name}'
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # get_vocab() is the generic tokenizer accessor; testing membership on the
    # returned mapping avoids copying every key into a set.
    vocab = tokenizer.get_vocab()
    print("Vocab Length: ", len(vocab), 'NHS: ', 'nhs' in vocab)
    print('\n')

# Seed queries: each one is compared against the query columns.
base_queries = [
    'Flu',
    'Flu NHS',
    'Influenza',
    'pregnancy flu',
    'baby flu',
    'flu symptoms',
    'how to get rid of flu',
    'flu vaccine nhs',
    'flu medicine',
]

# Result tables keyed by base query; filled in by compute_cosine_similarities.
cosine_similarity_dfs = dict()

def compute_cosine_similarities(model_name, output_dir='results'):
    """Rank every query column by cosine similarity to each base query.

    The module-level ``query_columns`` are embedded once with the given model.
    Then, for each entry of the module-level ``base_queries``, the entry is
    embedded, its similarity to every query column is computed, and the
    result is sorted in descending order. Each table is stored in the
    module-level ``cosine_similarity_dfs`` dict under its base query and
    written to ``<output_dir>/<base_query>_cosine_similarities.csv``.

    Args:
        model_name: Identifier passed to ``SentenceTransformer``.
        output_dir: Directory that receives the CSV files. It is created
            (including parents) when it does not exist yet.

    Returns:
        None. Results are exposed through ``cosine_similarity_dfs`` and the
        CSV files.
    """
    print("Model: ", model_name)

    model = SentenceTransformer(model_name)

    # to_csv does not create missing parent directories; create the target
    # before the encoding work so a bad location fails early.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # The query-column embeddings are shared by every base query below.
    query_embeddings = model.encode(query_columns)

    for base_query in base_queries:
        base_query_embedding = model.encode([base_query])
        print(f"{base_query} Base Query Embeddings: ", base_query_embedding.shape)

        # A single base query is passed in, so row 0 holds its similarity to
        # each query column.
        cosine_similarities = cosine_similarity(base_query_embedding, query_embeddings)
        cosine_similarities_df = (
            pd.DataFrame({"Query": query_columns, "Cosine Similarity": cosine_similarities[0]})
            .sort_values(by='Cosine Similarity', ascending=False)
            .reset_index(drop=True)
        )
        cosine_similarity_dfs[base_query] = cosine_similarities_df
        # The index is written as well, so the first CSV column is the
        # 0-based position in the sorted ranking.
        cosine_similarities_df.to_csv(output_path / f'{base_query}_cosine_similarities.csv')

In [7]:
# Model identifier handed to both helpers below.
bert_sentence_model = "all-MiniLM-L12-v2"

# Print the model/tokenizer summary first, then compute and save the
# per-base-query similarity rankings.
get_model_info(bert_sentence_model)
compute_cosine_similarities(bert_sentence_model)

Model:  all-MiniLM-L12-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence Embedding Dimension:  384
Vocab Length:  30522 NHS:  True


Model:  all-MiniLM-L12-v2
Flu Base Query Embeddings:  (1, 384)
Flu NHS Base Query Embeddings:  (1, 384)
Influenza Base Query Embeddings:  (1, 384)
pregnancy flu Base Query Embeddings:  (1, 384)
baby flu Base Query Embeddings:  (1, 384)
flu symptoms Base Query Embeddings:  (1, 384)
how to get rid of flu Base Query Embeddings:  (1, 384)
flu vaccine nhs Base Query Embeddings:  (1, 384)
flu medicine Base Query Embeddings:  (1, 384)


In [8]:
# Stack the per-base-query tables into one long frame, tagging each row with
# the base query it was scored against. The loop variable is deliberately not
# called `df`, so it cannot be confused with the dataset frame loaded earlier.
concatenated_df = pd.concat(
    [
        similarity_df.assign(base_query=base_query)
        for base_query, similarity_df in cosine_similarity_dfs.items()
    ]
)

# Mean similarity per query across all base queries, highest first.
average_df = (
    concatenated_df
    .groupby('Query')['Cosine Similarity']
    .mean()
    .reset_index()
    .sort_values(by='Cosine Similarity', ascending=False)
    .reset_index(drop=True)
)

# to_csv does not create missing parent directories, so ensure the target
# folder exists before writing.
average_path = Path('results') / 'average_cosine_similarities.csv'
average_path.parent.mkdir(parents=True, exist_ok=True)
average_df.to_csv(average_path)