In [2]:
import gc, os, pickle, sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
from random import sample, shuffle
from math import ceil

# EMBEDDINGS
from sklearn.metrics.pairwise import cosine_similarity # ALL
from bert_serving.client import BertClient             # BERT
from bert_serving.server.graph import optimize_graph   # BERT
from bert_serving.server.helper import get_args_parser # BERT
import spacy                                           # GLoVe
import tensorflow as tf                                # USE & BERT
import tensorflow_hub as hub                           # USE
from gensim.test.utils import datapath                 # Word2Vec & FastText
from gensim.models import KeyedVectors                 # Word2Vec
from gensim.models.fasttext import load_facebook_vectors, load_facebook_model # FastText

# Presumably the project's SQLite database file (sqlite3 is imported above);
# not referenced by any of the cells visible in this notebook.
DB_PATH = 'data/db/spotify.db'
# Presumably a random seed; not referenced by any of the cells visible in this notebook.
SEED = 413

# Embedding Similarity Across Models

**The following notebook includes Semantic Similarity comparisons between several different models**:
 - BERT
 - Universal Sentence Encoder
 - Word2Vec
 - GLoVe
 - FastText

The Semantic Similarity scores will be used to evaluate whether or not a playlist name, returned by Spotify's API, is relevant to the search query which was used. This is part of a larger project to build a new type of spectrogram.

**Data to Generate**:
 - Pln Similarity:
     - Max of each query
     - Avg of each query
 - Max Token Similarity:
     - Max of each query against each pln token
     - Avg of max of each query against each pln token

In [6]:
def process_query_operators(query: str) -> list:
    '''Convert query into a list, splitting where there is a Spotify search operator.

    Operators are only recognised as standalone, space-delimited, upper-case
    words (' NOT ', ' OR ', ' AND ') - exactly the strings the query is split on.

    Args:
        query: raw search query string.

    Returns:
        A list of sub-query strings:
          - NOT:       only the text before the first ' NOT ' is kept
          - OR / AND:  one entry per operand
          - otherwise: a single-element list holding the query itself
        Precedence is NOT, then OR, then AND; only the first operator kind that
        is present is applied.
    '''
    # Test for the delimited operator rather than the bare letters, so a word that
    # merely contains them (e.g. "NOTHING", "ORANGE") cannot shadow a real operator
    # that appears later in the same query.
    if ' NOT ' in query:
        return [query.split(' NOT ')[0]]
    if ' OR ' in query:
        return query.split(' OR ')
    if ' AND ' in query:
        return query.split(' AND ')
    return [query]

def get_pln_similarity(queries: list, pln: str, row_idx: int, sim_func):
    '''Get the similarity between the query (or queries if an operator was used) and the playlist name.

    Args:
        queries:  list of sub-query strings for one row.
        pln:      playlist name of that row.
        row_idx:  row label, only used to print progress every 10,000 rows.
        sim_func: callable `(query, pln) -> similarity score`.

    Returns:
        `(max, mean)` of the similarities between each sub-query and `pln`.
        If `sim_func` (or `max` on an empty list) raises ValueError / KeyError,
        the error is printed and `(nan, nan)` is returned.
    '''
    if row_idx % 10000 == 0: print(f'Row: {row_idx}')
    try:
        sims = [sim_func(q, pln) for q in queries]
        return max(sims), np.mean(sims)
    except (ValueError, KeyError) as e:
        print(f'{type(e).__name__}: {e}')
        # Always hand back a pair: callers unpack the per-row results with
        # zip(*...), which breaks on a bare scalar nan.
        return np.nan, np.nan

def get_max_token_similarity(queries: list, pln: str, row_idx: int, sim_func):
    '''Get the max similarities of the query (or queries if operator was used) compared to each token of the pln.

    Args:
        queries:  list of sub-query strings for one row.
        pln:      playlist name of that row; tokenised with `str.split()`.
        row_idx:  row label, only used to print progress every 10,000 rows.
        sim_func: callable `(query, token) -> similarity score`.

    Returns:
        `(max, mean)` over the sub-queries of each sub-query's best-matching token.
        `(1., 1.)` as soon as any sub-query is a substring of `pln` or vice versa.
        `(nan, nan)` (after printing the error) if a ValueError / KeyError is
        raised, e.g. a missing embedding or a playlist name with no tokens.
    '''
    if row_idx % 10000 == 0: print(f'Row: {row_idx}')

    # shortcut: a sub-query contained in the playlist name (or the reverse)
    # counts as a perfect match
    for q in queries:
        if q in pln or pln in q:
            return 1., 1.

    # return the max of the similarity for each token
    try:
        tokens = pln.split()  # hoisted: identical for every sub-query
        sim_max = [max(sim_func(q, token) for token in tokens) for q in queries]
        return max(sim_max), np.mean(sim_max)
    except (ValueError, KeyError) as e:
        print(f'{type(e).__name__}: {e}')
        # Always hand back a pair: callers unpack the per-row results with
        # zip(*...), which breaks on a bare scalar nan.
        return np.nan, np.nan

def add_model_avg_similarity(df, model: str):
    '''Calculate the averages between similarity scores for a given model.

    Reads the four base columns `{model}_max_pln_similarity`,
    `{model}_avg_pln_similarity`, `{model}_max_token_max_similarity` and
    `{model}_avg_token_max_similarity` and adds three columns to `df` in place:

      - `{model}_overall_avg_max_similarity`: mean of the two "max" columns
      - `{model}_overall_avg_avg_similarity`: mean of the two "avg" columns
      - `{model}_overall_similarity`:         row-wise mean of all four base columns

    Args:
        df:    DataFrame holding the four base columns (mutated in place).
        model: column prefix, e.g. 'bert' or 'use_dan'.

    Returns:
        The same DataFrame object, for convenient re-assignment.

    Raises:
        KeyError: if any of the four base columns is missing.
    '''
    max_pln = f'{model}_max_pln_similarity'
    avg_pln = f'{model}_avg_pln_similarity'
    max_tok = f'{model}_max_token_max_similarity'
    avg_tok = f'{model}_avg_token_max_similarity'

    df[f'{model}_overall_avg_max_similarity'] = (df[max_pln] + df[max_tok]) / 2
    df[f'{model}_overall_avg_avg_similarity'] = (df[avg_pln] + df[avg_tok]) / 2
    # Average exactly the four base columns. Selecting by substring
    # (f'{model}_' in col) also picked up unrelated columns whose name merely
    # contains the prefix, which skewed the overall mean.
    df[f'{model}_overall_similarity'] = df[[max_pln, avg_pln, max_tok, avg_tok]].mean(axis = 1)
    return df

# Load Data

In [8]:
# Read the pre-processed rows, keep only those with a non-empty playlist name
# (as an independent copy), and attach each row's operator-split sub-queries.
df = pd.read_feather('processed_batch_uniq_plns_spec_char_edit')
df = df.loc[df.playlist_name.str.len() > 0].copy(deep = True)
df['proc_query'] = df['query'].apply(process_query_operators)

In [9]:
# get unique playlist names (plns) from data
plns = df.playlist_name.unique().tolist()

# process queries with search operators and dedup the resulting sub-queries;
# process_query_operators always returns a list, so no type check is needed
queries = sorted({sub_q
                  for q in df['query'].unique().tolist()
                  for sub_q in process_query_operators(q)})

# add to single collection for quicker processing: every sub-query, every pln and
# every pln token. Tokens come from str.split() with no `' ' in pln` guard so the
# set matches the tokenisation get_max_token_similarity applies later (which also
# splits on tabs / other whitespace).
all_queries_and_plns = set(queries) | set(plns)
for pln in plns:
    all_queries_and_plns.update(pln.split())

# dedup is implicit in the sets; sort because set iteration order of strings can
# change between interpreter runs, which would reshuffle the embedding batches
all_queries_and_plns = sorted(all_queries_and_plns)
all_tokens = sorted({token for text in all_queries_and_plns for token in text.split()})

# pickle for BERT
# with open('all_queries_and_plns_list.pkl', 'wb') as f:
#     pickle.dump(all_queries_and_plns, f, protocol = pickle.HIGHEST_PROTOCOL)

print(f'All Tokens Count:           {len(all_tokens)}')
print(f'All Queries & PLNs Count:  {len(all_queries_and_plns)}')

All Tokens Count:           96923
All Queries & PLNs Count:  540612


# Get BERT Similarity

## `BERT & TensorFlow`

Note: Using the client/server model provided by `bert-as-a-service` was good for one-off tasks, but was incredibly slow for this use case. To remedy that, I exported the TensorFlow graph, generated embeddings for each of the tokens and documents (in this case, `playlist_name`), and then calculated similarity here.

To see how the embeddings were generated, see the `BERT Embeddings (on GPU)` notebook in this repo.

In [84]:
def get_bert_doc_similarity(query: str, pln: str):
    '''Cosine similarity of the stored BERT document embeddings (SEQ_LEN = 22) of `query` and `pln`.'''
    query_emb = bert_doc_embeds[query]
    pln_emb = bert_doc_embeds[pln]
    sim_matrix = cosine_similarity(query_emb, pln_emb)
    # a single pair was compared, so the score is the only cell of the matrix
    return sim_matrix[0][0]

def get_bert_token_similarity(query: str, pln: str):
    '''Cosine similarity of the query's BERT document embedding and the BERT token embedding (min, SEQ_LEN = 4) stored for `pln`.'''
    query_emb = bert_doc_embeds[query]    # query side comes from the document embeddings
    token_emb = bert_token_embeds[pln]    # `pln` side comes from the token embeddings
    sim_matrix = cosine_similarity(query_emb, token_emb)
    return sim_matrix[0][0]

In [11]:
# Load the pre-computed BERT embedding lookups read by the similarity functions.
# NOTE: pickle.load executes arbitrary code - only load files you produced yourself.
with open('bert_doc_embeds.pkl', 'rb') as doc_embeds_file:
    bert_doc_embeds = pickle.load(doc_embeds_file)

with open('bert_token_embeds.pkl', 'rb') as token_embeds_file:
    bert_token_embeds = pickle.load(token_embeds_file)

In [9]:
# Spot-check five random rows (unseeded, so the selection changes on every run).
df.sample(5)

Unnamed: 0,query,track_id,playlist_name,search_rank,proc_query
1154108,happy hardcore,7brm4QECDelvH883WSJZgK,happy hardcore,448,[happy hardcore]
353278,gleeful,4hBW3h6FnQNh7NRmyxLLG7,gleeful,70,[gleeful]
1003741,introspection,1yepm3wDAHedCHhxVKzTtL,introspective,1504,[introspection]
533284,silly,4Km5HrUvYTaSUfiSGPJeQR,silly salmon,562,[silly]
227253,pop,09IStsImFySgyp0pIQdqAc,clean pop playlist,334,[pop]


In [85]:
def _bert_pln_scores(row):
    '''(max, avg) BERT document similarity between one row's sub-queries and its playlist name.'''
    return get_pln_similarity(row['proc_query'], row['playlist_name'], row.name, get_bert_doc_similarity)

def _bert_token_scores(row):
    '''(max, avg) of the best per-token BERT similarity for one row.'''
    return get_max_token_similarity(row['proc_query'], row['playlist_name'], row.name, get_bert_token_similarity)

# each apply yields one (max, avg) pair per row; zip(*...) transposes them into two columns
df['bert_max_pln_similarity'], df['bert_avg_pln_similarity'] = zip(
    *df.apply(_bert_pln_scores, axis = 1)
)
df['bert_max_token_max_similarity'], df['bert_avg_token_max_similarity'] = zip(
    *df.apply(_bert_token_scores, axis = 1)
)

df = add_model_avg_similarity(df, 'bert')

In [39]:
#df.drop('proc_query', 1).reset_index(drop = True).to_feather('final_uniq_plns_WITH_BERT')

In [38]:
# (rows, columns) of the frame at this point.
df.shape

(1402119, 12)

# Universal Sentence Encoder

Reference: http://www.nishanpantha.com.np/programming/universal-sentence-encoder-semantic-search.html

In [90]:
def get_USE_embeddings(text, embed_model):
    '''Embed one string or a list of strings with a Universal Sentence Encoder module.

    Args:
        text:        a single string or a list of strings.
        embed_model: callable applied to the list of strings (a `hub.Module` in this notebook).

    Returns:
        The result of evaluating `embed_model(text)` in a fresh `tf.Session`
        (presumably one embedding row per input string - TODO confirm shape).
    '''
    # isinstance instead of an exact type check so str subclasses (e.g. numpy.str_)
    # are wrapped into a one-element list as well
    if isinstance(text, str):
        text = [text]
    # NOTE(review): every call opens a new session, re-runs the initializers and
    # calls embed_model again - fine for a handful of large batches, costly for
    # many small calls.
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(embed_model(text))

def get_emb_dict(word_list, model):
    '''Map every string in `word_list` to its USE embedding, reshaped to a single row (1, -1).'''
    embeddings = get_USE_embeddings(word_list, model)
    emb_by_word = {}
    for word, emb in zip(word_list, embeddings):
        # keep a 2-D (1, n) row so it can be passed straight to cosine_similarity
        emb_by_word[word] = emb.reshape(1, -1)
    return emb_by_word

## Get Embeddings

**Note**: USE Embeddings are quite memory intensive; unless you are working on a remote machine, you'll probably only be able to load one model at a time.

In [11]:
# TF-Hub handles of the two Universal Sentence Encoder modules compared below:
# the base encoder (v2, referred to here as "DAN") and the large encoder
# (v3, referred to here as "Transformer").
DAN_module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
TRANS_module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"

In [9]:
# Which USE module the model-selection cell loads: 'transformer' or 'dan'.
# Swap the commented line to generate the other model's embeddings.
MODEL = 'transformer'
#MODEL = 'dan'

In [5]:
# set model
if MODEL == 'transformer':
    embed = hub.Module(TRANS_module_url)
    write_fname = 'guse_all-data_new-spec-char_transformer.pkl'

if MODEL == 'dan':
    embed = hub.Module(DAN_module_url)
    write_fname = 'guse_all-data_new-spec-char_dan.pkl'

In [4]:
batches = 100
# Round the batch size UP so the batches cover every string. Floor division
# silently dropped the trailing `len % batches` items, and produced a batch size
# of 0 (nothing embedded at all) for fewer than `batches` strings.
batch_size = max(1, ceil(len(all_queries_and_plns) / batches))

saved_emb = {}

for i, start in enumerate(range(0, len(all_queries_and_plns), batch_size)):

    print(f'Batch: {i+1}')
    print(f'Saved Emb Size: {len(saved_emb)}')
    print()

    batch = all_queries_and_plns[start:start + batch_size]
    # update in place instead of rebuilding the whole dict for every batch
    saved_emb.update(get_emb_dict(batch, embed))

    # checkpoint after every batch so an interrupted run keeps the finished batches
    with open(write_fname, 'wb') as f:
        pickle.dump(saved_emb, f, protocol = pickle.HIGHEST_PROTOCOL)

## Calculate Similarity

In [17]:
def get_use_transformer_similarity(query: str, pln: str):
    '''Cosine similarity of the stored USE Transformer embeddings of `query` and `pln`.'''
    query_emb, pln_emb = emb_t[query], emb_t[pln]
    sim_matrix = cosine_similarity(query_emb, pln_emb)
    return sim_matrix[0][0]

def get_use_dan_similarity(query: str, pln: str):
    '''Cosine similarity of the stored USE DAN embeddings of `query` and `pln`.'''
    query_emb, pln_emb = emb_d[query], emb_d[pln]
    sim_matrix = cosine_similarity(query_emb, pln_emb)
    return sim_matrix[0][0]

In [13]:
# Load the pickled USE embedding lookups for both models (Transformer -> emb_t, DAN -> emb_d).
# NOTE: pickle.load executes arbitrary code - only load files you produced yourself.
with open('guse_all-data_new-spec-char_transformer.pkl', 'rb') as transformer_file:
    emb_t = pickle.load(transformer_file)

with open('guse_all-data_new-spec-char_dan.pkl', 'rb') as dan_file:
    emb_d = pickle.load(dan_file)

In [14]:
# Spot-check three random rows before the USE columns are added.
df.sample(3)

Unnamed: 0,query,track_id,playlist_name,search_rank,bert_max_pln_similarity,bert_avg_pln_similarity,bert_max_token_max_similarity,bert_avg_token_max_similarity,bert_overall_avg_max_similarity,bert_overall_avg_avg_similarity,bert_overal_similarity,proc_query
1036008,smoke,0TCnOEVeLQMXOUrpPlM7uY,when you high,519,0.848013,0.848013,0.892566,0.892566,0.870289,0.870289,0.870289,[smoke]
1004237,melancholic,7wwp2dzuw5SyB5Up6Gf5O2,melancholic chill,1095,0.938558,0.938558,1.0,1.0,0.969279,0.969279,0.969279,[melancholic]
30374,deep house,61HZRStXWgL7CXkw3yW2rv,em casa sunset,966,0.889679,0.889679,0.855499,0.855499,0.872589,0.872589,0.872589,[deep house]


In [21]:
# similarity function for each USE model, keyed by the name used in the column prefix
use_models = {
    'transformer': get_use_transformer_similarity,
    'dan': get_use_dan_similarity,
}

for model_name, use_sim_func in use_models.items():

    def _use_pln_scores(row, sim_func = use_sim_func):
        '''(max, avg) similarity between one row's sub-queries and its playlist name.'''
        return get_pln_similarity(row['proc_query'], row['playlist_name'], row.name, sim_func)

    def _use_token_scores(row, sim_func = use_sim_func):
        '''(max, avg) of the best per-token similarity for one row.'''
        return get_max_token_similarity(row['proc_query'], row['playlist_name'], row.name, sim_func)

    print(f'\nAdding pln_similarity for: {model_name}\n')
    df[f'use_{model_name}_max_pln_similarity'], df[f'use_{model_name}_avg_pln_similarity'] = zip(
        *df.apply(_use_pln_scores, axis = 1)
    )

    print(f'\nAdding token_max_similarity for: {model_name}\n')
    df[f'use_{model_name}_max_token_max_similarity'], df[f'use_{model_name}_avg_token_max_similarity'] = zip(
        *df.apply(_use_token_scores, axis = 1)
    )

    print(f'\nAdding overall_similarity for: {model_name}\n')
    df = add_model_avg_similarity(df, f'use_{model_name}')

# add extra avg overall col for use since there are two different models
# (averages the two models' `overall_avg_avg` columns)
df['use_overall_similarity'] = (
    df['use_transformer_overall_avg_avg_similarity'] + df['use_dan_overall_avg_avg_similarity']
) / 2

In [22]:
# Spot-check three random rows now that the USE columns are present.
df.sample(3)

Unnamed: 0,query,track_id,playlist_name,search_rank,bert_max_pln_similarity,bert_avg_pln_similarity,bert_max_token_max_similarity,bert_avg_token_max_similarity,bert_overall_avg_max_similarity,bert_overall_avg_avg_similarity,...,use_transformer_overall_avg_avg_similarity,use_transformer_overall_similarity,use_dan_max_pln_similarity,use_dan_avg_pln_similarity,use_dan_max_token_max_similarity,use_dan_avg_token_max_similarity,use_dan_overall_avg_max_similarity,use_dan_overall_avg_avg_similarity,use_dan_overall_similarity,use_overall_similarity
1288606,sport OR sports,2LPUvD5DDOO4UYGkWgjI2C,sport,1109,1.0,0.960884,1.0,1.0,1.0,0.980442,...,0.969096,0.984548,1.0,0.919925,1.0,1.0,1.0,0.959963,0.979981,0.96453
437517,commute,1etiUDkISHELzQGMY79ryt,commute,1032,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
637871,unforgettable,2xjMTChMsBkonVJJtO2kZZ,the best of opera,75,0.791004,0.791004,0.78657,0.78657,0.788787,0.788787,...,0.556367,0.556367,0.445041,0.445041,0.63021,0.63021,0.537625,0.537625,0.537625,0.546996


In [24]:
# Checkpoint the BERT + USE scores. `axis` is passed by keyword because pandas 2.0
# no longer accepts it positionally in DataFrame.drop. The index is reset because
# to_feather requires a default index; `proc_query` is left out of the export
# (presumably because it holds Python lists).
df.drop('proc_query', axis = 1).reset_index(drop = True).to_feather('final_uniq_plns_WITH_BERT&USE')

# Word2Vec

Word2Vec doesn't support OOV (out-of-vocabulary) tokens so, depending on your use case, it might not be that helpful.

In [10]:
def get_w2v_similarity(query: str, pln: str):
    '''Word2Vec similarity of `query` and `pln`, taken from the Gensim vectors currently bound to `m`.'''
    w2v_score = m.similarity(query, pln)
    return w2v_score

In [1]:
# Load the pre-trained GoogleNews Word2Vec vectors (binary word2vec format) into `m`,
# the module-level name that get_w2v_similarity reads.
# NOTE(review): `datapath` is imported from gensim.test.utils, so the file is
# presumably expected inside gensim's bundled test-data directory - confirm.
w2v_path = datapath('word2vec_pretrain/GoogleNews-vectors-negative300.bin')
m = KeyedVectors.load_word2vec_format(w2v_path, binary = True)

In [12]:
# Spot-check three random rows before the Word2Vec columns are added.
df.sample(3)

Unnamed: 0,query,track_id,playlist_name,search_rank,bert_max_pln_similarity,bert_avg_pln_similarity,bert_max_token_max_similarity,bert_avg_token_max_similarity,bert_overall_avg_max_similarity,bert_overall_avg_avg_similarity,...,ft_wiki_overall_avg_avg_similarity,ft_wiki_overall_similarity,ft_crawl_max_pln_similarity,ft_crawl_avg_pln_similarity,ft_crawl_max_token_max_similarity,ft_crawl_avg_token_max_similarity,ft_crawl_overall_avg_max_similarity,ft_crawl_overall_avg_avg_similarity,ft_crawl_overall_similarity,proc_query
558968,motorcycle,60SdxE8apGAxMiRrpbmLY0,motorcycle,1031,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[motorcycle]
1309140,charts,4S8d14HvHb70ImctNgVzQQ,charts,4762,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[charts]
1069690,90s OR nineties,19YkX6WwgFKwCQ5edvLWvn,rock nacional anos 90 anos 2000,1143,0.74047,0.690026,0.841927,0.770233,0.791198,0.73013,...,0.109992,0.196221,0.319526,0.258843,0.566329,0.553025,0.442928,0.405934,0.424431,"[90s, nineties]"


In [17]:
def _w2v_token_scores(row):
    '''(max, avg) of the best per-token Word2Vec similarity for one row.'''
    return get_max_token_similarity(row['proc_query'], row['playlist_name'], row.name, get_w2v_similarity)

# only the token-level scores are computed for Word2Vec;
# zip(*...) transposes the per-row (max, avg) pairs into two columns
df['w2v_max_token_max_similarity'], df['w2v_avg_token_max_similarity'] = zip(
    *df.apply(_w2v_token_scores, axis = 1)
)

In [None]:
# Checkpoint the frame (without the proc_query column, on a fresh default index) after the Word2Vec step.
df.drop(columns = 'proc_query').reset_index(drop = True).to_feather('final_uniq_plns_WITH_BERT&USE&W2V')

In [None]:
# Drop the references to the Word2Vec vectors and their path before the next model is loaded.
del m, w2v_path

# GLoVe

In [None]:
# from warnings import filterwarnings
# filterwarnings("ignore", category = UserWarning)

def get_glove_similarity(query: str, pln: str):
    '''GLoVe similarity of `query` and `pln`, computed by spaCy on the documents produced by `nlp`.'''
    query_doc = nlp(query)
    pln_doc = nlp(pln)
    return query_doc.similarity(pln_doc)

In [6]:
# Load the spaCy package whose vectors back get_glove_similarity (bound to `nlp`).
# large model: en_core_web_lg
# vec for sim: en_vectors_web_lg
nlp = spacy.load("en_vectors_web_lg")

In [18]:
def _glove_pln_scores(row):
    '''(max, avg) GLoVe similarity between one row's sub-queries and its playlist name.'''
    return get_pln_similarity(row['proc_query'], row['playlist_name'], row.name, get_glove_similarity)

def _glove_token_scores(row):
    '''(max, avg) of the best per-token GLoVe similarity for one row.'''
    return get_max_token_similarity(row['proc_query'], row['playlist_name'], row.name, get_glove_similarity)

print(f'\nAdding pln_similarity\n')
df['glove_max_pln_similarity'], df['glove_avg_pln_similarity'] = zip(
    *df.apply(_glove_pln_scores, axis = 1)
)

print(f'\nAdding token_max_similarity\n')
df['glove_max_token_max_similarity'], df['glove_avg_token_max_similarity'] = zip(
    *df.apply(_glove_token_scores, axis = 1)
)

print(f'\nAdding overall_similarity\n')

df = add_model_avg_similarity(df, 'glove')

In [12]:
# Spot-check three random rows now that the GLoVe columns are present.
df.sample(3)

Unnamed: 0,query,track_id,playlist_name,search_rank,bert_max_pln_similarity,bert_avg_pln_similarity,bert_max_token_max_similarity,bert_avg_token_max_similarity,bert_overall_avg_max_similarity,bert_overall_avg_avg_similarity,...,use_dan_overall_similarity,use_overall_similarity,proc_query,glove_max_pln_similarity,glove_avg_pln_similarity,glove_max_token_max_similarity,glove_avg_token_max_similarity,glove_overall_avg_max_similarity,glove_overall_avg_avg_similarity,glove_overall_similarity
1050158,world,0Kjc4mrh7uoKPfotJGhvV1,alan jackson complete collection,466,0.784164,0.784164,0.890593,0.890593,0.837379,0.837379,...,0.367671,0.345408,[world],0.381631,0.381631,0.329549,0.329549,0.35559,0.35559,0.35559
1150884,musical,55do1f4mkLfM314tQDlyfw,chill acoustic music,981,0.853741,0.853741,0.904954,0.904954,0.879348,0.879348,...,0.568649,0.602636,[musical],0.632764,0.632764,0.724151,0.724151,0.678457,0.678457,0.678457
1352038,flute,5yaD6JCt7c0zvK4bSW5P11,flute,2304,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,[flute],1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
# Checkpoint the frame after the GLoVe step, then show its shape. `axis` is passed
# by keyword because pandas 2.0 no longer accepts it positionally in DataFrame.drop.
df.reset_index(drop = True).drop('proc_query', axis = 1).to_feather('final_uniq_plns_WITH_BERT&USE&W2V&GLOVE'); df.shape

(1402119, 34)

In [14]:
# Drop the reference to the spaCy pipeline before the FastText vectors are loaded.
del nlp

# FastText

In [8]:
def get_fasttext_similarity(query: str, pln: str):
    '''FastText similarity of `query` and `pln`, taken from the Gensim vectors currently bound to `m`.'''
    ft_score = m.similarity(query, pln)
    return ft_score

## English Wikipedia

In [9]:
# Load the English-Wikipedia FastText vectors into `m`, the module-level name
# that get_fasttext_similarity reads.
# NOTE(review): `datapath` is imported from gensim.test.utils, so the file is
# presumably expected inside gensim's bundled test-data directory - confirm.
wiki_en_path  = datapath('fasttext_pretrain/wiki_en.bin')
m = load_facebook_vectors(wiki_en_path)

In [19]:
def _ft_wiki_pln_scores(row):
    '''(max, avg) FastText (Wikipedia) similarity between one row's sub-queries and its playlist name.'''
    return get_pln_similarity(row['proc_query'], row['playlist_name'], row.name, get_fasttext_similarity)

def _ft_wiki_token_scores(row):
    '''(max, avg) of the best per-token FastText (Wikipedia) similarity for one row.'''
    return get_max_token_similarity(row['proc_query'], row['playlist_name'], row.name, get_fasttext_similarity)

print(f'\nAdding pln_similarity\n')

df['ft_wiki_max_pln_similarity'], df['ft_wiki_avg_pln_similarity'] = zip(
    *df.apply(_ft_wiki_pln_scores, axis = 1)
)

print(f'\nAdding token_max_similarity\n')

df['ft_wiki_max_token_max_similarity'], df['ft_wiki_avg_token_max_similarity'] = zip(
    *df.apply(_ft_wiki_token_scores, axis = 1)
)

print(f'\nAdding overall_similarity\n')

df = add_model_avg_similarity(df, 'ft_wiki')

In [10]:
# Spot-check three random rows now that the ft_wiki columns are present.
df.sample(3)

Unnamed: 0,query,track_id,playlist_name,search_rank,bert_max_pln_similarity,bert_avg_pln_similarity,bert_max_token_max_similarity,bert_avg_token_max_similarity,bert_overall_avg_max_similarity,bert_overall_avg_avg_similarity,...,glove_overall_avg_avg_similarity,glove_overall_similarity,proc_query,ft_wiki_max_pln_similarity,ft_wiki_avg_pln_similarity,ft_wiki_max_token_max_similarity,ft_wiki_avg_token_max_similarity,ft_wiki_overall_avg_max_similarity,ft_wiki_overall_avg_avg_similarity,ft_wiki_overall_similarity
975632,feeling good OR feeling great OR feeling happy,1rB5RVav3dKGJVwquqPXOK,good songs albums that will probably make me f...,1117,0.882575,0.817463,0.893242,0.830386,0.887909,0.823924,...,0.83437,0.845958,"[feeling good, feeling great, feeling happy]",0.676448,0.610076,0.527879,0.493758,0.602163,0.551917,0.57704
333232,forgiveness,2GbCXMnXVdPwLyJLY6hjSM,david gray please forgive me,588,0.848999,0.848999,0.859454,0.859454,0.854227,0.854227,...,0.509171,0.509171,[forgiveness],0.508786,0.508786,0.741231,0.741231,0.625009,0.625009,0.625009
387340,hypnotic,6bXSYxP6WDlNCr6Jk1H3MW,hypnotized youngboy never broke again,875,0.664611,0.664611,0.765019,0.765019,0.714815,0.714815,...,0.433202,0.433202,[hypnotic],0.440686,0.440686,0.657368,0.657368,0.549027,0.549027,0.549027


In [11]:
# Checkpoint the frame after the FastText (Wikipedia) step, then show its shape.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it positionally
# in DataFrame.drop.
df.reset_index(drop = True).drop('proc_query', axis = 1).to_feather('final_uniq_plns_WITH_BERT&USE&W2V&GLOVE&FT'); df.shape

(1402119, 41)

In [12]:
# Drop the reference to the Wikipedia FastText vectors before the Web Crawl vectors are loaded.
del m

## Web Crawl

In [10]:
# Load the Web Crawl (300d, 2M, subword) FastText vectors into `m`, the
# module-level name that get_fasttext_similarity reads.
# NOTE(review): `datapath` is imported from gensim.test.utils, so the file is
# presumably expected inside gensim's bundled test-data directory - confirm.
webcrawl_path = datapath('fasttext_pretrain/crawl-300d-2M-subword.bin')
m = load_facebook_vectors(webcrawl_path)

In [20]:
def _ft_crawl_pln_scores(row):
    '''(max, avg) FastText (Web Crawl) similarity between one row's sub-queries and its playlist name.'''
    return get_pln_similarity(row['proc_query'], row['playlist_name'], row.name, get_fasttext_similarity)

def _ft_crawl_token_scores(row):
    '''(max, avg) of the best per-token FastText (Web Crawl) similarity for one row.'''
    return get_max_token_similarity(row['proc_query'], row['playlist_name'], row.name, get_fasttext_similarity)

print(f'\nAdding pln_similarity\n')

df['ft_crawl_max_pln_similarity'], df['ft_crawl_avg_pln_similarity'] = zip(
    *df.apply(_ft_crawl_pln_scores, axis = 1)
)

print(f'\nAdding token_max_similarity\n')

df['ft_crawl_max_token_max_similarity'], df['ft_crawl_avg_token_max_similarity'] = zip(
    *df.apply(_ft_crawl_token_scores, axis = 1)
)

print(f'\nAdding overall_similarity\n')

df = add_model_avg_similarity(df, 'ft_crawl')

In [17]:
# Final checkpoint after the FastText (Web Crawl) step, then show the frame's shape.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it positionally
# in DataFrame.drop.
# NOTE(review): this writes to the same file name as the Wikipedia-only export and
# therefore overwrites it - confirm that is intended.
df.reset_index(drop = True).drop('proc_query', axis = 1).to_feather('final_uniq_plns_WITH_BERT&USE&W2V&GLOVE&FT'); df.shape

# Next Steps

 - Depending on how well an ensemble of these scores works to filter and clean the data, I might also try:
     - **ULMFiT**: fine-tuned with an entertainment / music Language Model, potentially Pitchfork or other music blogs
     - **XLNet**: although the Semantic Similarity improvements from BERT to XLNet were negligible when introducing Whole Word Masking, and as of now, a strong performing uncased version isn't available, they did use a larger vocabulary to train the model so it might yield better results with user-generated data such as I have.
     - **ELMo**: not really intended for this downstream task, but could be interesting to try.