# Find Cosine Similarity Between Artists

### TODO

* Maybe: Latent semantic analysis
* Tune and evaluate hyperparameters

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

%matplotlib inline

In [2]:
# Read the tracks CSV and keep only the four columns used in this notebook. The lyrics are
# left as raw text at this point: tokenising is done inside the TfidfVectorizer so that it
# is applied consistently.
tracks_by_artist = pd.read_csv("./data/complete_tracks_with_lyrics.csv")[
    ["artist", "album", "track", "lyrics"]
]

In [3]:
# Display the first five rows of the selected columns.
tracks_by_artist.head()

Unnamed: 0,artist,album,track,lyrics
0,BROCKHAMPTON,iridescence,WEIGHT,[Verse 1: Kevin Abstact]\nThey split my world ...
1,BROCKHAMPTON,iridescence,VIVID,"[Intro: Matt Champion]\n""Yo, get—[censored]—tu..."
2,BROCKHAMPTON,iridescence,TAPE,"[Verse 1: Kevin Abstract]\nI can barely rap, I..."
3,BROCKHAMPTON,Saturation III,STAINS,[Verse 1: Ameer Vann]\nI spent like a year and...
4,BROCKHAMPTON,iridescence,DISTRICT,"[Intro]\n""I'm Sammy Jo, and my favorite colors..."


In [4]:
# Helper functions

def tokenizer(raw_lyrics):
    """Split raw lyrics text into a list of lower-cased word tokens.

    Some choices here are specific to the format of Genius lyrics: text in square brackets
    (non-vocalised section headers) and text in round brackets (e.g. adlibs) is removed, as
    is every character that is not a word character, an apostrophe or whitespace.

    Note that ``.`` does not match a newline, so a bracketed span is only removed as a unit
    when its opening and closing bracket are on the same line; otherwise just the bracket
    characters themselves are stripped as punctuation.

    Parameters
    ----------
    raw_lyrics : str
        The lyrics of one track.

    Returns
    -------
    list of str
        Lower-cased tokens in their original order.
    """
    # Raw string literal: the pattern is unchanged, but sequences such as "\(" and "\w" are
    # invalid string escapes in a normal literal and trigger a DeprecationWarning
    # (SyntaxWarning from Python 3.12).
    cleaned = re.sub(r"([\(\[].*?[\)\]])|([^\w\d'\s]+)", "", raw_lyrics)
    # str.split() with no argument splits on any run of whitespace, newlines included, so no
    # separate newline-to-space substitution is needed. Case is ignored by lower-casing.
    return cleaned.lower().split()

def normalise_vector(vector):
    """Scale a 1-D vector to unit Euclidean length.

    Parameters
    ----------
    vector : array-like
        1-D numeric vector.

    Returns
    -------
    numpy.ndarray
        ``vector`` divided by its Euclidean norm. A vector whose norm is zero has no
        direction, so it is returned as an all-zero float array instead of being divided by
        zero (which would produce NaNs and a RuntimeWarning).
    """
    norm = np.sqrt(np.dot(vector, vector))
    if norm == 0:
        return np.zeros_like(vector, dtype=float)
    return vector / norm

def add_vectors(vectors):
    """Sum a collection of vectors element-wise and normalise the total.

    ``vectors`` is summed along axis 0 and the resulting vector is passed through
    ``normalise_vector``.
    """
    summed = np.sum(vectors, axis=0)
    return normalise_vector(summed)

def ranked_words_by_weighting(vector, top_n, feature_names=None):
    """Return the ``top_n`` (term, weight) pairs of ``vector`` with the largest weights.

    Parameters
    ----------
    vector : sequence of float
        One weight per vocabulary term, in the same order as the feature names.
    top_n : int
        Number of pairs to return.
    feature_names : sequence of str, optional
        Terms aligned with ``vector``. When omitted they are read from the module-level
        ``vectorizer``, which must already be fitted.

    Returns
    -------
    list of (str, float)
        Pairs sorted by descending weight; ties keep vocabulary order.
    """
    if feature_names is None:
        # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
        # of get_feature_names_out(); prefer the new accessor and fall back for old releases.
        get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
        feature_names = get_names()
    return sorted(zip(feature_names, vector), key=lambda pair: pair[1], reverse=True)[:top_n]

def most_representative_songs(vector, tracks_by_artist, top_n):
    """Return the ``top_n`` tracks closest to ``vector`` by cosine distance.

    A copy of ``tracks_by_artist`` (which needs a "normalised_vectors" column) gets a "dist"
    column holding the cosine distance between ``vector`` and each row's vector; the rows are
    then ordered by ascending distance. The frame passed in is not modified.
    """
    def distance_to_query(target):
        return cosine(vector, target)

    scored = tracks_by_artist.copy()
    scored["dist"] = scored["normalised_vectors"].apply(distance_to_query)
    ranked = scored.sort_values("dist")
    return ranked[:top_n]

def most_similar_artists(vector, artist_vectors, top_n):
    """Return ``top_n`` artists ranked by cosine distance to ``vector``, skipping the closest.

    ``artist_vectors`` is turned into a frame with ``reset_index()`` (so its index becomes a
    column), a "dist" column with the cosine distance to ``vector`` is added, and the rows
    are ordered by ascending distance.
    """
    candidates = artist_vectors.reset_index()
    distances = candidates["normalised_vectors"].apply(lambda target: cosine(vector, target))
    ranked = candidates.assign(dist=distances).sort_values("dist")
    # The first ranked row is dropped - presumably because the query vector is itself one of
    # the artist vectors and so ranks first at distance ~0 (TODO confirm for other callers).
    return ranked[1:top_n + 1]

In [5]:
# Grid of TfidfVectorizer document-frequency cut-offs to compare: every pairing of max_df in
# (1.0, 0.7) with min_df in (1, 0.001). The first entry, max_df=1.0 with min_df=1, is the
# default setting. min_df=1 is deliberately an int, distinct from the float 0.001.
# (The misspelt name is kept because the next cell refers to it.)
hyperparamters = [
    {"max_df": max_df, "min_df": min_df}
    for max_df in (1.0, 0.7)
    for min_df in (1, 0.001)
]

In [6]:
# Refit the TF-IDF model once per hyperparameter setting and print, for one tracked artist,
# the top-weighted terms, the closest tracks and the closest other artists.
for hp in hyperparamters:

    # Deliberately a module-level name: ranked_words_by_weighting() reads the fitted
    # `vectorizer` to look up the vocabulary.
    vectorizer = TfidfVectorizer(
        # Passed as a list: recent scikit-learn releases validate that stop_words is
        # "english", a list or None, and reject the frozenset itself.
        stop_words=list(ENGLISH_STOP_WORDS),
        lowercase=False,  # the custom tokenizer already lower-cases
        max_df=hp["max_df"],
        min_df=hp["min_df"],
        ngram_range=(1,3),
        tokenizer=tokenizer,
        norm=None  # vectors are normalised explicitly below
    )

    tfidf_matrix = vectorizer.fit_transform(tracks_by_artist["lyrics"].values).toarray()

    tracks_by_artist_copy = tracks_by_artist.copy()
    tracks_by_artist_copy["unnormalised_vectors"] = list(tfidf_matrix)
    # Found case where unnormalised vector is zero vector, in this case an Eminem skit.
    # Drop such rows *before* normalising so a zero vector is never divided by its zero norm.
    # TF-IDF weights are non-negative, so "any non-zero entry" is the same test as "sum != 0".
    tracks_by_artist_copy = tracks_by_artist_copy[tfidf_matrix.any(axis=1)].copy()
    tracks_by_artist_copy["normalised_vectors"] = tracks_by_artist_copy["unnormalised_vectors"].apply(normalise_vector)

    # One unit vector per artist: the normalised sum of that artist's track vectors.
    artist_vectors = (
        tracks_by_artist_copy
        .groupby("artist")
        .normalised_vectors
        .apply(add_vectors)
    )

    for artist in artist_vectors.index.values:
        # .loc replaces the deprecated .ix indexer (removed in pandas 1.0) for label lookup.
        artist_vector = artist_vectors.loc[artist]
        if artist == "Lil Uzi Vert": # Example to track for now
            print("max_df=", hp["max_df"], "min_df=", hp["min_df"])
            print(artist)
            print("Most important words")
            print(ranked_words_by_weighting(artist_vector, 3))
            print("Most representative songs")
            print(most_representative_songs(artist_vector, tracks_by_artist_copy, 3)[["artist", "album", "track", "dist"]])
            print("Most similar artsts")
            print(most_similar_artists(artist_vector, artist_vectors, 3)[["artist", "dist"]])
            print('--------------------\n')

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


max_df= 1.0 min_df= 1
Lil Uzi Vert
Most important words
[('yeah', 0.14462177358038378), ('diamonds', 0.1357562179656025), ('yah', 0.12641213771053364)]
Most representative songs
           artist          album          track      dist
189  Lil Uzi Vert  Luv Is Rage 2  XO TOUR Llif3  0.577473
198  Lil Uzi Vert  Luv Is Rage 2              X  0.577473
180  Lil Uzi Vert  Luv Is Rage 2    Pretty Mami  0.736155
Most similar artsts
         artist      dist
13           NF  0.802808
8   Kodak Black  0.803255
15         Russ  0.807117
--------------------

max_df= 1.0 min_df= 0.001
Lil Uzi Vert
Most important words
[('yeah', 0.14462177358038378), ('diamonds', 0.1357562179656025), ('yah', 0.12641213771053364)]
Most representative songs
           artist          album          track      dist
189  Lil Uzi Vert  Luv Is Rage 2  XO TOUR Llif3  0.577473
198  Lil Uzi Vert  Luv Is Rage 2              X  0.577473
180  Lil Uzi Vert  Luv Is Rage 2    Pretty Mami  0.736155
Most similar artsts
         a