In [1]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel #same as cosine similarity
from collections import Counter

In [2]:
#Global variables
annotation_to_song = {} # annotation_id as key and song_id as value
song_to_name = {} #song_id to name of song
annotation_to_text = {} #annotation_id to annotation text
annotation_to_fragment = {} #annotation_id to lyric fragment

with open('songs.json') as json_file:  
    all_songs = json.load(json_file)
    

In [3]:
keys = all_songs.keys()
for key in all_songs.keys():
    first_song = all_songs[key]
    print(first_song)
    break

{'referents': [{'url': 'https://genius.com/11691116/Kendrick-lamar-yah/My-girl-told-me-dont-let-these-hoes-get-in-my-head-my-world-been-ecstatic-i-checked-the-signal-that-read', 'annotations': [{'id': 11691116, 'annotation': 'Kendrick’s girl is Whitney Alford, his high school sweetheart and fiancé. She reminds him not to let women manipulate him, since he let it happen before and it had a negative outcome—he got beat up.\n\nHe plays on the word ecstatic and the word “buzz” in the chorus. Static can occur when electric charges accumulate on the body, creating a shock or buzz when coming into contact with other materials.\n\n“Checked the signal that read,” ties into the chorus  where signals are detected by his radar. It’s also a play on red lights at traffic signals, which ties back to the car in the previous lines.', 'votes_total': 45, 'verified': False}], 'lyric': "My girl told me don't let these hoes get in my head\n My world been ecstatic, I checked the signal that read—", 'id': 116

In [4]:
def create_dictionarys(json_file="songs.json", annotation_to_song={}, song_to_name={},
                       annotation_to_text={}, annotation_to_fragment={},
                      ):
    """
    Using songs.json as json_data
    Creates annotation dictionary: {annotation_id:[song_id,fragment/text,annotation_text]}
    """
    
    #load song json file
    with open(json_file) as song_json: 
        songs = json.load(song_json)
        
        #iterate through all songs and input data accordingly
        for song_id in songs:
            song_data = songs[song_id]
            
            if song_id not in song_to_name:
                song_to_name[song_id] = song_data["full_title"]
                if "king kunta" in song_data["full_title"].lower():
                    print(song_data["full_title"])
            
            #process annotations
            for referent in song_data["referents"]:
                lyric_fragment = referent["lyric"]
                for annotation in referent["annotations"]:
                    annotation_id = annotation["id"]
                    annotation_text = annotation["annotation"]
                
                    annotation_votes = annotation["votes_total"] # here is where we would record vote numbers

                    if annotation_votes >= 1:
                        if annotation_id not in annotation_to_song:
                            annotation_to_song[annotation_id] = song_id

                        if annotation_id not in annotation_to_text:
                            annotation_to_text[annotation_id] = annotation_text

                        if annotation_id not in annotation_to_fragment:
                            annotation_to_fragment[annotation_id] = lyric_fragment
                    
    print("Processed {} annotations".format(len(annotation_to_text)))
    return (annotation_to_song,song_to_name,annotation_to_text,annotation_to_fragment)

In [5]:
#Create used variables and dictionaries
annotation_to_song,song_to_name,annotation_to_text,annotation_to_fragment = create_dictionarys()
vectorizer = TfidfVectorizer(max_features =  5000,
                           stop_words = "english",
                           max_df = 0.8, min_df = 10,
                          norm = 'l2')
tf_idf = vectorizer.fit_transform(list(annotation_to_text.values())).toarray()
index_to_annotation = {i:v for i, v in enumerate(vectorizer.get_feature_names())}
index_to_id = {i:v for i, v in enumerate(list(annotation_to_text.keys()))}

King Kunta by Kendrick Lamar
Processed 3887 annotations


In [8]:
def find_most_similar(query,n_results):
    """
    finds n most similar annotations to query
    """
    #Define used global variables
    global vectorizer, tf_idf, annotation_to_text, annotation_to_song, annotation_to_fragment,song_to_name

    #vectorize query
    query_vector = vectorizer.transform([query])    
    
    #find cosine similarities and the indices of related docs
    cosine_similarities = linear_kernel(query_vector, tf_idf).flatten()
    related_docs_indices = cosine_similarities.argsort()[-n_results:]
    
    
    #find highest similarity scores
    sim_scores = cosine_similarities[related_docs_indices] 
    
    #find ids of most similar annotations
    annotation_ids = [index_to_id[index] for index in related_docs_indices] #can later be used to find lyric fragment maybe
    
    
    #define output and input data
    output_array = [] #annotations sorted from most similar to least
    for i in range(1,n_results+1):
        _id = annotation_ids[-i]
        song_id = annotation_to_song[_id]
        data = {} #data object contains the below fields
        data["id"] = _id
        data["song"] = song_to_name[annotation_to_song[_id]]
        data["annotation"] = annotation_to_text[_id]
        data["lyric"] = annotation_to_fragment[_id]
        data["image"] = all_songs[song_id]["header_image_url"]
        print(all_songs[song_id])
        data["artist"] = all_songs[song_id]["artists_names"]
        if not all_songs[song_id]["album"] == None:
            data["album"] = all_songs[song_id]["album"]["full_title"]
        else:
            data["album"] = "No album found"
        data["similarity"] = sim_scores[-i]
        output_array.append(data)
        
    print("Finished finding similar annotations for query: {}".format(query))
    return output_array

In [9]:
#Testing output
test_output = find_most_similar("ribbon in the sky",5) #finding 5 most similar annotations for the query 'racism'
for i,data in enumerate(test_output):
    print("{}).".format(i+1))
    print("\tSong: {}".format(data["song"]))
    print("\n\tLyric Fragment: {}".format(data["lyric"]))
    print("\n\tAnnotation: {}".format(data["annotation"]))
    print("\n\tSimilarity Score: {}".format(data["similarity"]))
    print("_"*125)

{'referents': [{'url': 'https://genius.com/1021940/Kendrick-lamar-blow-my-high-members-only/Im-sending-him-a-four-page-letter-and-i-enclosed-it-with-a-kiss-a-kiss-and-when-i-write-him-he-better-get-it-on-time', 'annotations': [{'id': 1021940, 'annotation': 'This is a sample from Aaliyah’s hit song ‘4 Page Letter," on her album One in a Million released in 1996.', 'votes_total': 71, 'verified': False}], 'lyric': "I'm sending him a four-page letter\n And I enclosed it with a kiss (a kiss)\n And when I write him, he better\n Get it on time", 'id': 1021940}, {'url': 'https://genius.com/962161/Kendrick-lamar-blow-my-high-members-only/Look-at-my-life-and-look-at-yours-get-some-ambition-why-you-bored', 'annotations': [{'id': 962161, 'annotation': 'Look at how great Kendrick life is, then look how crappy yours is. Kendrick is telling people to get some ambition in their lives so they can have a life like his.', 'votes_total': 88, 'verified': False}], 'lyric': 'Look at my life, and look at your

KeyError: 'artist_names'

In [186]:
#testing some recommendation stuff
def recommend_artists(keywords=None,current_artist=None):
    """
    Recommendation based on keywords and artists
    """
    #useful global variables
    global all_songs
    
    #song_id_scores to keep track of cumulative relevance for songs
    song_scores = {}

    #create song_id to artist_names dict
    song_to_artist = {}
    for song_id in all_songs:
        if song_id not in song_to_artist:
            song_to_artist[song_id] = all_songs[song_id]["artists_names"]
            song_scores[song_id] = 0
    artists_array = []
    for artists in song_to_artist.values():
        artists_array.append(artists)
    
    #If no starting data, return most popular artist
    if not keywords and not current_artist:
        return Counter(artists_array).most_common(5)

    
    if current_artist:
        
        
    if keywords:
        for k_word in keywords: #implying they're not stop words
            most_similar_songs = find_most_similar(query,len(all_songs))
            for i,song_data in enumerate(most_similar_songs):
                song_scores[song_data["id"]]+=i
                
recommend_artists()

[('Kendrick Lamar', 351)]