In [37]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel #same as cosine similarity
from collections import Counter
from datetime import datetime

In [17]:
#Global variables
annotation_to_song = {} # annotation_id as key and song_id as value
song_to_name = {} #song_id to name of song
annotation_to_text = {} #annotation_id to annotation text
annotation_to_fragment = {} #annotation_id to lyric fragment

with open('songs.json') as json_file:  
    all_songs = json.load(json_file)
    

In [18]:
keys = all_songs.keys()
for key in all_songs.keys():
    first_song = all_songs[key]
    print(first_song)
    break

{'release_date': '2016-12-21', 'pyongs_count': 1, 'full_title': 'Vice City (Freestyle) by\xa0Kendrick\xa0Lamar (Ft.\xa0Jay\xa0Rock)', 'artists_names': 'Kendrick Lamar', 'producer_artists': [{'image_url': 'https://assets.genius.com/images/default_avatar_300.png?1554911127', 'is_verified': False, 'id': 1023818, 'is_meme_verified': False, 'api_path': '/artists/1023818', 'header_image_url': 'https://assets.genius.com/images/default_avatar_300.png?1554911127', 'name': 'Cardo & Yung Exclusive', 'url': 'https://genius.com/artists/Cardo-and-yung-exclusive'}], 'description': '?', 'title': 'Vice City (Freestyle)', 'writer_artists': [{'image_url': 'https://images.genius.com/e1cbef603f6c11cee182adcd55a78a87.421x421x1.jpg', 'is_verified': True, 'id': 1403, 'is_meme_verified': False, 'api_path': '/artists/1403', 'header_image_url': 'https://images.genius.com/342a36ab4005da9cfd3b0292f7a748a1.1000x600x1.jpg', 'name': 'Jay Rock', 'url': 'https://genius.com/artists/Jay-rock', 'iq': 11993}, {'image_url':

In [19]:
def create_dictionarys(json_file="songs.json", annotation_to_song={}, song_to_name={},
                       annotation_to_text={}, annotation_to_fragment={},
                      ):
    """
    Using songs.json as json_data
    Creates annotation dictionary: {annotation_id:[song_id,fragment/text,annotation_text]}
    """
    
    #load song json file
    with open(json_file) as song_json: 
        songs = json.load(song_json)
        
        #iterate through all songs and input data accordingly
        for song_id in songs:
            song_data = songs[song_id]
            
            if song_id not in song_to_name:
                song_to_name[song_id] = song_data["full_title"]
                if "king kunta" in song_data["full_title"].lower():
                    print(song_data["full_title"])
            
            #process annotations
            for referent in song_data["referents"]:
                lyric_fragment = referent["lyric"]
                for annotation in referent["annotations"]:
                    annotation_id = annotation["id"]
                    annotation_text = annotation["annotation"]
                
                    annotation_votes = annotation["votes_total"] # here is where we would record vote numbers

                    if annotation_votes >= 1:
                        if annotation_id not in annotation_to_song:
                            annotation_to_song[annotation_id] = song_id

                        if annotation_id not in annotation_to_text:
                            annotation_to_text[annotation_id] = annotation_text

                        if annotation_id not in annotation_to_fragment:
                            annotation_to_fragment[annotation_id] = lyric_fragment
                    
    print("Processed {} annotations".format(len(annotation_to_text)))
    return (annotation_to_song,song_to_name,annotation_to_text,annotation_to_fragment)

In [20]:
#Create used variables and dictionaries
annotation_to_song,song_to_name,annotation_to_text,annotation_to_fragment = create_dictionarys()
vectorizer = TfidfVectorizer(max_features =  5000,
                           stop_words = "english",
                           max_df = 0.8, min_df = 10,
                          norm = 'l2')
tf_idf = vectorizer.fit_transform(list(annotation_to_text.values())).toarray()
index_to_annotation = {i:v for i, v in enumerate(vectorizer.get_feature_names())}
index_to_id = {i:v for i, v in enumerate(list(annotation_to_text.keys()))}

King Kunta by Kendrick Lamar
Processed 3887 annotations


In [71]:
tt = "2017-05-12"
tt2 = "2012-10-20"
d = datetime.strptime(tt, "%Y-%m-%d")
d2 = datetime.strptime(tt, "%Y-%m-%d")
print(d<d2)

False


In [100]:
def find_most_similar(query,n_results,genres=[],start=None,end=None):
    """
    dates = yyyy-mm-dd
    finds n most similar annotations to query
    """
    #Define used global variables
    global vectorizer, tf_idf, annotation_to_text, annotation_to_song, annotation_to_fragment,song_to_name,all_songs

    #vectorize query
    query_vector = vectorizer.transform([query])    
    
    #find cosine similarities and the indices of related docs
    cosine_similarities = linear_kernel(query_vector, tf_idf).flatten()
    
    #Filter by genres TODO
#     if len(genres)>0:
#         for i,song_id in enumerate(all_songs.keys()):
#             if all_songs[song_id]["genres"] not in genres:
#                 cosine_similarities[i] = 0
                
    #Filter by time
    
    #Convert times to datetime objects
    if start:
        start = datetime.strptime(start,"%Y-%m-%d") 
    if end:
        end = datetime.strptime(end,"%Y-%m-%d") 

    invalid_songs = set() #valid song_ids
    if start or end:
        for i,song_id in enumerate(all_songs.keys()):
            #turn release_date string into datetme object, if it exists
            release_date_str = all_songs[song_id]["release_date"]
            if release_date_str:
                release_date = datetime.strptime(release_date_str,"%Y-%m-%d")     
                
                #invalidate songs outside of date range
                if (start and release_date<start) or (end and release_date>end):
                    invalid_songs.add(song_id)                    
            else:
                invalid_songs.add(song_id)
    
    #set scores of invalid songs to 0
    for i,annotation in enumerate(annotation_to_song):
        if annotation_to_song[annotation] in invalid_songs:
            cosine_similarities[i] = 0
            
    related_docs_indices = cosine_similarities.argsort()[-n_results:]
    
    
    #find highest similarity scores
    sim_scores = cosine_similarities[related_docs_indices] 
    
    #find ids of most similar annotations
    annotation_ids = [index_to_id[index] for index in related_docs_indices] #can later be used to find lyric fragment maybe
    
    
    #define output and input data
    output_array = [] #annotations sorted from most similar to least
    for i in range(1,n_results+1):
        if sim_scores[-i]>0:
            _id = annotation_ids[-i]
            song_id = annotation_to_song[_id]
            data = {} #data object contains the below fields
            data["id"] = _id
            data["song_id"] = song_id
            data["song"] = song_to_name[annotation_to_song[_id]]
            data["annotation"] = annotation_to_text[_id]
            data["lyric"] = annotation_to_fragment[_id]
            data["image"] = all_songs[song_id]["header_image_url"]
    #         print(all_songs[song_id])
            data["artist"] = all_songs[song_id]["artists_names"]
            if not all_songs[song_id]["album"] == None:
                data["album"] = all_songs[song_id]["album"]["full_title"]
            else:
                data["album"] = "No album found"
            data["similarity"] = sim_scores[-i]
            output_array.append(data)
        
    print("Finished finding similar annotations for query: {}".format(query))
    if len(output_array)==0:
        print("No Results Found")
    return output_array

In [103]:
#Testing output
test_output = find_most_similar("ribbon in the sky",5,start="2018-05-12",end="2017-07-07") #finding 5 most similar annotations for the query 'racism'
for i,data in enumerate(test_output):
    print("{}).".format(i+1))
    print("\tSong: {}".format(data["song"]))
    print("\tDate: {}".format(all_songs[data["song_id"]]["release_date"]))
    print("\n\tLyric Fragment: {}".format(data["lyric"]))
    print("\n\tAnnotation: {}".format(data["annotation"]))
    print("\n\tSimilarity Score: {}".format(data["similarity"]))
    print("_"*125)

Finished finding similar annotations for query: ribbon in the sky
No Results Found


In [23]:
#testing some recommendation stuff
def recommend_artists(keywords=None,current_artist=None):
    """
    Recommendation based on keywords and artists
    """
    #useful global variables
    global all_songs
    
    #song_id_scores to keep track of cumulative relevance for songs
    song_scores = {}

    #create song_id to artist_names dict
    song_to_artist = {}
    for song_id in all_songs:
        if song_id not in song_to_artist:
            song_to_artist[song_id] = all_songs[song_id]["artists_names"]
            song_scores[song_id] = 0
    artists_array = []
    for artists in song_to_artist.values():
        artists_array.append(artists)
    
    #If no starting data, return most popular artist
    if not keywords and not current_artist:
        return Counter(artists_array).most_common(5)

    
    if current_artist:
        
        
    if keywords:
        for k_word in keywords: #implying they're not stop words
            most_similar_songs = find_most_similar(query,len(all_songs))
            for i,song_data in enumerate(most_similar_songs):
                song_scores[song_data["id"]]+=i
                
recommend_artists()

IndentationError: expected an indented block (<ipython-input-23-17b79abf20b2>, line 30)