In [163]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel #same as cosine similarity

In [112]:
# Notebook-wide lookup tables; every one starts out empty.
annotation_to_song = dict()      # annotation_id -> song_id
song_to_name = dict()            # song_id -> name of song
annotation_to_text = dict()      # annotation_id -> annotation text
annotation_to_fragment = dict()  # annotation_id -> lyric fragment

In [149]:
def create_dictionarys(json_data, annotation_to_song=None, song_to_name=None,
                       annotation_to_text=None, annotation_to_fragment=None):
    """
    Build the annotation lookup dictionaries from song data.

    Parameters
    ----------
    json_data : dict, str or None
        Either the already-parsed song data (a dict keyed by song_id), or the
        path of a JSON file holding that data. Any other value (e.g. None)
        falls back to reading "songs.json" from the working directory.
    annotation_to_song, song_to_name, annotation_to_text, annotation_to_fragment : dict, optional
        Existing dictionaries to extend in place. Entries already present are
        never overwritten. When omitted, a fresh dict is created per call.

    Returns
    -------
    tuple of four dicts
        (annotation_to_song, song_to_name, annotation_to_text,
        annotation_to_fragment), keyed by annotation_id except for
        song_to_name, which is keyed by song_id.

    Each song entry is expected to carry "full_title" and "referents"; each
    referent a "lyric" and a list of "annotations"; each annotation an "id"
    and an "annotation" text. A missing key raises KeyError.
    """
    # Fresh dicts per call: a literal {} default would be shared by every call
    # and silently accumulate entries across invocations.
    if annotation_to_song is None:
        annotation_to_song = {}
    if song_to_name is None:
        song_to_name = {}
    if annotation_to_text is None:
        annotation_to_text = {}
    if annotation_to_fragment is None:
        annotation_to_fragment = {}

    # Honour the json_data argument instead of always re-reading songs.json.
    if isinstance(json_data, dict):
        songs = json_data
    else:
        json_path = json_data if isinstance(json_data, str) else "songs.json"
        with open(json_path) as song_json:
            songs = json.load(song_json)

    #iterate through all songs and input data accordingly
    for song_id, song_data in songs.items():
        if song_id not in song_to_name:
            song_to_name[song_id] = song_data["full_title"]

        #process annotations
        for referent in song_data["referents"]:
            lyric_fragment = referent["lyric"]
            # Record every annotation of the referent (not only the last one);
            # a referent without annotations contributes nothing.
            for annotation in referent["annotations"]:
                annotation_id = annotation["id"]
                annotation_text = annotation["annotation"]
                #annotation_votes = referent["votes_total"] # here is where we would record vote numbers

                # setdefault keeps the first value seen for an annotation_id.
                annotation_to_song.setdefault(annotation_id, song_id)
                annotation_to_text.setdefault(annotation_id, annotation_text)
                annotation_to_fragment.setdefault(annotation_id, lyric_fragment)

    print("Processed {} annotations".format(len(annotation_to_text)))
    return (annotation_to_song,song_to_name,annotation_to_text,annotation_to_fragment)

In [130]:
#Create used variables and dictionaries
# NOTE(review): temp_songs is not defined in any visible cell — confirm it is
# created earlier in the notebook so this cell survives Restart & Run All.
annotation_to_song,song_to_name,annotation_to_text,annotation_to_fragment = create_dictionarys(temp_songs)

# TF-IDF model over the annotation texts: at most 5000 terms, English stop
# words removed, terms kept only if they occur in at least 10 documents and
# in no more than 80% of them; rows are l2-normalised.
vectorizer = TfidfVectorizer(max_features =  5000,
                           stop_words = "english",
                           max_df = 0.8, min_df = 10,
                          norm = 'l2')

# Dense document-term matrix; row i corresponds to the i-th annotation in
# annotation_to_text (dict order), which is the same order index_to_id uses.
tf_idf = vectorizer.fit_transform(list(annotation_to_text.values())).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    _feature_names = vectorizer.get_feature_names_out()
else:
    _feature_names = vectorizer.get_feature_names()

# Column index of tf_idf -> vocabulary term (despite the variable's name).
index_to_annotation = {i:v for i, v in enumerate(_feature_names)}
# Row index of tf_idf -> annotation_id.
index_to_id = {i:v for i, v in enumerate(list(annotation_to_text.keys()))}

Processed 4639 annotations


In [153]:
def find_most_similar(query,n_results):
    """
    Find the annotations most similar to a free-text query.

    Reads the module-level vectorizer, tf_idf, index_to_id,
    annotation_to_song, annotation_to_text, annotation_to_fragment and
    song_to_name objects; none of them is modified.

    Parameters
    ----------
    query : str
        Text to compare against the annotations.
    n_results : int
        Maximum number of annotations to return. Values larger than the
        number of indexed annotations are clamped to that number; zero or
        negative values yield an empty list.

    Returns
    -------
    list of dict
        One dict per annotation, ordered from most to least similar, with the
        keys "id", "song", "annotation", "lyric" and "similarity".
    """
    #vectorize query
    query_vector = vectorizer.transform([query])

    # linear_kernel is a plain dot product; it equals cosine similarity only
    # because the vectorizer is expected to l2-normalise its rows.
    cosine_similarities = linear_kernel(query_vector, tf_idf).flatten()

    # Clamp the request so asking for more results than there are documents
    # no longer raises IndexError.
    n_wanted = max(0, min(n_results, len(cosine_similarities)))

    # argsort is ascending, so reverse it and keep the leading n_wanted rows.
    related_docs_indices = cosine_similarities.argsort()[::-1][:n_wanted]

    #define output data
    output_array = [] #annotations sorted from most similar to least
    for index in related_docs_indices:
        _id = index_to_id[index]
        data = {} #data object contains the below fields
        data["id"] = _id
        data["song"] = song_to_name[annotation_to_song[_id]]
        data["annotation"] = annotation_to_text[_id]
        data["lyric"] = annotation_to_fragment[_id]
        data["similarity"] = cosine_similarities[index]
        output_array.append(data)

    print("Finished finding similar annotations for query: {}".format(query))
    return output_array

In [162]:
#Smoke test: show the 5 annotations closest to the query 'racism'
test_output = find_most_similar("racism",5)

# Field templates in display order; each one is filled from the matching key.
_FIELD_TEMPLATES = (
    ("\tSong: {}", "song"),
    ("\n\tLyric Fragment: {}", "lyric"),
    ("\n\tAnnotation: {}", "annotation"),
    ("\n\tSimilarity Score: {}", "similarity"),
)

for rank, data in enumerate(test_output, start=1):
    print("{}).".format(rank))
    for template, key in _FIELD_TEMPLATES:
        print(template.format(data[key]))
    print("_"*125)

Finished finding similar annotations for query: racism
1).
	Song: Heaven and Hell by Kendrick Lamar (Ft. Alori Joh)

	Lyric Fragment: Police brutality
 Kickin' ass of us Africans,

	Annotation: Over the years the LAPD have been accused of racism, brutality and corruuption, leading to the 1992 riots

	Similarity Score: 0.38293898173831237
_____________________________________________________________________________________________________________________________
2).
	Song: The Blacker the Berry by Kendrick Lamar

	Lyric Fragment: Excuse my French but fuck you — no, fuck y'all
 That's as blunt as it gets, I know you hate me, don't you?
 You hate my people, I can tell cause it's threats when I see you

	Annotation: Kendrick uses the expression “Excuse my French” to mock the audacity of racists in America. The intention of the phrase is to tone down and pardon  vulgarity – Kendrick’s usage in this context applies to how racists attempt to tone down and turn a blind eye to the prevalence of