In [7]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel #same as cosine similarity
from collections import Counter

In [8]:
#Global variables
annotation_to_song = {} # annotation_id as key and song_id as value
song_to_name = {} #song_id to name of song
annotation_to_text = {} #annotation_id to annotation text
annotation_to_fragment = {} #annotation_id to lyric fragment

with open('songs.json') as json_file:  
    all_songs = json.load(json_file)

In [9]:
def create_dictionarys(json_file="songs.json", annotation_to_song={}, song_to_name={},
                       annotation_to_text={}, annotation_to_fragment={},
                      ):
    """
    Using songs.json as json_data
    Creates annotation dictionary: {annotation_id:[song_id,fragment/text,annotation_text]}
    """
    
    #load song json file
    with open(json_file) as song_json: 
        songs = json.load(song_json)
        
        #iterate through all songs and input data accordingly
        for song_id in songs:
            song_data = songs[song_id]
            
            if song_id not in song_to_name:
                song_to_name[song_id] = song_data["full_title"]
            
            #process annotations
            for referent in song_data["referents"]:
                lyric_fragment = referent["lyric"]
                for annotation in referent["annotations"]:
                    annotation_id = annotation["id"]
                    annotation_text = annotation["annotation"]
                #annotation_votes = referent["votes_total"] # here is where we would record vote numbers
                
                if annotation_id not in annotation_to_song:
                    annotation_to_song[annotation_id] = song_id
                    
                if annotation_id not in annotation_to_text:
                    annotation_to_text[annotation_id] = annotation_text
                    
                if annotation_id not in annotation_to_fragment:
                    annotation_to_fragment[annotation_id] = lyric_fragment
                    
    print("Processed {} annotations".format(len(annotation_to_text)))
    return (annotation_to_song,song_to_name,annotation_to_text,annotation_to_fragment)

In [10]:
#Create used variables and dictionaries
annotation_to_song,song_to_name,annotation_to_text,annotation_to_fragment = create_dictionarys()
vectorizer = TfidfVectorizer(max_features =  5000,
                           stop_words = "english",
                           max_df = 0.8, min_df = 10,
                          norm = 'l2')
tf_idf = vectorizer.fit_transform(list(annotation_to_text.values())).toarray()
index_to_annotation = {i:v for i, v in enumerate(vectorizer.get_feature_names())}
index_to_id = {i:v for i, v in enumerate(list(annotation_to_text.keys()))}

Processed 4639 annotations


In [13]:
def find_most_similar(query,n_results,genres=[],start=None,end=None):
    """
    dates = yyyy-mm-dd
    finds n most similar annotations to query
    """
    #Define used global variables
    global vectorizer, tf_idf, annotation_to_text, annotation_to_song, annotation_to_fragment,song_to_name,all_songs

    #vectorize query
    query_vector = vectorizer.transform([query])    
    
    #find cosine similarities and the indices of related docs
    cosine_similarities = linear_kernel(query_vector, tf_idf).flatten()
    
    #Filter by genres TODO
#     if len(genres)>0:
#         for i,song_id in enumerate(all_songs.keys()):
#             if all_songs[song_id]["genres"] not in genres:
#                 cosine_similarities[i] = 0
                
    #Filter by time
    
    #Convert times to datetime objects
    if start:
        start = datetime.strptime(start,"%Y-%m-%d") 
    if end:
        end = datetime.strptime(end,"%Y-%m-%d") 

    invalid_songs = set() #valid song_ids
    if start or end:
        for i,song_id in enumerate(all_songs.keys()):
            #turn release_date string into datetme object, if it exists
            release_date_str = all_songs[song_id]["release_date"]
            if release_date_str:
                release_date = datetime.strptime(release_date_str,"%Y-%m-%d")     
                
                #invalidate songs outside of date range
                if (start and release_date<start) or (end and release_date>end):
                    invalid_songs.add(song_id)                    
            else:
                invalid_songs.add(song_id)
    
    #set scores of invalid songs to 0
    for i,annotation in enumerate(annotation_to_song):
        if annotation_to_song[annotation] in invalid_songs:
            cosine_similarities[i] = 0
            
    related_docs_indices = cosine_similarities.argsort()[-n_results:]
    
    
    #find highest similarity scores
    sim_scores = cosine_similarities[related_docs_indices] 
    
    #find ids of most similar annotations
    annotation_ids = [index_to_id[index] for index in related_docs_indices] #can later be used to find lyric fragment maybe
    
    
    #define output and input data
    output_array = [] #annotations sorted from most similar to least
    for i in range(1,n_results+1):
        if sim_scores[-i]>0:
            _id = annotation_ids[-i]
            song_id = annotation_to_song[_id]
            data = {} #data object contains the below fields
            data["id"] = _id
            data["song_id"] = song_id
            data["song"] = song_to_name[annotation_to_song[_id]]
            data["annotation"] = annotation_to_text[_id]
            data["lyric"] = annotation_to_fragment[_id]
            data["image"] = all_songs[song_id]["header_image_url"]
    #         print(all_songs[song_id])
            data["artist"] = all_songs[song_id]["artists_names"]
            if not all_songs[song_id]["album"] == None:
                data["album"] = all_songs[song_id]["album"]["full_title"]
            else:
                data["album"] = "No album found"
            data["similarity"] = sim_scores[-i]
            output_array.append(data)
        
    print("Finished finding similar annotations for query: {}".format(query))
    if len(output_array)==0:
        print("No Results Found")
    return output_array

In [14]:
#Testing output
test_output = find_most_similar("racism",5) #finding 5 most similar annotations for the query 'racism'
for i,data in enumerate(test_output):
    print("{}).".format(i+1))
    print("\tSong: {}".format(data["song"]))
    print("\n\tLyric Fragment: {}".format(data["lyric"]))
    print("\n\tAnnotation: {}".format(data["annotation"]))
    print("\n\tSimilarity Score: {}".format(data["similarity"]))
    print("_"*125)

Finished finding similar annotations for query: racism
1).
	Song: Heaven and Hell by Kendrick Lamar (Ft. Alori Joh)

	Lyric Fragment: Police brutality
 Kickin' ass of us Africans,

	Annotation: Over the years the LAPD have been accused of racism, brutality and corruuption, leading to the 1992 riots

	Similarity Score: 0.38293898173831237
_____________________________________________________________________________________________________________________________
2).
	Song: The Blacker the Berry by Kendrick Lamar

	Lyric Fragment: Excuse my French but fuck you — no, fuck y'all
 That's as blunt as it gets, I know you hate me, don't you?
 You hate my people, I can tell cause it's threats when I see you

	Annotation: Kendrick uses the expression “Excuse my French” to mock the audacity of racists in America. The intention of the phrase is to tone down and pardon  vulgarity – Kendrick’s usage in this context applies to how racists attempt to tone down and turn a blind eye to the prevalence of

In [None]:
# #testing some recommendation stuff
# def recommend_artists(keywords=None,current_artist=None):
#     """
#     Recommendation based on keywords and artists
#     """
#     #useful global variables
#     global all_songs
    
#     #song_id_scores to keep track of cumulative relevance for songs
#     song_scores = {}

#     #create song_id to artist_names dict
#     song_to_artist = {}
#     for song_id in all_songs:
#         if song_id not in song_to_artist:
#             song_to_artist[song_id] = all_songs[song_id]["artists_names"]
#             song_scores[song_id] = 0
#     artists_array = []
#     for artists in song_to_artist.values():
#         artists_array.append(artists)
    
#     #If no starting data, return most popular artist
#     if not keywords and not current_artist:
#         return Counter(artists_array).most_common(5)

    
#     if current_artist:
        
        
#     if keywords:
#         for k_word in keywords: #implying they're not stop words
#             most_similar_songs = find_most_similar(query,len(all_songs))
#             for i,song_data in enumerate(most_similar_songs):
#                 song_scores[song_data["id"]]+=i
                
# recommend_artists()

### Spacy Testing

In [25]:
import spacy
from collections import Counter

t1 = nlp(test_annots[0])
# doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")

for ent in t1.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

TDE 9 12 ORG
Kendrick 46 54 ORG
2012 57 61 DATE
Dre 85 88 PERSON
“The Recipe. 96 108 WORK_OF_ART
Black Hippy 110 121 WORK_OF_ART


In [61]:
#Define variables
entity_weight = {} #entity text to weight
entity_type = {} #entity text to type
annotation_to_ent_list = {} #annotation_id to list of entity objects

#define variables for processing
nlp = spacy.load("en_core_web_sm")

#Update Variables
for annotation_id in annotation_to_text:
    annotation = annotation_to_text[annotation_id]
    processed_text = nlp(annotation)
    ent_list = []
    for entity in processed_text.ents:
        entity_type[entity.text] = entity.label_
        entity_weight[entity.text] = 1
        ent_list.append(entity)
    annotation_to_ent_list[annotation_id] = ent_list

In [77]:
#Weighting to make people,organizations, work of art, date more important
for entity in entity_weight: #weighting by entity, maybe can do by entity type
    if entity_type[entity] == 'PERSON' or entity_type[entity] == 'ORG' or entity_type[entity] == 'GPE':
        entity_weight[entity] = 5
    if entity_type[entity] == 'DATE':
        entity_weight[entity] = 2
    if entity_type[entity]=='WORK_OF_ART':
        entity_weight[entity] = 4

In [80]:
print(Counter(entity_type.values()))
print(entity_type["Kendrick"])

Counter({'PERSON': 1989, 'ORG': 1660, 'WORK_OF_ART': 886, 'DATE': 754, 'GPE': 488, 'CARDINAL': 300, 'NORP': 171, 'LOC': 128, 'PRODUCT': 127, 'TIME': 123, 'FAC': 119, 'EVENT': 111, 'MONEY': 96, 'LAW': 84, 'QUANTITY': 69, 'ORDINAL': 46, 'PERCENT': 25, 'LANGUAGE': 4})
GPE


In [103]:
#Possible scoring method
def shared_entities_score(ent_list_1,ent_list_2):
    """
    Summation by weight
    """
    global entity_weight
    total = 0
    for entity in ent_list_1:
        if entity in ent_list_2:
            total+=entity_weight[entity.text]
    return total

def find_similar_by_entity(query,n_results):
    """
    Search for annotations based on entities
    """
    global annotation_to_ent_list,annotation_to_text
    annot_to_ent = annotation_to_ent_list
    
    query_ent_list = [entity for entity in nlp(query).ents] #should find way to handle unseen entities
    scores = []
    for annotation in annot_to_ent:
        scores.append(shared_entities_score(annot_to_ent[annotation],query_ent_list))
    scores = np.array(scores)
    related_docs_indices = scores.argsort()[-n_results:]
    
    #find highest similarity scores
    sim_scores = scores[related_docs_indices] 
    
    #find similar annotations
    annotations = np.array(list(annot_to_ent.keys()))
    annotation_ids = annotations[related_docs_indices]
    
    
    #define output and input data
    output_array = [] #annotations sorted from most similar to least
    for i in range(1,n_results+1):
        if sim_scores[-i]>0:
            _id = annotation_ids[-i]
            song_id = annotation_to_song[_id]
            data = {} #data object contains the below fields
            data["id"] = _id
            data["song_id"] = song_id
            data["song"] = song_to_name[annotation_to_song[_id]]
            data["annotation"] = annotation_to_text[_id]
            data["lyric"] = annotation_to_fragment[_id]
            data["image"] = all_songs[song_id]["header_image_url"]
    #         print(all_songs[song_id])
            data["artist"] = all_songs[song_id]["artists_names"]
            if not all_songs[song_id]["album"] == None:
                data["album"] = all_songs[song_id]["album"]["full_title"]
            else:
                data["album"] = "No album found"
            data["similarity"] = sim_scores[-i]
            output_array.append(data)
        
    print("Finished finding similar annotations for query: {}".format(query))
    if len(output_array)==0:
        print("No Results Found")
    return output_array

In [105]:
for annotation_id in annotation_to_text:
    break
testq = annotation_to_text[annotation_id]
print(testq)
# find_similar_by_entity(testq,5)

A nod to TDE’s success as a label, as well as Kendrick’s 2012 collaboration with Dr. Dre titled “The Recipe.” Black Hippy featured on the remix.


In [107]:
#Testing output
test_output = find_similar_by_entity(testq,5) #finding 5 most similar annotations for the query 'racism'
for i,data in enumerate(test_output):
    print("{}).".format(i+1))
    print("\tSong: {}".format(data["song"]))
    print("\n\tLyric Fragment: {}".format(data["lyric"]))
    print("\n\tAnnotation: {}".format(data["annotation"]))
    print("\n\tSimilarity Score: {}".format(data["similarity"]))
    print("_"*125)

Finished finding similar annotations for query: A nod to TDE’s success as a label, as well as Kendrick’s 2012 collaboration with Dr. Dre titled “The Recipe.” Black Hippy featured on the remix.
1).
	Song: Vice City (Freestyle) by Kendrick Lamar (Ft. Jay Rock)

	Lyric Fragment: You see me on TV thinking "oh, TDE got the recipe"

	Annotation: A nod to TDE’s success as a label, as well as Kendrick’s 2012 collaboration with Dr. Dre titled “The Recipe.” Black Hippy featured on the remix.

	Similarity Score: 22
_____________________________________________________________________________________________________________________________
2).
	Song: ​u by Kendrick Lamar

	Lyric Fragment: [Verse 2: Kendrick Lamar]

	Annotation: In order to convey the emotion of this verse, Kendrick uses his tearful delivery, which he first utilized on “His Pain II.” This inflection may have been first inspired by Game’s from “Start From Scratch” on The Documentary.

Also, notice that Kendrick mispronounces his lin