In [1]:
import numpy as np
import pandas as pd
import math
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize
from nltk.stem import PorterStemmer
porter = PorterStemmer()


In [2]:
def _singer_song_dic(data):
    '''Build singer-song dictionay
    
    Dictionary format:
    {singer_name: {song_name: lyrics}}
    '''
    
    singer_songs = {}
    
    for i in range(len(data)):
        singer = data.iloc[i]['artist']
        song = data.iloc[i]['song']
        lyric = data.iloc[i]['lyrics']
        
        
        if singer not in singer_songs:
            temp = {song: lyric}
            singer_songs[singer] = temp
        else:
            singer_songs[singer][song] = lyric
            
    return singer_songs

In [3]:
# This function is tested
def _update_inverted_index(name, lyrics, inverted_index):
    '''Create inverted index, count doc vector length

    Read contents form file, remove punctuation and stopwords to get terms.
    Count tf of this doc, then update inverted index.
    
    inverted_index
    '''
    
    indices = {}
    punctuation = re.compile(r'[^\w\s\']')
    
    ###
    # Count term frequency
    ###
    stop_words = set(stopwords.words('english'))
    lyrics_split = re.sub(punctuation, ' ', lyrics.lower()).split()
    
    for term in lyrics_split:
        if term in stop_words:
            continue
        elif term in indices:
            indices[term] += 1
        else:
            indices[term] = 1
    
    ###
    # Update inverted_index
    ###
    for term, frequency in indices.items():
        if term in inverted_index:
            posting = inverted_index[term]
            posting.append((name, frequency))
            inverted_index[term] = posting
        else:
            inverted_index[term] = [(name, frequency)]

In [4]:
def _compute_weight(tf, df, n_songs):
    
    idf = math.log(n_songs / df)

    tf_normalized = 1 + math.log(tf)

    weight = tf_normalized * idf
    return weight

In [5]:
def _build_song_vector(inverted_index, n_songs):
    '''Build term-weight vector for songs
    
    Compute the tf-idf weight, {term: weight}
    
    return:
        dic: a dictionary which format is {song: {term: weight}}
    '''
    
    song_vectors = {}
    
    for term, posting in inverted_index.items():
        df = len(posting)
        for pair in posting:
            song, tf = pair
            weight = _compute_weight(tf, df, n_songs)
            if song not in song_vectors:  # Create song vec
                song_vectors[song] = {term: weight}
            else:
                song_vectors[song][term] = weight  # Add new term into vec
                
    return song_vectors

In [6]:
def _get_top_songs(inverted_index, terms):
    '''Count rep terms' frequency, use that to pick rep songs
    
    Args:
        inverted_index: inverted index of this singer's songs
            format => {term: [(song, tf),]}
        terms: this singer's representive terms
    Return:
        top_songs(list): a list that contains the name of songs, order by score
    '''
    
    song_scores = {}
    for term in terms:
        posting = inverted_index[term]
        for song, _ in posting:
            if song in song_scores:
                song_scores[song] += 1
            else:
                song_scores[song] = 1
                
    top_songs = sorted(song_scores.items(), key=lambda kv: kv[1], reverse=True)
    top_songs = [name for name, _ in top_songs]
    return top_songs

In [7]:

def _find_rep_songs(songs, num_of_songs, num_of_terms):
    '''Find the most representive songs of this singer
    
    Choose songs by compare the scores which is sum the weight of each important term in that song.
    
    Args:
        songs(dic): a dictionary {song: lyrics}
        num_of_songs: number of representive songs you want to choose
        num_of_terms: number of important words we want to score the songs
        
    Return:
        rep_songs(dic): a dictionary {song: lyrics}, size will less or equal to num_of_songs
    '''
    
    inverted_index = {}
    
    n_songs = len(songs)
    if n_songs <= num_of_songs:  # Do not need to choose if not has enough songs
        return songs
    
    # Build inverted index
    for name, lyrics in songs.items():
        _update_inverted_index(name, lyrics, inverted_index)
        
    # build song vector
    song_vectors = _build_song_vector(inverted_index, n_songs)
    
    # sort by weight, get top words (put them into a set)
    selected_terms = set()
    
    for song, vector in song_vectors.items():
        sorted_v = sorted(vector.items(), key=lambda kv: kv[1], reverse=True)
        select_range = sorted_v[:num_of_terms + 1]
        for term, _ in select_range:
            selected_terms.add(term)
    
    # get score of each song
    song_sorted = _get_top_songs(inverted_index, selected_terms)
    selected_songs = song_sorted[:num_of_songs + 1]
    top_songs = {}
    for name, lyrics in songs.items():
        if name in selected_songs:
            top_songs[name] = lyrics
    
    return top_songs

In [8]:
def get_rep_songs(num_of_songs, num_of_terms, file_path):
    '''Get each singer's representive songs
    
    Args: 
        num_of_songs: number of songs you want to get from each singer
        file_path: path of input data file
    Return:
        dic: A dictionary which format is {singer, {song: lyrics}}
    '''
    
    # Read data from file
    data = pd.read_csv(file_path)
    singer_songs = _singer_song_dic(data)
    
    representive_songs = {}  # return dic
    for singer, songs in singer_songs.items():
        rep = _find_rep_songs(songs, num_of_songs, num_of_terms)
        representive_songs[singer] = rep
    
    with open('top_songs.pickle', 'wb') as handle:
        pickle.dump(representive_songs, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
        
    return representive_songs

In [9]:
# TESTing part
files = 'mylyrics00.csv'
rep_data = get_rep_songs(20, 50, files)

In [10]:
import pandas as pd

data = pd.read_csv('mylyrics00.csv')
singer = (list(rep_data.keys()))
print(singer)

lyrics = []
for i in range(len(singer)):
    sing = singer[i]
    temp = ""
    song = list(rep_data[sing].keys())
    for j in range(len(song)):
        temp = temp + " " + rep_data[sing][song[j]]
    lyrics.append(temp)
    

['eminem', 'ghostface-killah', '50-cent', 'dr-dre', 'drake', '2pac', 'beyonce-knowles', 'ariana-grande', 'chris-brown', 'glen-campbell', 'ciara', 'ed-sheeran', 'garth-brooks', 'billy-ray-cyrus', 'brad-paisley', 'bryan-adams', 'eddy-arnold', 'beatles', 'coldplay', 'the-calling', 'the-doors']


In [11]:
print(len(rep_data['eminem'].values()))

21


In [12]:
print(len(lyrics[0]))

158806


In [13]:
porter.stem('going')

'go'

In [14]:
from nltk.tokenize.regexp import regexp_tokenize
from tqdm import tqdm_notebook

test_list = []
#for i in range(len(lyrics)):
for i in tqdm_notebook(lyrics):
    list_a=[]
    temp=str(i)
    temp=temp.lower()
    temp2 = ""
    #for j in range(len(temp)):
    for j in tqdm_notebook(temp.split(' ')):
        temp2 = temp2 + " " + porter.stem(j)
    list_a=regexp_tokenize(temp2, "[a-z]['a-z]*")
    test_list.append(list_a)

HBox(children=(IntProgress(value=0, max=21), HTML(value='')))

HBox(children=(IntProgress(value=0, max=27835), HTML(value='')))

HBox(children=(IntProgress(value=0, max=14224), HTML(value='')))

HBox(children=(IntProgress(value=0, max=14839), HTML(value='')))

HBox(children=(IntProgress(value=0, max=14437), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17197), HTML(value='')))

HBox(children=(IntProgress(value=0, max=19492), HTML(value='')))

HBox(children=(IntProgress(value=0, max=11549), HTML(value='')))

HBox(children=(IntProgress(value=0, max=8881), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13806), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4775), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10577), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13538), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5652), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5274), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6425), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5592), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5647), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5613), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4866), HTML(value='')))

HBox(children=(IntProgress(value=0, max=7411), HTML(value='')))




In [15]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import nltk
#nltk.download('stopwords')
stopWords = list(set(stopwords.words('english')))
#print(stopWords)

for i in range(len(test_list)):
    temp = []
    #for j in range(len(lyrics[i])):
    for j in test_list[i]:
        #print(j)
        if j not in stopWords:
            temp.append(j)
            #print(temp)
    test_list[i] = temp

In [16]:
lyrics_term = []
for i in range(len(test_list)):
    temp = {}
    for j in range(len(test_list[i])):
        try:
            temp[test_list[i][j]] += 1
        except KeyError:
            temp[test_list[i][j]] = 1
    lyrics_term.append(temp)

In [17]:
all_term = []
for i in range(len(lyrics)):
    for j in range(len(test_list[i])):
        all_term.append(test_list[i][j])
all_term = set(all_term)
print(len(all_term))

13656


In [18]:
all_lyrics_term = []
for i in range(len(lyrics_term)):
    temp = {}
    for j in all_term:
        try:
            temp[j] = lyrics_term[i][j]
        except KeyError:
            temp[j] = 0
    all_lyrics_term.append(temp)

In [19]:
a = {}
for i in range(len(singer)):
    a[singer[i]] = all_lyrics_term[i]

In [20]:
df = pd.DataFrame(a, columns=singer)

In [21]:
df

Unnamed: 0,eminem,ghostface-killah,50-cent,dr-dre,drake,2pac,beyonce-knowles,ariana-grande,chris-brown,glen-campbell,...,ed-sheeran,garth-brooks,billy-ray-cyrus,brad-paisley,bryan-adams,eddy-arnold,beatles,coldplay,the-calling,the-doors
a',0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
a'rattlin',0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
a'ringing,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
aa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
aaaaah,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
aaaallll,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aah,0,0,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,3,0,0,1
aahm,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
ab,2,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
aback,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
term = list(df.T.columns)

In [23]:
a = df.T
original = a.copy()
a

Unnamed: 0,a',a'rattlin',a'ringing,aa,aaaaah,aaaallll,aah,aahm,ab,aback,...,zod,zoe,zombi,zone,zones,zoo,zoom,zow,zulu,zz
eminem,0,0,0,0,0,0,0,0,2,0,...,2,0,2,0,0,0,3,0,0,0
ghostface-killah,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,11,1,0,1,0
50-cent,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dr-dre,0,0,0,0,0,0,3,0,0,0,...,0,0,0,1,0,0,0,0,0,0
drake,0,0,0,0,0,1,0,0,0,1,...,0,1,0,2,0,0,0,0,0,1
2pac,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,1,0,0
beyonce-knowles,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ariana-grande,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chris-brown,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
glen-campbell,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
from tqdm import tqdm_notebook
count2 = 0
for i in tqdm_notebook(term):
    count = 0
   
    for j in range(len(singer)):
        if a.iloc[j][i] != 0:
            count += 1
    if count > (len(singer) / 3):
        count2 += 1
        #a.drop_duplicates(i, keep=False)
        a = a.drop(i, axis=1)
print(count2)

HBox(children=(IntProgress(value=0, max=13656), HTML(value='')))


1119


In [25]:
a

Unnamed: 0,a',a'rattlin',a'ringing,aa,aaaaah,aaaallll,aah,aahm,ab,aback,...,zl,zod,zoe,zombi,zones,zoo,zoom,zow,zulu,zz
eminem,0,0,0,0,0,0,0,0,2,0,...,17,2,0,2,0,0,3,0,0,0
ghostface-killah,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,11,1,0,1,0
50-cent,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dr-dre,0,0,0,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
drake,0,0,0,0,0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
2pac,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
beyonce-knowles,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ariana-grande,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chris-brown,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
glen-campbell,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
from numpy import dot
from numpy.linalg import norm
import numpy as np
def cos_sim(A, B):
       return round(dot(A, B)/(norm(A)*norm(B)), 3)

In [27]:
test = a.copy()
a = a.T
a
#word_list=list(a.columns)
word_list=list(test.columns)

In [28]:
from operator import itemgetter
first_singer=[]
second_singer = []
for i in range(len(test.iloc[0])):
    first_singer.append([test.iloc[0][i],word_list[i]])
    second_singer.append([test.iloc[1][i],word_list[i]])
    
first_singer.sort(key=itemgetter(0),reverse=True)
second_singer.sort(key=itemgetter(0),reverse=True)

#b = np.asarray(a.iloc[:][singer[0]])
#c = np.asarray(a.iloc[:][singer[1]])

In [29]:
aaa = []
for i in range(len(singer)):
    aaa.append(np.asarray(a.iloc[:][singer[i]]))

# remove 1/2

In [30]:
cos_sim_a=[]

for i in range(len(singer)):
    list_sim=[]
    for j in range(len(singer)):
        list_sim.append(cos_sim(aaa[i],aaa[j]))
  
    cos_sim_a.append(list_sim)
for i in range(len(singer)):
    print(cos_sim_a[i])

[1.0, 0.18, 0.172, 0.15, 0.123, 0.129, 0.06, 0.064, 0.095, 0.056, 0.048, 0.102, 0.074, 0.065, 0.08, 0.036, 0.058, 0.055, 0.059, 0.05, 0.072]
[0.18, 1.0, 0.208, 0.151, 0.074, 0.188, 0.067, 0.023, 0.088, 0.052, 0.063, 0.082, 0.054, 0.049, 0.065, 0.076, 0.048, 0.048, 0.052, 0.028, 0.077]
[0.172, 0.208, 1.0, 0.178, 0.09, 0.28, 0.055, 0.031, 0.081, 0.04, 0.044, 0.067, 0.04, 0.051, 0.05, 0.089, 0.043, 0.022, 0.029, 0.028, 0.048]
[0.15, 0.151, 0.178, 1.0, 0.059, 0.211, 0.056, 0.015, 0.054, 0.015, 0.042, 0.036, 0.04, 0.025, 0.027, 0.075, 0.026, 0.039, 0.032, 0.014, 0.03]
[0.123, 0.074, 0.09, 0.059, 1.0, 0.081, 0.039, 0.046, 0.1, 0.036, 0.044, 0.06, 0.03, 0.023, 0.038, 0.034, 0.02, 0.036, 0.019, 0.015, 0.025]
[0.129, 0.188, 0.28, 0.211, 0.081, 1.0, 0.04, 0.029, 0.057, 0.046, 0.042, 0.051, 0.045, 0.035, 0.031, 0.152, 0.044, 0.029, 0.023, 0.02, 0.069]
[0.06, 0.067, 0.055, 0.056, 0.039, 0.04, 1.0, 0.058, 0.042, 0.014, 0.04, 0.033, 0.021, 0.027, 0.022, 0.022, 0.015, 0.015, 0.028, 0.009, 0.035]
[0.0

# original(don't remove)

In [31]:
original = original.T
original_word_list=list(original.columns)

bbb = []
for i in range(len(singer)):
    bbb.append(np.asarray(original.iloc[:][singer[i]]))
cos_sim_b=[]

for i in range(len(singer)):
    list_sim=[]
    for j in range(len(singer)):
        list_sim.append(cos_sim(bbb[i],bbb[j]))
  
    cos_sim_b.append(list_sim)
for i in range(len(singer)):
    print(cos_sim_b[i])

[1.0, 0.782, 0.786, 0.807, 0.829, 0.704, 0.717, 0.528, 0.812, 0.584, 0.668, 0.685, 0.545, 0.635, 0.728, 0.536, 0.519, 0.525, 0.542, 0.621, 0.461]
[0.782, 1.0, 0.794, 0.792, 0.742, 0.719, 0.638, 0.475, 0.775, 0.516, 0.608, 0.51, 0.485, 0.536, 0.617, 0.49, 0.443, 0.458, 0.449, 0.491, 0.485]
[0.786, 0.794, 1.0, 0.821, 0.766, 0.785, 0.618, 0.476, 0.808, 0.463, 0.614, 0.534, 0.432, 0.511, 0.595, 0.485, 0.414, 0.407, 0.469, 0.526, 0.431]
[0.807, 0.792, 0.821, 1.0, 0.758, 0.767, 0.636, 0.469, 0.771, 0.487, 0.598, 0.552, 0.48, 0.538, 0.625, 0.502, 0.441, 0.436, 0.459, 0.543, 0.458]
[0.829, 0.742, 0.766, 0.758, 1.0, 0.693, 0.647, 0.502, 0.788, 0.537, 0.627, 0.604, 0.469, 0.558, 0.66, 0.493, 0.457, 0.46, 0.479, 0.581, 0.461]
[0.704, 0.719, 0.785, 0.767, 0.693, 1.0, 0.577, 0.443, 0.698, 0.447, 0.549, 0.476, 0.439, 0.488, 0.561, 0.479, 0.405, 0.378, 0.44, 0.485, 0.445]
[0.717, 0.638, 0.618, 0.636, 0.647, 0.577, 1.0, 0.56, 0.703, 0.538, 0.69, 0.563, 0.469, 0.545, 0.667, 0.586, 0.409, 0.499, 0.525, 

In [32]:
print(singer)

['eminem', 'ghostface-killah', '50-cent', 'dr-dre', 'drake', '2pac', 'beyonce-knowles', 'ariana-grande', 'chris-brown', 'glen-campbell', 'ciara', 'ed-sheeran', 'garth-brooks', 'billy-ray-cyrus', 'brad-paisley', 'bryan-adams', 'eddy-arnold', 'beatles', 'coldplay', 'the-calling', 'the-doors']


In [33]:
print(rep_data.keys())

dict_keys(['eminem', 'ghostface-killah', '50-cent', 'dr-dre', 'drake', '2pac', 'beyonce-knowles', 'ariana-grande', 'chris-brown', 'glen-campbell', 'ciara', 'ed-sheeran', 'garth-brooks', 'billy-ray-cyrus', 'brad-paisley', 'bryan-adams', 'eddy-arnold', 'beatles', 'coldplay', 'the-calling', 'the-doors'])


# cosine similarity with original data

# Max accuracy

In [34]:
for i in range(len(cos_sim_b)):
    test = cos_sim_b[i][:]
    test.pop(test.index(max(test)))
    a = cos_sim_b[i].index(max(test))
    print("" + singer[i] +"      "  + singer[a]+ " :   " + str(max(test)))

eminem      drake :   0.829
ghostface-killah      50-cent :   0.794
50-cent      dr-dre :   0.821
dr-dre      50-cent :   0.821
drake      eminem :   0.829
2pac      50-cent :   0.785
beyonce-knowles      eminem :   0.717
ariana-grande      ciara :   0.617
chris-brown      eminem :   0.812
glen-campbell      brad-paisley :   0.671
ciara      beyonce-knowles :   0.69
ed-sheeran      eminem :   0.685
garth-brooks      brad-paisley :   0.638
billy-ray-cyrus      brad-paisley :   0.702
brad-paisley      eminem :   0.728
bryan-adams      brad-paisley :   0.601
eddy-arnold      billy-ray-cyrus :   0.653
beatles      brad-paisley :   0.619
coldplay      the-calling :   0.605
the-calling      brad-paisley :   0.717
the-doors      brad-paisley :   0.552


# Min accuracy

In [39]:
for i in range(len(cos_sim_b)):
    a = cos_sim_b[i].index(min(cos_sim_b[i]))
    print("" + singer[i] +"/"  + singer[a]+ " :   " + str(min(cos_sim_b[i])))

eminem/the-doors :   0.461
ghostface-killah/eddy-arnold :   0.443
50-cent/beatles :   0.407
dr-dre/beatles :   0.436
drake/eddy-arnold :   0.457
2pac/beatles :   0.378
beyonce-knowles/eddy-arnold :   0.409
ariana-grande/the-doors :   0.395
chris-brown/eddy-arnold :   0.389
glen-campbell/2pac :   0.447
ciara/eddy-arnold :   0.429
ed-sheeran/the-doors :   0.373
garth-brooks/ed-sheeran :   0.409
billy-ray-cyrus/2pac :   0.488
brad-paisley/the-doors :   0.552
bryan-adams/eddy-arnold :   0.428
eddy-arnold/chris-brown :   0.389
beatles/2pac :   0.378
coldplay/ariana-grande :   0.409
the-calling/2pac :   0.485
the-doors/ed-sheeran :   0.373


# cosine similarity between removed data

# Max accuracy

In [36]:
for i in range(len(cos_sim_a)):
    test = cos_sim_a[i][:]
    test.pop(test.index(max(test)))
    a = cos_sim_a[i].index(max(test))
    print("" + singer[i] +"      "  + singer[a]+ " :   " + str(max(test)))

eminem      ghostface-killah :   0.18
ghostface-killah      50-cent :   0.208
50-cent      2pac :   0.28
dr-dre      2pac :   0.211
drake      eminem :   0.123
2pac      50-cent :   0.28
beyonce-knowles      ghostface-killah :   0.067
ariana-grande      ed-sheeran :   0.094
chris-brown      brad-paisley :   0.108
glen-campbell      eddy-arnold :   0.1
ciara      ghostface-killah :   0.063
ed-sheeran      eminem :   0.102
garth-brooks      billy-ray-cyrus :   0.115
billy-ray-cyrus      garth-brooks :   0.115
brad-paisley      chris-brown :   0.108
bryan-adams      2pac :   0.152
eddy-arnold      glen-campbell :   0.1
beatles      eminem :   0.055
coldplay      ed-sheeran :   0.072
the-calling      glen-campbell :   0.092
the-doors      ghostface-killah :   0.077


# Min accuracy

In [37]:
for i in range(len(cos_sim_a)):
    a = cos_sim_a[i].index(min(cos_sim_a[i]))
    print("" + singer[i] +"/"  + singer[a]+ " :   " + str(min(cos_sim_a[i])))

eminem/bryan-adams :   0.036
ghostface-killah/ariana-grande :   0.023
50-cent/beatles :   0.022
dr-dre/the-calling :   0.014
drake/the-calling :   0.015
2pac/the-calling :   0.02
beyonce-knowles/the-calling :   0.009
ariana-grande/eddy-arnold :   0.014
chris-brown/glen-campbell :   0.012
glen-campbell/ciara :   0.006
ciara/glen-campbell :   0.006
ed-sheeran/chris-brown :   0.03
garth-brooks/chris-brown :   0.017
billy-ray-cyrus/ciara :   0.014
brad-paisley/ariana-grande :   0.017
bryan-adams/ciara :   0.013
eddy-arnold/ciara :   0.012
beatles/coldplay :   0.014
coldplay/beatles :   0.014
the-calling/ciara :   0.007
the-doors/ciara :   0.018
