In [1]:
import numpy as np
import pandas as pd
import math
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize

In [2]:
def _singer_song_dic(data):
    '''Build singer-song dictionay
    
    Dictionary format:
    {singer_name: {song_name: lyrics}}
    '''
    
    singer_songs = {}
    
    for i in range(len(data)):
        singer = data.iloc[i]['artist']
        song = data.iloc[i]['song']
        lyric = data.iloc[i]['lyrics']
        
        
        if singer not in singer_songs:
            temp = {song: lyric}
            singer_songs[singer] = temp
        else:
            singer_songs[singer][song] = lyric
            
    return singer_songs

In [3]:
# This function is tested
def _update_inverted_index(name, lyrics, inverted_index):
    '''Add one song's term frequencies to the inverted index.

    The lyrics are lower-cased, every character that is not a word
    character, whitespace or an apostrophe is replaced by a space, and the
    result is split on whitespace. English stop words are skipped.

    Args:
        name: song name stored in every posting created here
        lyrics(str): lyrics text of the song
        inverted_index(dic): {term: [(song, tf),]}, updated in place

    Return:
        None
    '''

    non_term_chars = re.compile(r'[^\w\s\']')
    ignored = set(stopwords.words('english'))

    # Term frequency inside this song
    term_counts = {}
    for token in non_term_chars.sub(' ', lyrics.lower()).split():
        if token not in ignored:
            term_counts[token] = term_counts.get(token, 0) + 1

    # One (song, tf) posting per distinct term of the song
    for token, count in term_counts.items():
        inverted_index.setdefault(token, []).append((name, count))

In [4]:
def _compute_weight(tf, df, n_songs):
    
    idf = math.log(n_songs / df)

    tf_normalized = 1 + math.log(tf)

    weight = tf_normalized * idf
    return weight

In [5]:
def _build_song_vector(inverted_index, n_songs):
    '''Build term-weight vector for songs
    
    Compute the tf-idf weight, {term: weight}
    
    return:
        dic: a dictionary which format is {song: {term: weight}}
    '''
    
    song_vectors = {}
    
    for term, posting in inverted_index.items():
        df = len(posting)
        for pair in posting:
            song, tf = pair
            weight = _compute_weight(tf, df, n_songs)
            if song not in song_vectors:  # Create song vec
                song_vectors[song] = {term: weight}
            else:
                song_vectors[song][term] = weight  # Add new term into vec
                
    return song_vectors

In [6]:
def _get_top_songs(inverted_index, terms):
    '''Count rep terms' frequency, use that to pick rep songs
    
    Args:
        inverted_index: inverted index of this singer's songs
            format => {term: [(song, tf),]}
        terms: this singer's representive terms
    Return:
        top_songs(list): a list that contains the name of songs, order by score
    '''
    
    song_scores = {}
    for term in terms:
        posting = inverted_index[term]
        for song, _ in posting:
            if song in song_scores:
                song_scores[song] += 1
            else:
                song_scores[song] = 1
                
    top_songs = sorted(song_scores.items(), key=lambda kv: kv[1], reverse=True)
    top_songs = [name for name, _ in top_songs]
    return top_songs

In [7]:

def _find_rep_songs(songs, num_of_songs, num_of_terms):
    '''Find the most representive songs of this singer
    
    Choose songs by compare the scores which is sum the weight of each important term in that song.
    
    Args:
        songs(dic): a dictionary {song: lyrics}
        num_of_songs: number of representive songs you want to choose
        num_of_terms: number of important words we want to score the songs
        
    Return:
        rep_songs(dic): a dictionary {song: lyrics}, size will less or equal to num_of_songs
    '''
    
    inverted_index = {}
    
    n_songs = len(songs)
    if n_songs <= num_of_songs:  # Do not need to choose if not has enough songs
        return songs
    
    # Build inverted index
    for name, lyrics in songs.items():
        _update_inverted_index(name, lyrics, inverted_index)
        
    # build song vector
    song_vectors = _build_song_vector(inverted_index, n_songs)
    
    # sort by weight, get top words (put them into a set)
    selected_terms = set()
    
    for song, vector in song_vectors.items():
        sorted_v = sorted(vector.items(), key=lambda kv: kv[1], reverse=True)
        select_range = sorted_v[:num_of_terms + 1]
        for term, _ in select_range:
            selected_terms.add(term)
    
    # get score of each song
    song_sorted = _get_top_songs(inverted_index, selected_terms)
    selected_songs = song_sorted[:num_of_songs + 1]
    top_songs = {}
    for name, lyrics in songs.items():
        if name in selected_songs:
            top_songs[name] = lyrics
    
    return top_songs

In [8]:
def get_rep_songs(num_of_songs, num_of_terms, file_path, output_path='top_songs.pickle'):
    '''Get each singer's representative songs.

    Args:
        num_of_songs: number of songs you want to get from each singer
        num_of_terms: number of top-weighted terms per song used for scoring
        file_path: path of the input CSV file; it is read with
            pandas.read_csv and passed to _singer_song_dic
        output_path: where the result is pickled. Defaults to
            'top_songs.pickle' in the working directory; pass None to skip
            writing the file.
    Return:
        dic: A dictionary which format is {singer: {song: lyrics}}
    '''

    # Read data from file
    data = pd.read_csv(file_path)
    singer_songs = _singer_song_dic(data)

    representive_songs = {
        singer: _find_rep_songs(songs, num_of_songs, num_of_terms)
        for singer, songs in singer_songs.items()
    }

    # Persist the selection so later runs can reuse it without recomputing
    if output_path is not None:
        with open(output_path, 'wb') as handle:
            pickle.dump(representive_songs, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return representive_songs

In [9]:
# Trial run: ask for 20 songs per singer, scoring with 50 terms per song
files = 'mylyrics00.csv'
rep_data = get_rep_songs(num_of_songs=20, num_of_terms=50, file_path=files)

In [10]:
import pandas as pd

data = pd.read_csv('mylyrics00.csv')
singer = list(rep_data)
print(singer)

# One text per singer: every selected lyric, each prefixed by a single space
lyrics = [
    "".join(" " + text for text in rep_data[name].values())
    for name in singer
]
    

['eminem', 'ghostface-killah', '50-cent', 'dr-dre', 'drake', '2pac', 'beyonce-knowles', 'ariana-grande', 'chris-brown', 'glen-campbell', 'ciara', 'ed-sheeran', 'garth-brooks', 'billy-ray-cyrus', 'brad-paisley', 'bryan-adams', 'eddy-arnold', 'beatles', 'coldplay', 'the-calling', 'the-doors']


In [11]:
# Number of songs kept for 'eminem'
print(len(rep_data['eminem']))

21


In [12]:
# Inspect the concatenated lyrics text of the first singer in `singer`
print(lyrics[0])

 [Intro: Eminem]
Welcome to Detroit
This is the BET, Shady 2.0 Cypher 2011
Myself, Slaughterhouse and Yelawolf
White Dawg get em
[Verse 1: Yelawolf]
Put these muthafuckas in a box and I send em away
Put em in a gray 'llac and pop the trunk
And throw em in the back, jack; ha, dig em a grave
Put a brick inside that Xerox when I print up a page
Moving keys I can relate, cause I live in the cage
I throw up the A, I take em to school
I give them a grade
An easy E for effort, that's WWA
White with an attitude
Alphabet soup is on my plate
All I got is Z's they sleeping on me
I can't get em awake
I spoon feed em a sound
In a room full of deceivers and clowns
Who believe in making it rain cause all they see is the clouds
And I watch from the couch of the VIP like a potato
With a bunch of meatheads, like fuck it
I'll just feed em a cow
Plenty of White boys to pick from this year
But before you pick a pepper, you better pick up your heater
Cause even Peter Piper could pick up a mic
But what it's 

In [13]:
# Character count of the first singer's concatenated lyrics
print(len(lyrics[0]))

158806


In [14]:
from nltk.tokenize.regexp import regexp_tokenize


# Lower-case each singer's text and keep tokens that start with a letter
# and continue with letters or apostrophes
test_list = [
    regexp_tokenize(str(text).lower(), "[a-z]['a-z]*")
    for text in lyrics
]

In [15]:
# Number of tokenized documents (one per entry of `lyrics`)
print(len(test_list))

21


In [16]:
# Tokens of the first document
print(test_list[0])

['intro', 'eminem', 'welcome', 'to', 'detroit', 'this', 'is', 'the', 'bet', 'shady', 'cypher', 'myself', 'slaughterhouse', 'and', 'yelawolf', 'white', 'dawg', 'get', 'em', 'verse', 'yelawolf', 'put', 'these', 'muthafuckas', 'in', 'a', 'box', 'and', 'i', 'send', 'em', 'away', 'put', 'em', 'in', 'a', 'gray', 'llac', 'and', 'pop', 'the', 'trunk', 'and', 'throw', 'em', 'in', 'the', 'back', 'jack', 'ha', 'dig', 'em', 'a', 'grave', 'put', 'a', 'brick', 'inside', 'that', 'xerox', 'when', 'i', 'print', 'up', 'a', 'page', 'moving', 'keys', 'i', 'can', 'relate', 'cause', 'i', 'live', 'in', 'the', 'cage', 'i', 'throw', 'up', 'the', 'a', 'i', 'take', 'em', 'to', 'school', 'i', 'give', 'them', 'a', 'grade', 'an', 'easy', 'e', 'for', 'effort', "that's", 'wwa', 'white', 'with', 'an', 'attitude', 'alphabet', 'soup', 'is', 'on', 'my', 'plate', 'all', 'i', 'got', 'is', "z's", 'they', 'sleeping', 'on', 'me', 'i', "can't", 'get', 'em', 'awake', 'i', 'spoon', 'feed', 'em', 'a', 'sound', 'in', 'a', 'room', 

In [17]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import nltk
#nltk.download('stopwords')
stopWords = list(set(stopwords.words('english')))

# Membership tests against a list scan the whole list for every token;
# a set makes each lookup constant time.
stop_word_lookup = set(stopWords)

# Drop English stop words from every document, keeping token order
for idx, tokens in enumerate(test_list):
    test_list[idx] = [tok for tok in tokens if tok not in stop_word_lookup]

In [18]:
# Per-document term frequency: one {term: count} dict per entry of test_list
lyrics_term = []
for tokens in test_list:
    counts = {}
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1
    lyrics_term.append(counts)

In [19]:
# Vocabulary: every distinct token across all documents
all_term = set()
for doc_idx in range(len(lyrics)):
    all_term.update(test_list[doc_idx])
print(len(all_term))

14153


In [20]:
# Expand each document's counts over the whole vocabulary (0 when absent)
all_lyrics_term = [
    {word: counts.get(word, 0) for word in all_term}
    for counts in lyrics_term
]

In [21]:
# Key each vocabulary-wide count dict by its singer (same order as `singer`)
a = {name: all_lyrics_term[pos] for pos, name in enumerate(singer)}

In [22]:
# Term-count table built from {singer: {term: count}}: one column per singer
# (ordered as in `singer`), one row per term
df = pd.DataFrame(a, columns=singer)

In [23]:
# Display the term-count table (rows are terms, columns are singers)
df

Unnamed: 0,eminem,ghostface-killah,50-cent,dr-dre,drake,2pac,beyonce-knowles,ariana-grande,chris-brown,glen-campbell,...,ed-sheeran,garth-brooks,billy-ray-cyrus,brad-paisley,bryan-adams,eddy-arnold,beatles,coldplay,the-calling,the-doors
a',0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
a'rattlin',0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
a'ringing,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
aaaaah,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
aaaallll,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aae,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
aah,0,0,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,3,0,0,1
aahm,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
ab,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aback,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Term labels: the row index of df, i.e. the columns of its transpose
term = list(df.index)

In [25]:
# Reorient to one row per singer and one column per term;
# `original` keeps an independent copy of that table
a = df.transpose()
original = a.copy()
a

Unnamed: 0,a',a'rattlin',a'ringing,aaaaah,aaaallll,aae,aah,aahm,ab,aback,...,zombie,zombies,zone,zones,zoo,zoom,zooming,zow,zulu,zz
eminem,0,0,0,0,0,0,0,0,2,0,...,2,0,0,0,0,2,1,0,0,0
ghostface-killah,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,11,0,1,0,1,0
50-cent,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dr-dre,0,0,0,0,0,0,3,0,0,0,...,0,0,1,0,0,0,0,0,0,0
drake,0,0,0,0,1,0,0,0,0,1,...,0,0,2,0,0,0,0,0,0,1
2pac,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,1,0,0
beyonce-knowles,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ariana-grande,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chris-brown,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
glen-campbell,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Print every term label collected in `term`
print(term)



In [27]:
from tqdm import tqdm_notebook  # import kept in case a later cell uses the name

# Document frequency of each term = number of singer rows with a non-zero
# count. Computed for all columns at once instead of one scalar lookup per
# (term, singer) pair followed by one full-frame drop per removed term.
singers_using_term = (a != 0).sum(axis=0)

# Terms used by more than half of the singers are too common to discriminate
too_common = singers_using_term[singers_using_term > (len(singer) / 2)].index

count2 = len(too_common)
a = a.drop(too_common, axis=1)
print(count2)

HBox(children=(IntProgress(value=0, max=14153), HTML(value='')))


663


In [28]:
# Display the singer-by-term table after the widely shared terms were dropped
a

Unnamed: 0,a',a'rattlin',a'ringing,aaaaah,aaaallll,aae,aah,aahm,ab,aback,...,zombie,zombies,zone,zones,zoo,zoom,zooming,zow,zulu,zz
eminem,0,0,0,0,0,0,0,0,2,0,...,2,0,0,0,0,2,1,0,0,0
ghostface-killah,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,11,0,1,0,1,0
50-cent,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dr-dre,0,0,0,0,0,0,3,0,0,0,...,0,0,1,0,0,0,0,0,0,0
drake,0,0,0,0,1,0,0,0,0,1,...,0,0,2,0,0,0,0,0,0,1
2pac,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,1,0,0
beyonce-knowles,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ariana-grande,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chris-brown,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
glen-campbell,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
from numpy import dot
from numpy.linalg import norm
import numpy as np
def cos_sim(A, B):
    '''Return the cosine similarity of two vectors, rounded to 3 decimals.

    Args:
        A, B: 1-D array-likes of equal length

    Return:
        dot(A, B) / (norm(A) * norm(B)) rounded to 3 decimals, or 0.0 when
        either vector has zero length.
    '''
    denominator = norm(A) * norm(B)
    if denominator == 0:
        # A zero vector has no direction; report "no similarity" instead of
        # dividing by zero, which yields nan and a RuntimeWarning.
        return 0.0
    return round(dot(A, B) / denominator, 3)

In [30]:
# `test` keeps a copy of the table as it is now, and `word_list` its column
# labels; `a` is then transposed so the later cells can select a singer as
# a column
test = a.copy()
word_list = list(test.columns)
a = a.T

In [31]:
from operator import itemgetter

# Pair every term with its count for the first two rows of `test`.
# Each row is read once as an array: the previous test.iloc[0][i] rebuilt
# the row for every term and used an integer position on a label-indexed
# Series, which newer pandas versions deprecate.
first_counts = test.iloc[0].values
second_counts = test.iloc[1].values

first_singer = [[count, word] for count, word in zip(first_counts, word_list)]
second_singer = [[count, word] for count, word in zip(second_counts, word_list)]

# Highest count first; the sort is stable, so ties keep column order
first_singer.sort(key=itemgetter(0), reverse=True)
second_singer.sort(key=itemgetter(0), reverse=True)

In [32]:
# [count, word] pairs for the first row of `test`, highest count first
print(first_singer)






In [33]:
# [count, word] pairs for the second row of `test`, highest count first
print(second_singer)






In [34]:
# One count vector per singer, taken from that singer's column of `a`
aaa = [np.asarray(a[name]) for name in singer]

In [35]:
# Upper-triangular pairwise similarities: row i holds cos_sim of singer i
# against every later singer in `singer`
cos_sim_a = [
    [cos_sim(aaa[i], aaa[j]) for j in range(i + 1, len(singer))]
    for i in range(len(singer))
]
for row in cos_sim_a:
    print(row)

[0.321, 0.316, 0.296, 0.235, 0.256, 0.116, 0.078, 0.298, 0.076, 0.14, 0.154, 0.112, 0.103, 0.124, 0.076, 0.08, 0.083, 0.069, 0.058, 0.095]
[0.472, 0.366, 0.276, 0.375, 0.146, 0.038, 0.327, 0.058, 0.141, 0.114, 0.074, 0.07, 0.084, 0.089, 0.058, 0.072, 0.06, 0.041, 0.095]
[0.392, 0.296, 0.439, 0.097, 0.056, 0.334, 0.048, 0.151, 0.079, 0.051, 0.07, 0.066, 0.096, 0.049, 0.052, 0.034, 0.048, 0.095]
[0.233, 0.378, 0.089, 0.039, 0.251, 0.033, 0.11, 0.077, 0.056, 0.069, 0.062, 0.09, 0.04, 0.043, 0.037, 0.033, 0.058]
[0.221, 0.094, 0.047, 0.235, 0.069, 0.135, 0.073, 0.057, 0.052, 0.073, 0.056, 0.044, 0.052, 0.029, 0.043, 0.036]
[0.102, 0.033, 0.249, 0.069, 0.11, 0.074, 0.057, 0.079, 0.055, 0.142, 0.058, 0.038, 0.031, 0.049, 0.065]
[0.045, 0.131, 0.025, 0.161, 0.049, 0.032, 0.029, 0.032, 0.036, 0.019, 0.026, 0.076, 0.016, 0.051]
[0.038, 0.071, 0.05, 0.075, 0.052, 0.048, 0.027, 0.023, 0.025, 0.041, 0.018, 0.018, 0.031]
[0.028, 0.195, 0.058, 0.031, 0.046, 0.046, 0.053, 0.029, 0.062, 0.031, 0.028, 

In [36]:
# Same pairwise comparison on the copy saved in `original`, transposed here
# so that a singer can be selected as a column
original = original.T
original_word_list = list(original.columns)

bbb = [np.asarray(original[name]) for name in singer]

# Row i holds cos_sim of singer i against every later singer in `singer`
cos_sim_b = [
    [cos_sim(bbb[i], bbb[j]) for j in range(i + 1, len(singer))]
    for i in range(len(singer))
]
for row in cos_sim_b:
    print(row)

[0.777, 0.801, 0.789, 0.833, 0.695, 0.692, 0.498, 0.828, 0.523, 0.648, 0.667, 0.471, 0.546, 0.689, 0.526, 0.474, 0.451, 0.484, 0.567, 0.438]
[0.773, 0.767, 0.742, 0.698, 0.622, 0.459, 0.766, 0.474, 0.599, 0.487, 0.436, 0.495, 0.587, 0.493, 0.413, 0.406, 0.415, 0.452, 0.438]
[0.79, 0.755, 0.765, 0.623, 0.478, 0.799, 0.44, 0.625, 0.537, 0.417, 0.495, 0.602, 0.521, 0.417, 0.382, 0.458, 0.509, 0.42]
[0.73, 0.735, 0.621, 0.454, 0.764, 0.443, 0.586, 0.539, 0.438, 0.467, 0.597, 0.514, 0.424, 0.391, 0.418, 0.509, 0.425]
[0.67, 0.629, 0.47, 0.797, 0.503, 0.605, 0.582, 0.411, 0.491, 0.627, 0.491, 0.433, 0.405, 0.422, 0.522, 0.429]
[0.567, 0.431, 0.693, 0.424, 0.551, 0.472, 0.405, 0.456, 0.538, 0.504, 0.414, 0.359, 0.423, 0.452, 0.426]
[0.537, 0.692, 0.493, 0.681, 0.54, 0.428, 0.508, 0.625, 0.592, 0.41, 0.476, 0.495, 0.562, 0.459]
[0.484, 0.509, 0.61, 0.473, 0.397, 0.499, 0.532, 0.539, 0.429, 0.422, 0.372, 0.552, 0.44]
[0.468, 0.656, 0.562, 0.409, 0.501, 0.623, 0.527, 0.412, 0.432, 0.449, 0.506, 

In [37]:
# Singer order that indexes the rows of the similarity lists
print(singer)

['eminem', 'ghostface-killah', '50-cent', 'dr-dre', 'drake', '2pac', 'beyonce-knowles', 'ariana-grande', 'chris-brown', 'glen-campbell', 'ciara', 'ed-sheeran', 'garth-brooks', 'billy-ray-cyrus', 'brad-paisley', 'bryan-adams', 'eddy-arnold', 'beatles', 'coldplay', 'the-calling', 'the-doors']


In [38]:
# Singer keys present in rep_data
print(rep_data.keys())

dict_keys(['eminem', 'ghostface-killah', '50-cent', 'dr-dre', 'drake', '2pac', 'beyonce-knowles', 'ariana-grande', 'chris-brown', 'glen-campbell', 'ciara', 'ed-sheeran', 'garth-brooks', 'billy-ray-cyrus', 'brad-paisley', 'bryan-adams', 'eddy-arnold', 'beatles', 'coldplay', 'the-calling', 'the-doors'])
