# Find the Most Representative Songs of Each Singer

## Idea

1. Take the lyrics of each singer's songs and use them to build a vector space model (VSM).
2. Sort each lyric vector by weight to find the most important terms of each song (e.g. the top 30 terms).
3. Gather the important terms of all songs, sort them by document frequency (df), then choose the top [100] terms as the representative terms of this singer.
4. For each lyric vector, sum the weights of the representative terms and treat the total as the song's score.
5. Pick the top 20 songs by score.

In [58]:
import numpy as np
import pandas as pd
import math
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize


In [59]:
def _singer_song_dic(data):
    '''Build the singer-song dictionary.

    Args:
        data: table with 'artist', 'song' and 'lyrics' columns
            (a pandas DataFrame as produced by ``pd.read_csv``)

    Return:
        dic: {singer_name: {song_name: lyrics}}. When the same singer has
        several rows with the same song name, the last row wins.
    '''

    singer_songs = {}

    # Nothing to group; also avoids touching columns of an empty table.
    if len(data) == 0:
        return singer_songs

    # Walk the three columns in lockstep instead of materialising a full row
    # object for every single field access.
    rows = zip(data['artist'], data['song'], data['lyrics'])
    for singer, song, lyric in rows:
        singer_songs.setdefault(singer, {})[song] = lyric

    return singer_songs

In [60]:
from nltk.stem.snowball import SnowballStemmer
def _update_inverted_index(name, lyrics, inverted_index):
    '''Add one song's term frequencies to the inverted index.

    The lyrics are lower-cased and tokenized into words, stop words are
    dropped, and the remaining words are stemmed. The per-song count of each
    stemmed term is then appended to that term's posting list.

    Args:
        name: song name, stored in every posting created for this song
        lyrics(str): lyrics of the song
        inverted_index(dic): updated in place,
            format => {term: [(song, tf),]}
    '''

    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')

    ###
    # Count term frequency
    ###
    indices = {}
    for token in regexp_tokenize(lyrics.lower(), "[a-z]['a-z]*"):
        # The stop-word list holds unstemmed words, so the raw token has to
        # be tested before stemming; otherwise stop words whose stem differs
        # from the word itself (e.g. "because" -> "becaus") are indexed.
        if token in stop_words:
            continue
        term = stemmer.stem(token)
        # Keep the post-stemming check too: an inflected word may stem to
        # something that is itself a stop word.
        if term in stop_words:
            continue
        indices[term] = indices.get(term, 0) + 1

    ###
    # Update inverted_index
    ###
    for term, frequency in indices.items():
        inverted_index.setdefault(term, []).append((name, frequency))

In [61]:
def _compute_weight(tf, df, n_songs):
    '''Return the tf-idf weight of a term inside one song.

    Args:
        tf: how many times the term occurs in the song
        df: how many songs contain the term
        n_songs: total number of songs

    Return:
        (1 + ln(tf)) * ln(n_songs / df)
    '''
    inverse_doc_freq = math.log(n_songs / df)
    return (1 + math.log(tf)) * inverse_doc_freq

In [62]:
def _build_song_vector(inverted_index, n_songs):
    '''Turn the inverted index into one term-weight vector per song.

    Every (song, tf) posting of a term becomes an entry in that song's
    vector, weighted by ``_compute_weight`` with the term's document
    frequency (the length of its posting list).

    Args:
        inverted_index(dic): format => {term: [(song, tf),]}
        n_songs: total number of songs

    return:
        dic: a dictionary which format is {song: {term: weight}}
    '''

    vectors = {}

    for term in inverted_index:
        entries = inverted_index[term]
        doc_freq = len(entries)
        for song, tf in entries:
            # setdefault creates the song's vector the first time it is seen
            vectors.setdefault(song, {})[term] = _compute_weight(tf, doc_freq, n_songs)

    return vectors

In [63]:
def _get_top_songs(inverted_index, terms):
    '''Rank songs by how many of the given terms they contain.

    Each term adds one point to every song in its posting list; the term
    frequency stored in the posting is not used.

    Args:
        inverted_index: inverted index of this singer's songs
            format => {term: [(song, tf),]}
        terms: this singer's representative terms; every term must be a key
            of inverted_index
    Return:
        top_songs(list): song names ordered from highest to lowest score
    '''

    hits = {}
    for term in terms:
        for title, _ in inverted_index[term]:
            hits[title] = hits.get(title, 0) + 1

    ranked = list(hits.items())
    # Stable sort: songs with equal scores keep first-seen order
    ranked.sort(key=lambda item: item[1], reverse=True)
    return [title for title, _ in ranked]

In [64]:
def _find_rep_songs(singer, songs, num_of_songs, num_of_terms):
    '''Find the most representative songs of this singer.

    The top ``num_of_terms`` tf-idf terms of every song are pooled into one
    set of representative terms. Songs are then ranked by
    ``_get_top_songs``, i.e. by how many of those terms each song contains,
    and the best ``num_of_songs`` are kept.

    Side effect: stores {term: total tf over all songs} for the pooled terms
    in the module-level ``singers_repr_terms[singer]``.

    Args:
        singer: name of the singer, used as key in singers_repr_terms
        songs(dic): a dictionary {song: lyrics}
        num_of_songs: number of representative songs you want to choose
        num_of_terms: number of top-weighted terms taken from each song

    Return:
        rep_songs(dic): a dictionary {song: lyrics} in the iteration order of
        ``songs``; its size is less than or equal to num_of_songs
    '''
    # The registry is otherwise only created by the script section under the
    # __main__ guard; create it on demand so the function also works when
    # the module is imported.
    repr_terms = globals().setdefault('singers_repr_terms', {})

    n_songs = len(songs)
    # Cannot pick more songs than the singer has
    num_of_songs = min(num_of_songs, n_songs)

    # Build inverted index
    inverted_index = {}
    for name, lyrics in songs.items():
        _update_inverted_index(name, lyrics, inverted_index)

    # Build song vectors
    song_vectors = _build_song_vector(inverted_index, n_songs)

    # Pool the highest-weighted terms of every song
    selected_terms = set()
    for vector in song_vectors.values():
        ranked = sorted(vector.items(), key=lambda kv: kv[1], reverse=True)
        selected_terms.update(term for term, _ in ranked[:num_of_terms])

    # Store this singer's representative terms with their total frequency
    # inverted index: term [song, frequency]
    repr_terms[singer] = {
        term: sum(tf for _, tf in inverted_index[term])
        for term in selected_terms
    }

    # Rank the songs and keep the best ones; a set makes the membership
    # test below constant-time.
    song_sorted = _get_top_songs(inverted_index, selected_terms)
    selected_songs = set(song_sorted[:num_of_songs])

    return {
        name: lyrics
        for name, lyrics in songs.items()
        if name in selected_songs
    }

In [65]:
def get_rep_songs(num_of_songs, num_of_terms, file_path,
                  output_path='top_songs.pickle'):
    '''Get each singer's representative songs.

    Reads the lyrics table, selects the representative songs of every
    singer with ``_find_rep_songs`` and pickles the result to
    ``output_path``.

    Args:
        num_of_songs: number of songs you want to get from each singer
        num_of_terms: number of top-weighted terms taken from each song when
            building a singer's representative terms
        file_path: path of input data file (CSV)
        output_path: where the pickled result is written; defaults to
            'top_songs.pickle' in the current working directory
    Return:
        dic: A dictionary which format is {singer: {song: lyrics}}
    '''

    # Read data from file
    data = pd.read_csv(file_path)
    singer_songs = _singer_song_dic(data)

    representive_songs = {
        singer: _find_rep_songs(singer, songs, num_of_songs, num_of_terms)
        for singer, songs in singer_songs.items()
    }

    with open(output_path, 'wb') as handle:
        pickle.dump(representive_songs, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return representive_songs

In [66]:
# Registry filled by _find_rep_songs:
# {singer: {term: total term frequency over the singer's songs}}.
# Defined at module level so it exists whether the file is run or imported.
singers_repr_terms = {}

if __name__ == '__main__':
    # TESTing part: kept under the guard so that importing the module does
    # not read the CSV or write the pickle file.
    files = 'mylyrics00.csv'
    get_rep_songs(20, 50, files)
    print(singers_repr_terms['eminem'])

{'yazi': 1, 'claustrophob': 2, 'tiffani': 2, 'texta': 1, 'mortician': 3, 'vers': 260, 'jerkin': 3, 'chromey': 1, 'quiet': 20, 'nowher': 29, 'yank': 4, 'nauseous': 7, 'ysidro': 3, 'voorhi': 2, 'whitey': 15, 'suppos': 92, 'depend': 7, 'support': 27, 'murdaaaaa': 12, 'hear': 337, 'swang': 2, 'flavour': 2, 'unstopp': 4, 'asham': 18, 'truth': 52, 'lemon': 6, 'leftier': 1, 'squatin': 1, "couple'a": 1, 'bruv': 1, 'despic': 4, 'killer': 62, 'controversi': 24, 'rueger': 1, 'leg': 107, 'drill': 8, 'administ': 4, 'sweepyhead': 2, 'halo': 3, 'smash': 33, 'sog': 1, 'younger': 2, 'titti': 9, 'cfc': 1, 'goooooo': 3, 'chainlink': 1, 'snug': 2, 'chin': 11, 'boogi': 12, 'molotov': 3, 'justin': 7, 'getcha': 5, 'hach': 2, 'conduct': 4, 'winner': 11, 'meet': 102, 'cypherin': 2, 'dorm': 1, 'non': 23, 'nigg': 13, 'ewww': 1, 'gullibl': 1, 'movin': 19, 'champ': 5, 'tyron': 1, 'oochi': 1, 'yer': 2, 'thumper': 1, 'shabazz': 2, 'fack': 33, 'spring': 7, 'grailer': 1, 'ess': 5, 'tablet': 7, 'courtney': 1, 'vacat': 