# Find the Most Representative Songs of Each Singer

## Idea

1. Collect the lyrics of each singer's songs and use them to build a vector space model (VSM).
2. Sort each lyric vector by weight to find the most important terms of each song (e.g. the top 30 terms).
3. Pool the important terms of all songs, sort them by document frequency (df), then choose the top [100] terms as the representative terms of this singer.
4. For each lyric vector, sum the weights of the representative terms and treat the sum as that song's score.
5. Pick the top 20 songs by score.

## Build VSM For Each Singer

1. Matching singers and their songs


In [22]:
# Download the NLTK 'stopwords' corpus so that nltk.corpus.stopwords can be used.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/zgy/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [36]:
import numpy as np
import pandas as pd
import math
import re
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize


In [8]:
# Load the lyrics data set; the code below reads its 'artist', 'song' and 'lyrics' columns.
data = pd.read_csv('../mylyrics00.csv')

In [15]:
def _singer_song_dic(data):
    '''Build the singer-song dictionary.

    Args:
        data: a pandas DataFrame with 'artist', 'song' and 'lyrics' columns,
            one row per song

    Return:
        dic: a nested dictionary {singer_name: {song_name: lyrics}}. When the
            same (artist, song) pair occurs in several rows, the last row wins.
    '''

    singer_songs = {}

    # Walk the three columns in lockstep instead of calling data.iloc[i] three
    # times per row, which builds a fresh row object on every access.
    for singer, song, lyric in zip(data['artist'], data['song'], data['lyrics']):
        singer_songs.setdefault(singer, {})[song] = lyric

    return singer_songs

In [18]:
# TESTING

# Build the {singer: {song: lyrics}} mapping from the loaded data.
singer_songs = _singer_song_dic(data)

In [39]:
# This function is tested
def _update_inverted_index(name, lyrics, inverted_index, stop_words=None):
    '''Add one song's term frequencies to the inverted index.

    Lower-cases the lyrics, replaces punctuation (every character that is not
    a word character, whitespace or an apostrophe) with a space, splits on
    whitespace and drops stop words. The term frequency (tf) of every
    remaining term is counted and appended to that term's posting list.

    Args:
        name: song name, stored in the postings as the document key
        lyrics: raw lyrics text of the song; a non-string value (for example
            a missing value loaded from the CSV) is treated as a song without
            lyrics and adds nothing to the index
        inverted_index(dic): updated in place, format => {term: [(song, tf),]}
        stop_words: optional collection of terms to ignore. Defaults to the
            NLTK English stop-word list; pass a prebuilt set when indexing
            many songs to avoid reloading the list on every call.
    '''

    # Missing lyrics arrive as a non-string value; calling .lower() on it
    # would raise AttributeError, so skip such songs.
    if not isinstance(lyrics, str):
        return

    if stop_words is None:
        stop_words = set(stopwords.words('english'))

    # Apostrophes are kept so contractions such as "don't" stay one term.
    punctuation = re.compile(r'[^\w\s\']')

    ###
    # Count term frequency
    ###
    indices = {}
    for term in punctuation.sub(' ', lyrics.lower()).split():
        if term not in stop_words:
            indices[term] = indices.get(term, 0) + 1

    ###
    # Update inverted_index
    ###
    for term, frequency in indices.items():
        inverted_index.setdefault(term, []).append((name, frequency))

In [25]:
def _compute_weight(tf, df, n_songs):
    '''Return the tf-idf weight of a term in one song.

    The weight is (1 + ln(tf)) * ln(n_songs / df).

    Args:
        tf: term frequency of the term in the song
        df: number of songs that contain the term
        n_songs: total number of songs
    '''
    inverse_df = math.log(n_songs / df)
    damped_tf = 1 + math.log(tf)
    return damped_tf * inverse_df

In [26]:
def _build_song_vector(inverted_index, n_songs):
    '''Turn the inverted index into one term-weight vector per song.

    Every (song, tf) entry of every posting list is converted into a weight
    with _compute_weight, using the posting list's length as df.

    Args:
        inverted_index: format => {term: [(song, tf),]}
        n_songs: total number of songs, passed on to _compute_weight

    return:
        dic: a dictionary which format is {song: {term: weight}}
    '''

    vectors = {}

    for term, postings in inverted_index.items():
        doc_freq = len(postings)  # number of songs that contain this term
        for title, term_freq in postings:
            vectors.setdefault(title, {})[term] = _compute_weight(term_freq, doc_freq, n_songs)

    return vectors

In [74]:
def _get_top_songs(inverted_index, terms):
    '''Rank songs by how many of the representative terms they contain.

    Each song scores one point for every term of `terms` that appears in it
    (the term frequency itself is ignored).

    Args:
        inverted_index: inverted index of this singer's songs
            format => {term: [(song, tf),]}
        terms: this singer's representative terms; every term must be a key
            of inverted_index
    Return:
        top_songs(list): a list of (song, score) pairs, highest score first;
            songs with equal scores keep the order in which they were first
            encountered
    '''

    tally = {}
    for term in terms:
        for title, _tf in inverted_index[term]:
            tally[title] = tally.get(title, 0) + 1

    ranked = list(tally.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [72]:
def _find_rep_songs(songs, num_of_songs, num_of_terms):
    '''Find the most representative songs of this singer.

    Every song contributes its `num_of_terms` highest-weighted (tf-idf) terms
    to a pool of representative terms. The songs are then ranked by
    _get_top_songs over that pool and the best `num_of_songs` are kept.

    Args:
        songs(dic): a dictionary {song: lyrics}
        num_of_songs: number of representative songs you want to choose
        num_of_terms: number of top-weighted terms taken from each song

    Return:
        rep_songs(dic): a dictionary {song: lyrics}, ordered from the
            best-ranked song down; its size is less than or equal to
            num_of_songs. When the singer has no more than num_of_songs
            songs, `songs` itself is returned unchanged.
    '''

    n_songs = len(songs)
    if n_songs <= num_of_songs:  # Nothing to choose if there are not enough songs
        return songs

    # Build inverted index
    inverted_index = {}
    for name, lyrics in songs.items():
        _update_inverted_index(name, lyrics, inverted_index)

    # Build song vectors
    song_vectors = _build_song_vector(inverted_index, n_songs)

    # Collect each song's top terms. Dict keys are used as an ordered set so
    # the ranking (and its tie-breaking) is the same on every run.
    selected_terms = {}
    for vector in song_vectors.values():
        ranked_terms = sorted(vector.items(), key=lambda kv: kv[1], reverse=True)
        for term, _ in ranked_terms[:num_of_terms]:
            selected_terms[term] = None

    # Score every song, best first
    song_sorted = _get_top_songs(inverted_index, list(selected_terms))

    # Keep the best songs together with their lyrics
    return {name: songs[name] for name, _ in song_sorted[:num_of_songs]}
    

In [75]:
# TESTING

# Try the selection on one singer: ask for 20 songs, with num_of_terms = 10.
_find_rep_songs(singer_songs['eminem'], 20, 10)



In [None]:
def get_rep_songs(num_of_songs, file_path, num_of_terms=30):
    '''Get each singer's representative songs.

    Args:
        num_of_songs: number of songs you want to get from each singer
        file_path: path of the input CSV data file; it is loaded with
            pandas and handed to _singer_song_dic
        num_of_terms: number of top-weighted terms taken from each song when
            choosing the representative terms (passed to _find_rep_songs)
    Return:
        dic: A dictionary which format is {singer: {song: lyrics}}
    '''

    # Read data from file
    data = pd.read_csv(file_path)
    singer_songs = _singer_song_dic(data)

    return {
        singer: _find_rep_songs(songs, num_of_songs, num_of_terms)
        for singer, songs in singer_songs.items()
    }

In [None]:



def search():
    '''Search query matched documents

    Read one query from standard input, compute the similarity between the
    query and the indexed documents, and print the related documents in
    decreasing score order followed by the raw (doc id, score) pairs.

    Relies on names defined outside this function: `punctuation`,
    `inverted_index`, `length_of_docs`, `docs_list`, `compute_weight` and
    `compute_score`.
    '''

    query = input('Please entry your query (use Ctrl-C to exit): ')
    # query preprocessing: terms are lower-cased before they are indexed, so
    # the query has to be lower-cased too or capitalised words never match
    scores = {}
    for term in re.sub(punctuation, ' ', query.lower()).split():
        if term not in inverted_index:
            continue
        # a query term counts once (tf = 1)
        weight = compute_weight(1, len(inverted_index[term]))
        compute_score(term, weight, scores)

    # Normalise every accumulated weight product by the document length
    for doc_id in scores:
        length = length_of_docs[doc_id]
        # A zero-length vector means every weight of that document is zero,
        # so its score is zero as well; avoid dividing by zero.
        scores[doc_id] = scores[doc_id] / length if length else 0.0

    sorted_docs = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    if not sorted_docs:
        print('\nThere is no relevant document.')
    else:
        print('\nSearch results are: ')
        for doc in sorted_docs:
            print('-> ' + docs_list[doc[0]])
    print()

    print('Scores are:')
    for score in sorted_docs:
        print(score)
    print()


#
# Helper functions
#
def update_inverted_index(doc, doc_id):
    '''Index one document file: count its terms and extend the postings.

    Opens 'text/' + doc (with surrounding whitespace stripped from `doc`),
    replaces every match of `punctuation` with a space, lower-cases each
    word and skips the words found in `stopwords`. The count of every
    remaining term is appended to `inverted_index` as a (doc_id, tf) pair.

    NOTE(review): `punctuation`, `stopwords` and `inverted_index` are not
    defined inside this function; confirm they are bound to the objects this
    code expects before calling it.
    '''
    ###
    # Count term frequency
    ###
    term_counts = {}
    with open('text/' + doc.strip(), 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:  # nothing to count on a blank line
                continue
            for word in re.sub(punctuation, ' ', line).split():
                token = word.lower()
                if token in stopwords:
                    continue
                term_counts[token] = term_counts.get(token, 0) + 1

    ###
    # Update inverted_index
    ###
    for term, count in term_counts.items():
        inverted_index.setdefault(term, []).append((doc_id, count))


def compute_doc_length():
    '''Append the Euclidean length of every document vector to length_of_docs.

    The weights of a document are gathered from `inverted_index` (each
    posting entry is a (doc id, tf) pair, and the doc id is used as a list
    index); one length is appended per entry of `docs_list`, in index order.
    '''
    doc_count = len(docs_list)
    weights_per_doc = [[] for _ in range(doc_count)]

    # Compute weights
    for posting in inverted_index.values():
        doc_freq = len(posting)
        for doc_id, term_freq in posting:
            weights_per_doc[doc_id].append(compute_weight(term_freq, doc_freq))

    # Compute lengths
    for weights in weights_per_doc:
        squared_sum = sum(w ** 2 for w in weights)
        length_of_docs.append(math.sqrt(squared_sum))


def compute_score(term, weight_tq, scores):
    '''Add one query term's contribution to the score of every matching doc.

    For each (doc id, tf) entry in the term's posting list, the document
    weight of the term times `weight_tq` is accumulated into `scores`
    (a dictionary keyed by doc id), which is modified in place.
    '''
    postings = inverted_index[term]
    doc_freq = len(postings)
    for doc_id, term_freq in postings:
        contribution = compute_weight(term_freq, doc_freq) * weight_tq
        if doc_id in scores:
            scores[doc_id] += contribution
        else:
            scores[doc_id] = contribution


def compute_weight(tf, df):
    '''Return the tf-idf weight (1 + ln(tf)) * ln(N / df).

    N is the number of entries in `docs_list`, which is defined outside this
    function.
    '''
    inverse_df = math.log(len(docs_list) / df)
    damped_tf = 1 + math.log(tf)
    return damped_tf * inverse_df


# Script entry point: call indexing() once, then keep answering queries
# until the process is interrupted.
# NOTE(review): indexing() is not defined in the visible part of this file; confirm it exists.
if __name__ == '__main__':
    indexing()
    while(True):
        search()
