In [49]:
import os
import sys
module_path = os.path.abspath(os.path.join('.'))
if module_path not in sys.path:
    sys.path.append(module_path)

import operator
import nltk
import pandas
import json
import math
from scipy.spatial import distance
from scipy import sparse
from pprint import pprint

In [2]:
with open('./scraper/example_output.json', 'r') as json_file:
    data = json.load(json_file)

data = [song for song in data if 'lyrics' in song.keys() and type(song['lyrics']) is str]

In [5]:
def index_field(key):
    for i, field in enumerate([song[key] for song in data if key in song.keys() and type(song[key]) is str]):
        tokens = nltk.word_tokenize(field)
        data[i]['tokens'] = tokens
        data[i]['frequency_list'] = get_frequency_list(tokens)

def get_vocabulary(corpus):
    tokens = nltk.word_tokenize(corpus)
    return set(tokens)

def get_corpus(key):
    return ' '.join([song[key] for song in data])
        
def get_frequency_list(tokens):
    return [(token, tokens.count(token), ) for token in tokens]

def get_inverted_index(vocabulary, document_data):
    result = {}
    for term in vocabulary:
        ii_entry = {}
        document_frequency = 0
        for document_id, document in enumerate(document_data):
            if term in document['tokens']:
                document_frequency += 1
            ii_entry[document_id] = document['tokens'].count(term)
        ii_entry['df'] = document_frequency
        result[term] = ii_entry
    return result

def get_vector(vocabulary, document_ii):
    return [document_ii[key[0]] if key[0] in document_ii else 0 for key in vocabulary]

index_field('lyrics')
lyrics_corpus = get_corpus('lyrics')
lyrics_vocabulary = get_vocabulary(lyrics_corpus)
lyrics_ii = get_inverted_index(lyrics_vocabulary, data)

print(lyrics_ii) 



{'corner': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0, 17: 0, 18: 0, 19: 0, 20: 0, 21: 0, 22: 0, 23: 0, 24: 0, 25: 0, 26: 0, 27: 0, 28: 0, 29: 0, 30: 0, 31: 0, 32: 0, 33: 0, 34: 0, 35: 0, 36: 0, 37: 0, 38: 0, 39: 0, 40: 0, 41: 1, 42: 0, 43: 0, 44: 0, 45: 0, 46: 0, 47: 0, 48: 0, 49: 0, 50: 0, 51: 0, 52: 0, 53: 0, 54: 0, 55: 0, 56: 0, 57: 0, 58: 0, 59: 0, 60: 0, 61: 0, 62: 0, 63: 0, 64: 0, 65: 0, 66: 0, 67: 0, 68: 0, 69: 0, 70: 0, 71: 0, 72: 0, 73: 0, 74: 0, 75: 0, 76: 0, 77: 0, 78: 0, 79: 0, 80: 0, 81: 0, 'df': 1}, 'grown': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0, 17: 0, 18: 0, 19: 0, 20: 0, 21: 0, 22: 0, 23: 0, 24: 0, 25: 0, 26: 2, 27: 0, 28: 0, 29: 0, 30: 0, 31: 0, 32: 0, 33: 0, 34: 0, 35: 0, 36: 0, 37: 0, 38: 0, 39: 0, 40: 0, 41: 0, 42: 0, 43: 0, 44: 0, 45: 0, 46: 0, 47: 0, 48: 0, 49: 0, 50: 0, 51: 0, 52: 0, 53: 0, 54: 0, 55: 0, 56: 0, 57: 0, 58: 0, 5

In [33]:
"""
Document
List of term frequencies
Weighted term list using tf-idf scoring for each document. This is a vector representing each document.
Create a vector for the query as well.
Calculate cosine similarities.
Score

Document frequency of a term, df_t is the number of documents containing that term

"""

def get_weight_vector(document_id, inverted_index):
    return [tf_idf(value[document_id], value['df'], len(value.keys()) - 1) for key, value in inverted_index.items()]

def get_query_weight_vector(query, inverted_index):
    query_tokens = nltk.word_tokenize(query)
    return [tf_idf(query_tokens.count(key), value['df'], len(value.keys()) - 1) for key, value in inverted_index.items()]

def tf_idf(tf, df, N):
    return tf * math.log(N / (1 + df), 2)

In [64]:
def score_query(query, inverted_index):
    score_list = []
    for document_id in range(len(data)):
        score_list.append(cosine_similarity(get_weight_vector(document_id, inverted_index), get_query_weight_vector(query, inverted_index)))
    return score_list
def score_query_scipy(query, inverted_index):
    document_matrix = [get_weight_vector(document_id, inverted_index) for document_id in range(len(data))]
    query_matrix = [get_query_weight_vector(query, inverted_index)]
    return [abs(-1 + dist) for dist in distance.cdist(query_matrix, document_matrix, metric='cosine')[0]]

def square_rooted(x):
    return math.sqrt(sum([a*a for a in x]))
 
def cosine_similarity(x, y):
    numerator = sum(a*b for a,b in zip(x,y))
    denominator = 1 + square_rooted(x)*square_rooted(y)
    return numerator/float(denominator)

In [65]:
print(score_query('take', lyrics_ii))
print(score_query_scipy('take', lyrics_ii))

[0.0, 0.0, 0.09519138431676541, 0.0, 0.0, 0.0, 0.0, 0.04665586888129643, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01589860327432971, 0.0, 0.0, 0.028667921058890194, 0.0, 0.014527688676022067, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.08730773310885832, 0.012619701201031593, 0.0595213056330395, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.023380476276249104, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.095388457501340618, 0.0, 0.0, 0.0, 0.0, 0.046703160765924734, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.015926080440970547, 0.0, 0.0, 0.02871258350918704, 0.0, 0.014550628090857942, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.087426064418483107, 0.012637007228009201, 0.059617575603655215, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.023410174609855394, 0.0, 0.0, 0.0, 0.0,

In [52]:
XA = sparse.csr_matrix([[0, 0, 1, 2, 0, 0, 5]])
XB = sparse.csr_matrix([[0, 0, 1, 2, 0, 0, 1], [0, 0, 1, 2, 0, 0, 5]])

distance.cdist(XA.todense(), XB.todense(), metric='cosine')

array([[ 0.25464401,  0.        ]])