In [1]:
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
import numpy as np
import pandas as pd
import os
from copy import copy
from collections import OrderedDict
from scipy.stats import spearmanr
from tqdm import tqdm_notebook
import pickle

In [2]:
# Path of the saved model; it is handed to DtmModel.load in the next cell.
MODEL_PATH = '../models/dim_bow1000_5topics_firsttrack_songreleasedate'
# Directory prefix that is prepended to every per-artist path when the pickled
# path list is re-rooted below.
# NOTE(review): this points at bow_1000 while the pickled path list is named
# bow500 and was saved under a bow_500 directory -- confirm they correspond.
BOW_DIR = '../data/features/bow_1000/'

In [3]:
# Load model
# Restores the model saved at MODEL_PATH. Later cells read its `time_slices`,
# `num_topics` and `influences_time` attributes.
dim_model = DtmModel.load(MODEL_PATH)

In [4]:
# Load artist info
# Later cells filter this frame on 'id' and read the 'name', 'main_genre' and
# 'outdegree' columns.
artists = pd.read_csv('../data/allmusic/artists_cleaned.csv')

In [5]:
# Load song info
# Later cells filter this frame on 'artist_id' and read the 'year' column to
# order the artists chronologically.
songs = pd.read_csv('../data/artist_song_list_years_cleaned.csv')

In [6]:
# Load bow_path_by_artist: a list of (artist_id, path, year) tuples.
# NOTE(review): unpickling can execute arbitrary code -- only load this file if
# it comes from a trusted source.
# The context manager closes the file handle; the bare open() call left it open.
with open('../models/bow500_path_by_artist.pk', 'rb') as pickle_file:
    bow_path_by_artist = pickle.load(pickle_file)

In [7]:
# Re-root the pickled paths: each path was saved with the absolute prefix below,
# which is swapped for the local BOW_DIR.
OLD_BOW_PREFIX = '/n/regal/rush_lab/xue/bow_500/'

bow_path_by_artist_new = []

# `artist_id` instead of `id`, so the builtin id() is not shadowed for the rest
# of the notebook.
for (artist_id, path, year) in bow_path_by_artist:
    # Split on the first occurrence of the old prefix only. A path that does not
    # contain it (e.g. this cell was already run once) is kept unchanged instead
    # of raising IndexError, which makes the cell safe to re-run.
    pieces = path.split(OLD_BOW_PREFIX, 1)
    if len(pieces) == 2:
        path = BOW_DIR + pieces[1]
    bow_path_by_artist_new.append((artist_id, path, year))

bow_path_by_artist = bow_path_by_artist_new

In [8]:
# # Add a column with a ".npy" extension
# songs['name_ext'] = songs['name_no_ext'].apply(lambda x: (x + '.npy')).str.decode('utf-8')

# # list of (artist_id, path, year) tuples
# bow_path_by_artist = []

# ids_in_songs = set(songs['artist_id'])

# for artist_id in tqdm_notebook(os.listdir(BOW_DIR)):
#     song_filename = os.listdir(BOW_DIR + artist_id)[0].decode('utf-8')
    
#     # Check if song year is missing or artist is missing from song df
#     if int(artist_id) in ids_in_songs and songs[songs['name_ext'] == song_filename]['year'].iloc[0] != 0:
#         # save (artist_id, path, year) tuple
#         bow_path_by_artist.append((int(artist_id), BOW_DIR + artist_id + '/', songs[songs['name_ext'] == song_filename]['year'].iloc[0]))

# print "Number of songs:", len(bow_path_by_artist)

# Order list by year.
# The sort key is the 'year' of the first row in `songs` for each artist_id,
# exactly as before, but the artist -> year table is built once instead of
# filtering the whole DataFrame for every list element.
# NOTE(review): the ordering uses the year from `songs`, while the counts below
# use the year stored in each tuple; time_seq only describes contiguous slices
# of the sorted list if the two agree -- confirm.
first_song_rows = songs.drop_duplicates(subset='artist_id')
first_year_by_artist = dict(zip(first_song_rows['artist_id'], first_song_rows['year']))
bow_path_by_artist.sort(key=lambda entry: first_year_by_artist[entry[0]])

# Create counter for number of songs for each year.
# A single pass with dict.get works on Python 2 and 3 and on an empty list,
# unlike indexing the result of zip().
year_counter = {}
for _, _, song_year in bow_path_by_artist:
    song_year = int(song_year)
    year_counter[song_year] = year_counter.get(song_year, 0) + 1

sorted_years = sorted(year_counter)

# Lookup table for time_slice index v. year
time_slice_dict = dict(enumerate(sorted_years))
# List of counts for each time slice for DIM
time_seq = [year_counter[slice_year] for slice_year in sorted_years]

# Single-argument print calls produce the same text on Python 2 and Python 3.
print("Count of artists per each time slice:")
print(time_seq)

Count of artists per each time slice:
[1, 1, 1, 1, 3, 1, 1, 1, 5, 3, 2, 2, 1, 2, 2, 1, 1, 2, 2, 5, 2, 2, 7, 2, 5, 5, 5, 6, 4, 6, 11, 15, 30, 42, 37, 64, 44, 44, 42, 52, 62, 49, 84, 73, 100, 113, 117, 112, 94, 135, 103, 133, 126, 131, 112, 141, 152, 167, 155, 156, 160, 141, 164, 171, 172, 182, 175, 194, 211, 203, 266, 272, 324, 360, 361, 431, 361, 382, 410, 407, 444, 440, 407, 420, 406, 410, 318, 350, 272, 299, 377, 390, 277, 264, 188, 3]


# Get most influential songs for each topic

In [9]:
# Flatten the model's per-time-slice influence scores into one list of rows
# (one row per song, one entry per topic), ordered by time slice and then by
# the song's position inside that slice.
topic_indices = range(dim_model.num_topics)

song_topic_inf_matrix = [
    [dim_model.influences_time[slice_no][doc_no][topic_no] for topic_no in topic_indices]
    for slice_no, slice_size in enumerate(dim_model.time_slices)
    for doc_no in range(slice_size)
]

In [10]:
# Get the indices of the most influential songs per topic
# (argmax down each topic column of the song x topic influence matrix).
most_inf_idx = np.array(song_topic_inf_matrix).argmax(axis=0)

# The print statements were Python-2-only; single-argument print(...) calls emit
# the same text on Python 2 and also run on Python 3.
for topic_no, artist_idx in enumerate(most_inf_idx):
    # Rows of the matrix are assumed to line up with bow_path_by_artist, so the
    # row index doubles as an index into that list.
    top_artist_id, top_bow_dir = bow_path_by_artist[artist_idx][:2]

    print("Topic: {}".format(topic_no))
    # Lookup artist name
    print(artists[artists['id'] == top_artist_id]['name'].iloc[0])
    # Lookup name of sample (first directory entry, in os.listdir order)
    print(os.listdir(top_bow_dir)[0])

    print("")

Topic: 0
Don Chezina
0_Por Mi Reggae Muero.npy

Topic: 1
Big Jaz
0_Jigga WhatFaint.npy

Topic: 2
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy

Topic: 3
Sioux Falls
0_CopyPaste.npy

Topic: 4
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy



# Get most influential songs per topic per epoch

In [11]:
# Slice song-topic matrix into list of submatrices keyed by time slice.
# A running offset replaces the previous copy-then-delete-from-the-front loop,
# which shifted the whole remaining list on every iteration and needed a
# throwaway copy of the matrix. The resulting slices are the same.
song_topic_inf_by_year = []
slice_start = 0

for num_in_slice in dim_model.time_slices:
    slice_stop = slice_start + num_in_slice
    song_topic_inf_by_year.append(song_topic_inf_matrix[slice_start:slice_stop])
    slice_start = slice_stop

In [12]:
# Position of the current time slice's first song within the flat song list.
# argmax indices are relative to the slice, so this offset converts them back to
# positions in bow_path_by_artist. It is accumulated instead of re-summing the
# slice sizes on every iteration.
slice_offset = 0

# The print statements were Python-2-only; single-argument print(...) calls emit
# the same text on Python 2 and also run on Python 3.
for idx, song_topic_matrix in enumerate(song_topic_inf_by_year):
    print("{} : {} songs".format(time_slice_dict[idx], time_seq[idx]))
    print("===============================")

    # Get the indices of the most influential songs per topic
    # Note: Need to adjust indexing since index resets to 0 for each new year
    most_inf_idx = np.array(song_topic_matrix).argmax(axis=0) + slice_offset
    slice_offset += dim_model.time_slices[idx]

    for topic_no, artist_idx in enumerate(most_inf_idx):
        top_artist_id, top_bow_dir = bow_path_by_artist[artist_idx][:2]
        # Filter the artists frame once and reuse the row for name and genre
        # (previously the same filter ran twice per topic).
        artist_row = artists[artists['id'] == top_artist_id].iloc[0]

        print("Topic: {}".format(topic_no))
        # Lookup artist name
        print(artist_row['name'])
        # Lookup name of sample (first directory entry, in os.listdir order)
        print(os.listdir(top_bow_dir)[0])
        print(artist_row['main_genre'])
        print("")

1912 : 1 songs
Topic: 0
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 1
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 2
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 3
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 4
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

1916 : 1 songs
Topic: 0
Sioux Falls
0_CopyPaste.npy
Pop/Rock

Topic: 1
Sioux Falls
0_CopyPaste.npy
Pop/Rock

Topic: 2
Sioux Falls
0_CopyPaste.npy
Pop/Rock

Topic: 3
Sioux Falls
0_CopyPaste.npy
Pop/Rock

Topic: 4
Sioux Falls
0_CopyPaste.npy
Pop/Rock

1919 : 1 songs
Topic: 0
Joe Morello
0_Blue Rondo à la Turk.npy
Jazz

Topic: 1
Joe Morello
0_Blue Rondo à la Turk.npy
Jazz

Topic: 2
Joe Morello
0_Blue Rondo à la Turk.npy
Jazz

Topic: 3
Joe Morello
0_Blue Rondo à la Turk.npy
Jazz

Topic: 4
Joe Morello
0_Blue Rondo à la Turk.npy
Jazz

1921 : 1 s

Topic: 3
Allan Sherman
0_Hello Muddah, Hello Fadduh! (A Letter from Camp).npy
Comedy/Spoken

Topic: 4
Perry Blake
0_Graines d'Etoiles.npy
Pop/Rock

1946 : 2 songs
Topic: 0
Pedro Infante
0_Cien Años.npy
Latin

Topic: 1
Pedro Infante
0_Cien Años.npy
Latin

Topic: 2
Token Entry
0_Lucky Seven.npy
Pop/Rock

Topic: 3
Pedro Infante
0_Cien Años.npy
Latin

Topic: 4
Pedro Infante
0_Cien Años.npy
Latin

1947 : 5 songs
Topic: 0
2 Live Jews
0_The Hustle (Parody: The Herschel).npy
Comedy/Spoken

Topic: 1
2 Live Jews
0_The Hustle (Parody: The Herschel).npy
Comedy/Spoken

Topic: 2
Moxie Raia
0_Follow Me.npy
Pop/Rock

Topic: 3
Liam Clancy
0_Jig.npy
International

Topic: 4
2 Live Jews
0_The Hustle (Parody: The Herschel).npy
Comedy/Spoken

1948 : 5 songs
Topic: 0
Zurdok
0_Estático.npy
Latin

Topic: 1
Billy Mitchell
0_Long Notes Blue (Here Is Cecco Beppe).npy
Jazz

Topic: 2
Denali
0_Cuz I'm Ballin.npy
Pop/Rock

Topic: 3
Tom Jobim
0_Águas De Março.npy
Latin

Topic: 4
Zurdok
0_Estático.npy
Latin

19

Topic: 0
Sandie Shaw
0_(There's) Always Something There to Remind Me.npy
Pop/Rock

Topic: 1
The Angels
0_My Boyfriend's Back.npy
Pop/Rock

Topic: 2
The Angels
0_My Boyfriend's Back.npy
Pop/Rock

Topic: 3
The Angels
0_My Boyfriend's Back.npy
Pop/Rock

Topic: 4
Sandie Shaw
0_(There's) Always Something There to Remind Me.npy
Pop/Rock

1973 : 103 songs
Topic: 0
The Statler Brothers
0_Flowers on the Wall.npy
Country

Topic: 1
Joe Hinton
0_Funny (How Time Slips Away).npy
R&B;

Topic: 2
The Knickerbockers
0_Lies.npy
Pop/Rock

Topic: 3
Gil Fuller
0_Be's That Way.npy
Jazz

Topic: 4
Jon Hendricks
0_Fire in the City.npy
Vocal

1974 : 133 songs
Topic: 0
Herman's Hermits
0_There's a Kind of Hush (All Over the World).npy
Pop/Rock

Topic: 1
The Herd
0_From the Underworld.npy
Pop/Rock

Topic: 2
Tammi Terrell
0_If I Could Build My Whole World Around You.npy
R&B;

Topic: 3
O.V. Wright
0_Eight Men, Four Women.npy
R&B;

Topic: 4
Ray Price
0_I'm Still Not over You.npy
Country

1975 : 126 songs
Topic: 0
Rhi

Topic: 1
Rahsaan Patterson
0_Tears on My Pillow.npy
R&B;

Topic: 2
Static-X
0_Push It.npy
Pop/Rock

Topic: 3
Nitin Sawhney
0_Homelands.npy
Electronic

Topic: 4
Jesse Powell
0_Tears on My Pillow.npy
R&B;

2001 : 410 songs
Topic: 0
Admiral Bailey
0_Punanny Medley.npy
Reggae

Topic: 1
Otis Clay
0_The Weight.npy
R&B;

Topic: 2
Front Line Assembly
0_New Year's Day.npy
Pop/Rock

Topic: 3
Tevin Campbell
0_Everything.npy
R&B;

Topic: 4
Lucy Pearl
0_You.npy
R&B;

2002 : 407 songs
Topic: 0
The Presidents of the United States of America
0_Nuthin But Luv.npy
Pop/Rock

Topic: 1
Groove Collective
0_Stargazer.npy
R&B;

Topic: 2
Army of Freshmen
0_Adeline.npy
Pop/Rock

Topic: 3
Big Bill Morganfield
0_You're Gonna Miss Me.npy
Blues

Topic: 4
Groove Collective
0_Stargazer.npy
R&B;

2003 : 444 songs
Topic: 0
Ruins
0_Fredmans Session 2.npy
Pop/Rock

Topic: 1
Morgan Geist
0_Ghost Trains.npy
Electronic

Topic: 2
Paul Gilbert
0_Let the Computer Decide.npy
Pop/Rock

Topic: 3
Morgan Geist
0_Ghost Trains.npy
El

# Correlation with AllMusic Influence Graph Degree

In [13]:
# Calculate max DIM influence per artist (largest score across topics in each
# row of the song x topic influence matrix)
max_dim_influences = np.array(song_topic_inf_matrix).max(axis=1)

# Get list of artist outdegrees in same order
artist_ids_ordered = [entry[0] for entry in bow_path_by_artist]

# Build the id -> outdegree table once instead of filtering the whole artists
# frame for every id. drop_duplicates keeps the first row per id, matching the
# previous `.iloc[0]` choice. An id missing from `artists` still fails loudly
# (KeyError here, IndexError before).
first_artist_rows = artists.drop_duplicates(subset='id')
outdegree_by_artist = dict(zip(first_artist_rows['id'], first_artist_rows['outdegree']))

# `artist_id` instead of `id`, so the builtin id() is not shadowed.
outdegrees_ordered = [outdegree_by_artist[artist_id] for artist_id in artist_ids_ordered]

In [14]:
# Calculate correlation
# Spearman rank correlation between each row's maximum DIM influence and the
# corresponding artist's 'outdegree' from the artists table.
# NOTE(review): this pairs row i of the influence matrix with entry i of
# bow_path_by_artist -- assumes the model's document order matches that list; confirm.
# Left as a bare expression so the notebook displays the result.
spearmanr(max_dim_influences, outdegrees_ordered)

SpearmanrResult(correlation=0.045207863759497452, pvalue=1.6639959398157967e-07)

# TODO: Baseline

# TODO: Breakdown by Genre