In [1]:
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
import numpy as np
import pandas as pd
import os
from copy import copy
from collections import OrderedDict
from scipy.stats import spearmanr
from tqdm import tqdm_notebook
import pickle

In [2]:
# Location of the saved model that is read back with DtmModel.load().
MODEL_PATH = '../models/dim_bow500_10topics_firsttrack_songreleasedate'
# Directory prefix for the per-artist bag-of-words folders. Paths are built by
# plain string concatenation with this value, so the trailing slash matters.
BOW_DIR = '../data/features/bow_500/'

In [3]:
# Load the saved model from MODEL_PATH. The rest of the notebook reads its
# `time_slices`, `num_topics` and `influences_time` attributes.
dim_model = DtmModel.load(MODEL_PATH)

In [4]:
# Load artist info. The notebook reads the 'id', 'name', 'main_genre' and
# 'outdegree' columns of this table.
artists = pd.read_csv('../data/allmusic/artists_cleaned.csv')

In [5]:
# Load song info. The 'artist_id' and 'year' columns are what the notebook
# uses to put the artists in chronological order.
songs = pd.read_csv('../data/artist_song_list_years_cleaned.csv')

In [6]:
# Load bow_path_by_artist: a pickled list of (artist_id, path, year) tuples.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
# The `with` block guarantees the file handle is closed; the previous bare
# open() call left it open until garbage collection.
with open('../models/bow500_path_by_artist.pk', 'rb') as pickle_file:
    bow_path_by_artist = pickle.load(pickle_file)

In [7]:
# Re-root the pickled paths, which still point at the directory the features
# were originally written to, onto the local BOW_DIR.
ORIGINAL_BOW_ROOT = '/n/regal/rush_lab/xue/bow_500/'

bow_path_by_artist_new = []

# The loop variable is `artist_id` rather than `id`, so the builtin id() is
# no longer shadowed for every later cell.
for (artist_id, path, year) in bow_path_by_artist:
    if ORIGINAL_BOW_ROOT in path:
        # Keep whatever followed the old root and prepend the local directory.
        path = BOW_DIR + path.split(ORIGINAL_BOW_ROOT)[1]
    elif not path.startswith(BOW_DIR):
        # Neither the old root nor the new one: fail loudly with a clear
        # message instead of the bare IndexError the split used to raise.
        raise ValueError('Unexpected bag-of-words path: %r' % (path,))
    # Paths already under BOW_DIR pass through unchanged, which makes
    # re-running this cell a no-op instead of a crash.
    bow_path_by_artist_new.append((artist_id, path, year))

bow_path_by_artist = bow_path_by_artist_new

In [8]:
# NOTE(review): the commented-out code below is kept for provenance; it looks
# like the way the pickled (artist_id, path, year) list was originally built
# -- TODO confirm, or delete it once that is documented elsewhere.

# # Add a column with a ".npy" extension
# songs['name_ext'] = songs['name_no_ext'].apply(lambda x: (x + '.npy')).str.decode('utf-8')

# # list of (artist_id, path, year) tuples
# bow_path_by_artist = []

# ids_in_songs = set(songs['artist_id'])

# for artist_id in tqdm_notebook(os.listdir(BOW_DIR)):
#     song_filename = os.listdir(BOW_DIR + artist_id)[0].decode('utf-8')

#     # Check if song year is missing or artist is missing from song df
#     if int(artist_id) in ids_in_songs and songs[songs['name_ext'] == song_filename]['year'].iloc[0] != 0:
#         # save (artist_id, path, year) tuple
#         bow_path_by_artist.append((int(artist_id), BOW_DIR + artist_id + '/', songs[songs['name_ext'] == song_filename]['year'].iloc[0]))

# print "Number of songs:", len(bow_path_by_artist)

# Order list by year.
# The sort key is the year of each artist's FIRST row in `songs`. It is looked
# up once per artist here instead of filtering the whole DataFrame inside the
# key function for every element; the resulting order is the same.
# NOTE(review): this key may differ from the year stored in each tuple (used
# for the counts below) -- confirm both agree with how the model was trained.
first_song_rows = songs.drop_duplicates(subset='artist_id')
first_year_by_artist = dict(zip(first_song_rows['artist_id'], first_song_rows['year']))
bow_path_by_artist.sort(key=lambda entry: first_year_by_artist[entry[0]])

# Create counter for number of songs for each year (taken from the tuples).
# Built in a single pass; no subscripted zip(), which only works on Python 2,
# and no loop variable named `id`, which shadowed the builtin.
year_counter = {}
for _artist_id, _bow_path, song_year in bow_path_by_artist:
    year_key = int(song_year)
    year_counter[year_key] = year_counter.get(year_key, 0) + 1

years_sorted = sorted(year_counter)

# Lookup table for time_slice index v. year
time_slice_dict = dict(enumerate(years_sorted))
# List of counts for each time slice for DIM
time_seq = [year_counter[year_key] for year_key in years_sorted]

# Single-argument print(...) produces identical output on Python 2 and 3.
print("Count of artists per each time slice:")
print(time_seq)

Count of artists per each time slice:
[1, 1, 1, 1, 3, 1, 1, 1, 5, 3, 2, 2, 1, 2, 2, 1, 1, 2, 2, 5, 2, 2, 7, 2, 5, 5, 5, 6, 4, 6, 11, 15, 30, 42, 37, 64, 44, 44, 42, 52, 62, 49, 84, 73, 100, 113, 117, 112, 94, 135, 103, 133, 126, 131, 112, 141, 152, 167, 155, 156, 160, 141, 164, 171, 172, 182, 175, 194, 211, 203, 266, 272, 324, 360, 361, 431, 361, 382, 410, 407, 444, 440, 407, 420, 406, 410, 318, 350, 272, 299, 377, 390, 277, 264, 188, 3]


# Get most influential songs for each topic

In [9]:
# Flatten the model's per-time-slice influence scores into a single
# (num_songs, num_topics) list of rows. Rows are ordered by time slice first
# and by position inside the slice second.
n_topics = dim_model.num_topics

song_topic_inf_matrix = [
    [dim_model.influences_time[slice_no][doc_no][topic_no] for topic_no in range(n_topics)]
    for slice_no, slice_size in enumerate(dim_model.time_slices)
    for doc_no in range(slice_size)
]

In [10]:
# For every topic (matrix column) find the row holding the largest influence
# score; that row number is also the position in bow_path_by_artist.
most_inf_idx = np.argmax(np.array(song_topic_inf_matrix), axis=0)

for topic_no, row_idx in enumerate(most_inf_idx):
    print("Topic: %s" % topic_no)

    top_artist_id, top_bow_dir = bow_path_by_artist[row_idx][:2]

    # Artist name: first row of `artists` whose id matches
    print(artists.loc[artists['id'] == top_artist_id, 'name'].iloc[0])
    # Sample name: first entry listed in the artist's bag-of-words folder
    print(os.listdir(top_bow_dir)[0])

    print("")

Topic: 0
Big Jaz
0_Jigga WhatFaint.npy

Topic: 1
Big Jaz
0_Jigga WhatFaint.npy

Topic: 2
Sophie Milman
0_(Getting Some) Fun Out of Life.npy

Topic: 3
Carlinhos Brown
0_Passe Em Casa.npy

Topic: 4
Clyde Stubblefield
0_I Don't Want Nobody to Give Me Nothin'.npy

Topic: 5
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy

Topic: 6
Clyde Stubblefield
0_I Don't Want Nobody to Give Me Nothin'.npy

Topic: 7
Sioux Falls
0_CopyPaste.npy

Topic: 8
Don Chezina
0_Por Mi Reggae Muero.npy

Topic: 9
Half Japanese
0_After Mendelssohn (137 Years)Paint It BlackArthur's ....npy



# Get most influential songs per topic per epoch

In [11]:
# Slice the song-topic matrix into a list of submatrices, one per time slice
# (same order as dim_model.time_slices).
# Offsets into the original list are used rather than copying it and deleting
# consumed rows from the front: each `del lst[:k]` shifts the whole remainder,
# and the copy existed only to be destroyed.
song_topic_inf_by_year = []
slice_start = 0

for num_in_slice in dim_model.time_slices:
    slice_end = slice_start + num_in_slice
    song_topic_inf_by_year.append(song_topic_inf_matrix[slice_start:slice_end])
    slice_start = slice_end

In [12]:
# Number of rows that belong to earlier time slices; updated at the end of
# every iteration so it always equals sum(dim_model.time_slices[:idx]).
slice_offset = 0

for idx, song_topic_matrix in enumerate(song_topic_inf_by_year):
    print("%s : %s songs" % (time_slice_dict[idx], time_seq[idx]))
    print("===============================")

    # argmax positions are relative to this slice (they restart at 0 for each
    # year), so shift them by the offset to index into bow_path_by_artist.
    most_inf_idx = np.array(song_topic_matrix).argmax(axis=0) + slice_offset

    for topic_no, row_idx in enumerate(most_inf_idx):
        print("Topic: %s" % topic_no)

        top_artist_id, top_bow_dir = bow_path_by_artist[row_idx][:2]
        # One filter of `artists` serves both the name and the genre lookup.
        matching_artists = artists[artists['id'] == top_artist_id]

        # Artist name
        print(matching_artists['name'].iloc[0])
        # Sample name: first entry listed in the artist's bag-of-words folder
        print(os.listdir(top_bow_dir)[0])
        # Artist's main genre
        print(matching_artists['main_genre'].iloc[0])
        print("")

    slice_offset += dim_model.time_slices[idx]

1912 : 1 songs
Topic: 0
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 1
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 2
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 3
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 4
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 5
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 6
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 7
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 8
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 9
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

1916 : 1 songs
Topic: 0
Sioux Falls
0_CopyPaste.npy
Pop/Rock

Topic: 1
Sioux Falls
0_CopyPaste.npy
Pop/Rock

Topic:

Topic: 2
Pedro Infante
0_Cien Años.npy
Latin

Topic: 3
Pedro Infante
0_Cien Años.npy
Latin

Topic: 4
Pedro Infante
0_Cien Años.npy
Latin

Topic: 5
Token Entry
0_Lucky Seven.npy
Pop/Rock

Topic: 6
Pedro Infante
0_Cien Años.npy
Latin

Topic: 7
Token Entry
0_Lucky Seven.npy
Pop/Rock

Topic: 8
Pedro Infante
0_Cien Años.npy
Latin

Topic: 9
Token Entry
0_Lucky Seven.npy
Pop/Rock

1947 : 5 songs
Topic: 0
Liam Clancy
0_Jig.npy
International

Topic: 1
Moxie Raia
0_Follow Me.npy
Pop/Rock

Topic: 2
2 Live Jews
0_The Hustle (Parody: The Herschel).npy
Comedy/Spoken

Topic: 3
2 Live Jews
0_The Hustle (Parody: The Herschel).npy
Comedy/Spoken

Topic: 4
2 Live Jews
0_The Hustle (Parody: The Herschel).npy
Comedy/Spoken

Topic: 5
Moxie Raia
0_Follow Me.npy
Pop/Rock

Topic: 6
2 Live Jews
0_The Hustle (Parody: The Herschel).npy
Comedy/Spoken

Topic: 7
2 Live Jews
0_The Hustle (Parody: The Herschel).npy
Comedy/Spoken

Topic: 8
Moxie Raia
0_Follow Me.npy
Pop/Rock

Topic: 9
Rickey Woodard
0_Rudolph the R

1959 : 44 songs
Topic: 0
The Congos
0_Fisherman.npy
Reggae

Topic: 1
Dabrye
0_The Plum BlossomGame OverBeej-N-Dem, Pt. 2Rock SteadyNTT DocomoGalaxy.npy
Rap

Topic: 2
Pete Nice
0_Fallin Spiders.npy
Rap

Topic: 3
Little Mike & the Tornadoes
0_What About Love.npy
Blues

Topic: 4
Pete Nice
0_Fallin Spiders.npy
Rap

Topic: 5
Little Mike & the Tornadoes
0_What About Love.npy
Blues

Topic: 6
Crisis
0_Mr. President.npy
Pop/Rock

Topic: 7
Laika & the Cosmonauts
0_Fadeaway.npy
Pop/Rock

Topic: 8
Oscar Brand
0_Rum a Dum Dum.npy
Folk

Topic: 9
Dabrye
0_The Plum BlossomGame OverBeej-N-Dem, Pt. 2Rock SteadyNTT DocomoGalaxy.npy
Rap

1960 : 44 songs
Topic: 0
The Christianaires
0_Stand Up.npy
Religious

Topic: 1
Oliver Morgan
0_Trick Bag.npy
R&B;

Topic: 2
The Royal Guardsmen
0_Imperial Bedroom.npy
Pop/Rock

Topic: 3
Chill Rob G.
0_PowerSign Off.npy
Rap

Topic: 4
The Royal Guardsmen
0_Imperial Bedroom.npy
Pop/Rock

Topic: 5
All Saints
0_Lady Marmalade.npy
Pop/Rock

Topic: 6
Hints
0_Foothills.npy
Pop/Ro

1977 : 112 songs
Topic: 0
Dennis Alcapone
0_Spanish Amigo.npy
Reggae

Topic: 1
Shel Silverstein
0_A Front Row Seat to Hear Ole Johnny Sing.npy
Folk

Topic: 2
Shel Silverstein
0_A Front Row Seat to Hear Ole Johnny Sing.npy
Folk

Topic: 3
Michael Nesmith
0_Joanne.npy
Pop/Rock

Topic: 4
Orquesta Aragón
0_Pare Cochero.npy
Latin

Topic: 5
Michael Nesmith
0_Joanne.npy
Pop/Rock

Topic: 6
Los del Rio
0_La Canto.npy
Latin

Topic: 7
The Jimi Hendrix Experience
0_Manic Depression.npy
Pop/Rock

Topic: 8
Shel Silverstein
0_A Front Row Seat to Hear Ole Johnny Sing.npy
Folk

Topic: 9
Shel Silverstein
0_A Front Row Seat to Hear Ole Johnny Sing.npy
Folk

1978 : 141 songs
Topic: 0
G. Love
0_Rainbow.npy
Pop/Rock

Topic: 1
Son Seals
0_We Wish You a Merry Christmas.npy
Blues

Topic: 2
G. Love
0_Rainbow.npy
Pop/Rock

Topic: 3
Thom Bell
0_You'll Never Get to Heaven (If You Break My Heart).npy
R&B;

Topic: 4
Dallas Frazier
0_The Laying on of Hands.npy
Country

Topic: 5
Groundhogs
0_Little Dreamer.npy
Blues

T

0_Short Fuse Blues.npy
Blues

Topic: 6
Swampwater
0_Freeborn Man.npy
Pop/Rock

Topic: 7
Dave Hole
0_Short Fuse Blues.npy
Blues

Topic: 8
Public Enemy
0_911 Is a Joke.npy
Rap

Topic: 9
Dave Hole
0_Short Fuse Blues.npy
Blues

1994 : 272 songs
Topic: 0
London
0_Ha Ha.npy
Pop/Rock

Topic: 1
Will Lee
0_Spherical.npy
Jazz

Topic: 2
Los Secretos
0_Ojos de Gata.npy
Latin

Topic: 3
Will Lee
0_Spherical.npy
Jazz

Topic: 4
Gem
0_Jump to the Groove.npy
International

Topic: 5
Joe Satriani
0_Hey Stoopid.npy
Pop/Rock

Topic: 6
Lee Allen
0_Everyday I Have the Blues.npy
R&B;

Topic: 7
London
0_Ha Ha.npy
Pop/Rock

Topic: 8
Roy Rogers
0_Rodeo Road.npy
Country

Topic: 9
Mad Cobra
0_Flex.npy
Reggae

1995 : 324 songs
Topic: 0
Meat Loaf
0_I'd Do Anything for Love (But I Won't Do That).npy
Pop/Rock

Topic: 1
Armando
0_Robotics.npy
Electronic

Topic: 2
Armando
0_Robotics.npy
Electronic

Topic: 3
Stan Ridgway
0_Lonely Town.npy
Pop/Rock

Topic: 4
Das EFX
0_Check Yo Self.npy
Rap

Topic: 5
Rorschach
0_Shanks.npy


Topic: 2
Los Tigres del Norte
0_Jefe De Jefes.npy
Latin

Topic: 3
Los Tigres del Norte
0_Jefe De Jefes.npy
Latin

Topic: 4
Swag
0_Do You Mind.npy
Electronic

Topic: 5
Brand New
0_Soco Amaretto Lime.npy
Pop/Rock

Topic: 6
Fat Pimp
0_I'm Gettin' Money.npy
Rap

Topic: 7
Los Tigres del Norte
0_Jefe De Jefes.npy
Latin

Topic: 8
Michael Giacchino
0_Super 8 Suite.npy
Stage & Screen

Topic: 9
Hugh Le Caine
0_Concertando Rubato.npy
Avant-Garde

2012 : 299 songs
Topic: 0
Bat for Lashes
0_Let's Get Lost.npy
Pop/Rock

Topic: 1
Frankie Lee
0_Bring It on Home to Me.npy
Blues

Topic: 2
Betty Wright
0_Look Around (Be a Man).npy
R&B;

Topic: 3
Twin Sister
0_Bad Street.npy
Pop/Rock

Topic: 4
J Dilla
0_E=MC2.npy
Rap

Topic: 5
MC Lars
0_Hot Topic Is Not Punk Rock.npy
Rap

Topic: 6
J Dilla
0_E=MC2.npy
Rap

Topic: 7
Michael Calfan
0_The Wave [Thomas Gold Remix Edit].npy
Electronic

Topic: 8
Milosh
0_Tailor-Made.npy
Pop/Rock

Topic: 9
J Dilla
0_E=MC2.npy
Rap

2013 : 377 songs
Topic: 0
Avicii
0_Wake Me Up [Ra

# Correlation with AllMusic Influence Graph Degree

In [17]:
# Calculate max DIM influence per artist (row-wise maximum over the topics)
max_dim_influences = np.array(song_topic_inf_matrix).max(axis=1)

# Get list of artist outdegrees in same order as the matrix rows
artist_ids_ordered = [entry[0] for entry in bow_path_by_artist]

# Build an id -> outdegree table once instead of filtering the whole
# DataFrame for every artist. drop_duplicates keeps the first row per id,
# matching the previous `.iloc[0]` lookup. This also avoids a loop variable
# named `id`, which shadowed the builtin.
first_artist_rows = artists.drop_duplicates(subset='id')
outdegree_by_artist = dict(zip(first_artist_rows['id'], first_artist_rows['outdegree']))

# An id missing from `artists` raises KeyError here (it was IndexError before).
outdegrees_ordered = [outdegree_by_artist[artist_id] for artist_id in artist_ids_ordered]

In [18]:
# Calculate the Spearman rank correlation between each artist's maximum DIM
# influence score and that artist's outdegree. Both sequences follow the order
# of bow_path_by_artist, so they are aligned element by element. The result
# is left as the last expression so the notebook displays it.
spearmanr(max_dim_influences, outdegrees_ordered)

SpearmanrResult(correlation=0.057220508652822306, pvalue=3.4469135135121979e-11)

# TODO: Baseline

# TODO: Breakdown by Genre