In [6]:
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
import numpy as np
import pandas as pd
import os
from copy import copy
from collections import OrderedDict
from scipy.stats import spearmanr
from tqdm import tqdm_notebook
import pickle

In [2]:
# Location of the saved DtmModel that is loaded below.
MODEL_PATH = '../models/dim_bow500_5topics_firsttrack_songreleasedate'
# Root directory of the bag-of-words features. It ends with '/' because
# paths under it are built by plain string concatenation.
BOW_DIR = '../data/features/bow_500/'

In [3]:
# Load the saved DtmModel from MODEL_PATH. Later cells read its
# time_slices, num_topics and influences_time attributes.
dim_model = DtmModel.load(MODEL_PATH)

In [4]:
# Load artist info. Later cells read the 'id', 'name', 'main_genre' and
# 'outdegree' columns of this frame.
artists = pd.read_csv('../data/allmusic/artists_cleaned.csv')

In [5]:
# Load song info. The 'artist_id' and 'year' columns are used below to
# order artists by year.
songs = pd.read_csv('../data/artist_song_list_years_cleaned.csv')

In [7]:
# Load bow_path_by_artist: a list of (artist_id, path, year) tuples.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
# The context manager closes the file handle once loading is done.
with open('../models/bow500_path_by_artist.pk', 'rb') as pickle_file:
    bow_path_by_artist = pickle.load(pickle_file)

In [13]:
# Re-root every stored bag-of-words path from the location it was pickled
# with onto the local BOW_DIR. Entries are (artist_id, path, year) tuples.
OLD_BOW_ROOT = '/n/regal/rush_lab/xue/bow_500/'

bow_path_by_artist_new = []

for (artist_id, path, year) in bow_path_by_artist:
    # Only rewrite paths that still carry the old prefix, so that running
    # this cell a second time is a no-op instead of an IndexError.
    if OLD_BOW_ROOT in path:
        path = BOW_DIR + path.split(OLD_BOW_ROOT)[1]
    bow_path_by_artist_new.append((artist_id, path, year))

bow_path_by_artist = bow_path_by_artist_new

In [15]:
# # Add a column with a ".npy" extension
# songs['name_ext'] = songs['name_no_ext'].apply(lambda x: (x + '.npy')).str.decode('utf-8')

# # list of (artist_id, path, year) tuples
# bow_path_by_artist = []

# ids_in_songs = set(songs['artist_id'])

# for artist_id in tqdm_notebook(os.listdir(BOW_DIR)):
#     song_filename = os.listdir(BOW_DIR + artist_id)[0].decode('utf-8')
    
#     # Check if song year is missing or artist is missing from song df
#     if int(artist_id) in ids_in_songs and songs[songs['name_ext'] == song_filename]['year'].iloc[0] != 0:
#         # save (artist_id, path, year) tuple
#         bow_path_by_artist.append((int(artist_id), BOW_DIR + artist_id + '/', songs[songs['name_ext'] == song_filename]['year'].iloc[0]))

# print "Number of songs:", len(bow_path_by_artist)

# Order list by year.
# The sort key is the 'year' of the first row in `songs` for each artist.
# It is looked up once per artist here instead of re-filtering the whole
# DataFrame for every element being sorted.
# NOTE(review): the per-year counts below use the year stored in each tuple
# (x[2]) rather than this lookup; the two need to agree for time_seq to line
# up with the sorted order -- confirm.
first_year_by_artist = (
    songs.drop_duplicates(subset='artist_id')
         .set_index('artist_id')['year']
         .to_dict()
)
bow_path_by_artist.sort(key=lambda x: first_year_by_artist[x[0]])

# Create counter for number of songs for each year.
# A list comprehension replaces zip(*...)[2], which only works on Python 2
# and fails on an empty list.
years = [entry[2] for entry in bow_path_by_artist]
year_counter = {int(k): 0 for k in np.unique(years)}

for year in years:
    year_counter[year] += 1

# Lookup table for time_slice index v. year
sorted_years = sorted(year_counter)
time_slice_dict = dict(enumerate(sorted_years))
# List of counts for each time slice for DIM
time_seq = [year_counter[year] for year in sorted_years]

# Single-argument print(...) produces the same output on Python 2 and 3.
print("Count of artists per each time slice:")
print(time_seq)

Count of artists per each time slice:
[1, 1, 1, 1, 3, 1, 1, 1, 5, 3, 2, 2, 1, 2, 2, 1, 1, 2, 2, 5, 2, 2, 7, 2, 5, 5, 5, 6, 4, 6, 11, 15, 30, 42, 37, 64, 44, 44, 42, 52, 62, 49, 84, 73, 100, 113, 117, 112, 94, 135, 103, 133, 126, 131, 112, 141, 152, 167, 155, 156, 160, 141, 164, 171, 172, 182, 175, 194, 211, 203, 266, 272, 324, 360, 361, 431, 361, 382, 410, 407, 444, 440, 407, 420, 406, 410, 318, 350, 272, 299, 377, 390, 277, 264, 188, 3]


# Get most influential songs for each topic

In [17]:
# Flatten the per-time-slice influence scores into one list of rows:
# one row per song (in time-slice order), one entry per topic.
song_topic_inf_matrix = [
    [dim_model.influences_time[slice_no][doc_no][topic_no]
     for topic_no in range(dim_model.num_topics)]
    for slice_no, docs_in_slice in enumerate(dim_model.time_slices)
    for doc_no in range(docs_in_slice)
]

In [18]:
# Get the indices of the most influential songs per topic
# (argmax down each topic column of the song-by-topic matrix).
most_inf_idx = np.array(song_topic_inf_matrix).argmax(axis=0)

for topic_no, artist_idx in enumerate(most_inf_idx):
    # print(...) with a single pre-formatted argument gives the same output
    # on Python 2 and Python 3.
    print("Topic: %s" % topic_no)
    artist_id, bow_dir = bow_path_by_artist[artist_idx][:2]
    # Lookup artist name
    print(artists[artists['id'] == artist_id]['name'].iloc[0])
    # Lookup name of sample (first entry that os.listdir returns)
    print(os.listdir(bow_dir)[0])

    print("")

Topic: 0
Clyde Stubblefield
0_I Don't Want Nobody to Give Me Nothin'.npy

Topic: 1
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy

Topic: 2
Big Jaz
0_Jigga WhatFaint.npy

Topic: 3
Big Jaz
0_Jigga WhatFaint.npy

Topic: 4
Sioux Falls
0_CopyPaste.npy



# Get most influential songs per topic per epoch

In [19]:
# Split the flat song-topic matrix into consecutive chunks, one per time
# slice, each holding as many rows as that slice has songs.
song_topic_inf_by_year = []
slice_start = 0

for num_in_slice in dim_model.time_slices:
    slice_end = slice_start + num_in_slice
    song_topic_inf_by_year.append(song_topic_inf_matrix[slice_start:slice_end])
    slice_start = slice_end

In [20]:
# For every time slice, print the most influential song of each topic.
# print(...) is always called with one pre-formatted argument so the output
# is identical on Python 2 and Python 3.

# Position in bow_path_by_artist of the first song of the current slice.
slice_offset = 0

for idx, song_topic_matrix in enumerate(song_topic_inf_by_year):
    print("%s : %s songs" % (time_slice_dict[idx], time_seq[idx]))
    print("===============================")

    # Get the indices of the most influential songs per topic
    # Note: Need to adjust indexing since index resets to 0 for each new year
    most_inf_idx = np.array(song_topic_matrix).argmax(axis=0) + slice_offset

    for topic_no, artist_idx in enumerate(most_inf_idx):
        print("Topic: %s" % topic_no)
        artist_id, bow_dir = bow_path_by_artist[artist_idx][:2]
        # Filter the artists table once and reuse the row for both fields
        artist_row = artists[artists['id'] == artist_id].iloc[0]
        # Lookup artist name
        print(artist_row['name'])
        # Lookup name of sample
        print(os.listdir(bow_dir)[0])
        print(artist_row['main_genre'])
        print("")

    # Running total replaces sum(dim_model.time_slices[:idx]) per iteration
    slice_offset += dim_model.time_slices[idx]

1912 : 1 songs
Topic: 0
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 1
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 2
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 3
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 4
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

1916 : 1 songs
Topic: 0
Sioux Falls
0_CopyPaste.npy
Pop/Rock

Topic: 1
Sioux Falls
0_CopyPaste.npy
Pop/Rock

Topic: 2
Sioux Falls
0_CopyPaste.npy
Pop/Rock

Topic: 3
Sioux Falls
0_CopyPaste.npy
Pop/Rock

Topic: 4
Sioux Falls
0_CopyPaste.npy
Pop/Rock

1919 : 1 songs
Topic: 0
Joe Morello
0_Blue Rondo à la Turk.npy
Jazz

Topic: 1
Joe Morello
0_Blue Rondo à la Turk.npy
Jazz

Topic: 2
Joe Morello
0_Blue Rondo à la Turk.npy
Jazz

Topic: 3
Joe Morello
0_Blue Rondo à la Turk.npy
Jazz

Topic: 4
Joe Morello
0_Blue Rondo à la Turk.npy
Jazz

1921 : 1 s

North Mississippi Allstars
0_Blood on That Rock.npy
Pop/Rock

1950 : 6 songs
Topic: 0
U-Roy
0_Wear You to the Ball.npy
Reggae

Topic: 1
Cut Chemist
0_Luv U Better.npy
Rap

Topic: 2
Streetwize
0_Don't Say Goodnight (It's Time for Love).npy
Jazz

Topic: 3
Cut Chemist
0_Luv U Better.npy
Rap

Topic: 4
King Leg
0_Comfy ChairA Dream That Never Ends.npy
Pop/Rock

1951 : 4 songs
Topic: 0
David Grier
0_Porkchops & Applesauce.npy
Country

Topic: 1
Ari Up
0_Chrome Optimism -[ Oxygen Part 4 Dub].npy
Pop/Rock

Topic: 2
David Grier
0_Porkchops & Applesauce.npy
Country

Topic: 3
Ari Up
0_Chrome Optimism -[ Oxygen Part 4 Dub].npy
Pop/Rock

Topic: 4
Ari Up
0_Chrome Optimism -[ Oxygen Part 4 Dub].npy
Pop/Rock

1952 : 6 songs
Topic: 0
I Am the World Trade Center
0_Shoot You Down.npy
Pop/Rock

Topic: 1
Javen
0_Wipe Away My Tears.npy
R&B;

Topic: 2
Philharmonie
0_Hannibal À Capoue (Hannibal at Capoue).npy
Pop/Rock

Topic: 3
Tommy Smith
0_Independency, Pt. 1.npy
Jazz

Topic: 4
I Am the World Trade Center
0

Highway 101
0_Young Country.npy
Country

1980 : 167 songs
Topic: 0
Shirley Caesar
0_No Charge.npy
Religious

Topic: 1
Cornell Campbell
0_No Good Girl.npy
Reggae

Topic: 2
Cornell Campbell
0_No Good Girl.npy
Reggae

Topic: 3
Cornell Campbell
0_No Good Girl.npy
Reggae

Topic: 4
Be Bop Deluxe
0_Ships in the Night.npy
Pop/Rock

1981 : 155 songs
Topic: 0
Bob Marley
0_Keep on Movin'.npy
Reggae

Topic: 1
Cameo
0_In the Night.npy
R&B;

Topic: 2
Cameo
0_In the Night.npy
R&B;

Topic: 3
Sherbet
0_Summer Love.npy
Pop/Rock

Topic: 4
Donna Fargo
0_Jingle Bells.npy
Country

1982 : 156 songs
Topic: 0
Johnny Paycheck
0_Maybellene.npy
Country

Topic: 1
Sun
0_Sun Is Here.npy
R&B;

Topic: 2
Stuff
0_Signed, Sealed, Delivered I'm Yours.npy
R&B;

Topic: 3
Riders in the Sky
0_Back in the Saddle Again.npy
Country

Topic: 4
Lit
0_Borrowed Time.npy
Pop/Rock

1983 : 160 songs
Topic: 0
Stevie Wonder
0_One Little Christmas Tree.npy
R&B;

Topic: 1
Chris LeDoux
0_Copenhagen.npy
Country

Topic: 2
Paul Williams
0_Rainb

0_Bright Lights, Big City.npy
Blues

Topic: 1
Meghan Trainor
0_All About That Bass [Radio Disney Version].npy
Pop/Rock

Topic: 2
Ana Moura
0_Dream of Fire.npy
International

Topic: 3
Elvis Perkins
0_Moon Woman, Pt. 2.npy
Pop/Rock

Topic: 4
Joe Hertler & the Rainbow Seekers
0_King is Dead.npy
Pop/Rock

2016 : 264 songs
Topic: 0
Boys Like Girls
0_The Great Escape.npy
Pop/Rock

Topic: 1
James Supercave
0_Burn.npy
Pop/Rock

Topic: 2
Eddie Bo
0_St. James Infirmary.npy
R&B;

Topic: 3
Joshua Redman
0_Ornithology.npy
Jazz

Topic: 4
The Dead Ships
0_Canyon.npy
Pop/Rock

2017 : 188 songs
Topic: 0
Dan Hicks & His Hot Licks
0_Strike It While It's Hot.npy
Pop/Rock

Topic: 1
Starrah
0_Imperfections.npy
R&B;

Topic: 2
The Frightnrs
0_Nothing More to Say.npy
Pop/Rock

Topic: 3
Terry Reid
0_Listen.npy
Pop/Rock

Topic: 4
Esmé Patterson
0_Call It Heaven.npy
Pop/Rock

2018 : 3 songs
Topic: 0
The Black Dahlia Murder
0_Catacomb Hecatomb.npy
Pop/Rock

Topic: 1
The Black Dahlia Murder
0_Catacomb Hecatomb.npy


# Correlation with AllMusic Influence Graph Degree

In [36]:
# Calculate max DIM influence per artist (row-wise maximum over topics)
max_dim_influences = np.array(song_topic_inf_matrix).max(axis=1)

# Get list of artist outdegrees in same order
artist_ids_ordered = [t[0] for t in bow_path_by_artist]

# Map each artist id to the 'outdegree' of its first row in `artists`, built
# once so the DataFrame is not re-filtered for every artist.
outdegree_by_artist = (
    artists.drop_duplicates(subset='id')
           .set_index('id')['outdegree']
           .to_dict()
)
outdegrees_ordered = [outdegree_by_artist[artist_id]
                      for artist_id in artist_ids_ordered]

In [37]:
# Calculate the Spearman rank correlation between each artist's maximum DIM
# influence and their 'outdegree' from the artists table. The two sequences
# are aligned on the order of bow_path_by_artist.
spearmanr(max_dim_influences, outdegrees_ordered)

SpearmanrResult(correlation=0.050810378893831672, pvalue=4.0342621408735467e-09)

# TODO: Baseline

# TODO: Breakdown by Genre