In [1]:
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
import numpy as np
import pandas as pd
import os
from copy import copy
from collections import OrderedDict
from scipy.stats import spearmanr

In [2]:
# Location of the saved model file that the next cell loads with DtmModel.load
MODEL_PATH = '../models/dim_bow500_10topics_firsttrack_artistactivedate'

In [3]:
# Load the previously saved model from MODEL_PATH.
# Later cells read its `time_slices`, `num_topics` and `influences_time` attributes.
# NOTE(review): gensim's load() unpickles the file, so only load model files
# from a trusted source.
dim_model = DtmModel.load(MODEL_PATH)

In [4]:
# Load the artist metadata table.
# Later cells read its 'id', 'active_start', 'name', 'main_genre' and
# 'outdegree' columns.
artists = pd.read_csv('../data/allmusic/artists_cleaned.csv')

In [5]:
# Directory with one bag-of-words sub-directory per artist, named by artist id
BOW_DIR = '../data/features/bow_500/'

# Build the id -> active_start lookup once instead of re-filtering the whole
# `artists` table for every directory entry (and again for every sort key).
active_start_by_id = dict(zip(artists['id'], artists['active_start']))

# List of (artist_id, path, active_start) tuples
bow_path_by_artist = []

for artist_dir in os.listdir(BOW_DIR):
    # Skip stray entries (e.g. hidden files) whose name is not an artist id
    if not artist_dir.isdigit():
        continue

    artist_id = int(artist_dir)
    # An id missing from the artist table raises KeyError: fail loudly rather
    # than silently shifting the document order used by the later cells.
    active_start = float(active_start_by_id[artist_id])

    # Artists without an active_start cannot be placed in a time slice
    if np.isnan(active_start):
        continue

    bow_path_by_artist.append((artist_id, BOW_DIR + artist_dir + '/', int(active_start)))

# Order list by active period start for artist. The year is already stored in
# each tuple, so no further table lookups are needed.
# NOTE(review): the sort is stable, so artists sharing a year keep the
# (arbitrary) os.listdir order; later cells index this list with model
# document indices, so that order presumably has to match the one used when
# the model was trained -- confirm.
bow_path_by_artist.sort(key=lambda entry: entry[2])

# Counter of songs per distinct active_start value found in the artist table
# (values with no matching BOW directory keep a count of 0)
decade_counter = {int(year): 0 for year in np.unique(artists['active_start'].dropna())}

for _, _, year in bow_path_by_artist:
    decade_counter[year] += 1

sorted_years = sorted(decade_counter)

# Lookup table for time_slice index v. decade
time_slice_dict = dict(enumerate(sorted_years))
# List of counts for each time slice for DIM
time_seq = [decade_counter[year] for year in sorted_years]

# Get most influential songs for each topic

In [6]:
# Flatten the per-time-slice influence scores into a single list of rows:
# one row per song (time slices in order, songs in order within a slice)
# and one entry per topic, i.e. a (num_songs, num_topics) nested list.
song_topic_inf_matrix = [
    [
        dim_model.influences_time[slice_no][doc_no][topic_no]
        for topic_no in range(dim_model.num_topics)
    ]
    for slice_no, docs_in_slice in enumerate(dim_model.time_slices)
    for doc_no in range(docs_in_slice)
]

In [7]:
# Get the indices of the most influential songs per topic
# (argmax over the song axis gives one row index per topic)
most_inf_idx = np.array(song_topic_inf_matrix).argmax(axis=0)

# NOTE(review): the row index is used directly as an index into
# bow_path_by_artist, which assumes one song per artist in the same order
# as the model's documents -- confirm against the training notebook.
for topic_no, artist_idx in enumerate(most_inf_idx):
    artist_id, bow_dir = bow_path_by_artist[artist_idx][:2]

    # Single-argument print(...) calls produce the same text on Python 2 and 3
    print("Topic: %d" % topic_no)
    # Lookup artist name
    print(artists[artists['id'] == artist_id]['name'].iloc[0])
    # Lookup name of sample (first entry returned by the directory listing)
    print(os.listdir(bow_dir)[0])

    print("")

Topic: 0
Sergey Rachmaninov
0_Prelude in C Sharp Minor Op. 32.npy

Topic: 1
Bert Williams
0_The Moon Shines on the Moonshine.npy

Topic: 2
Ferruccio Busoni
0_La La Campanella.npy

Topic: 3
Paul Dukas
0_Fantasmic! [From Fantasmic! (Disneyland)].npy

Topic: 4
Fisk University Jubilee Singers
0_God Bless the USA.npy

Topic: 5
Ferruccio Busoni
0_La La Campanella.npy

Topic: 6
Fisk University Jubilee Singers
0_God Bless the USA.npy

Topic: 7
Fisk University Jubilee Singers
0_God Bless the USA.npy

Topic: 8
Ferruccio Busoni
0_La La Campanella.npy

Topic: 9
Edgard Varèse
0_Poème Électronique.npy



# Get most influential songs per topic per epoch

In [8]:
# Slice song-topic matrix into a list of submatrices, one per time slice.
# A running offset is used instead of copying the matrix and repeatedly
# deleting its head, which shifted the remaining rows on every iteration.
song_topic_inf_by_decade = []
slice_start = 0

for num_in_slice in dim_model.time_slices:
    slice_end = slice_start + num_in_slice
    song_topic_inf_by_decade.append(song_topic_inf_matrix[slice_start:slice_end])
    slice_start = slice_end

In [9]:
# Print, for every time slice, the most influential song of each topic.
# Single-argument print(...) calls produce the same text on Python 2 and 3.
for idx, song_topic_matrix in enumerate(song_topic_inf_by_decade):
    print(time_slice_dict[idx])
    print("===============================")

    # argmax raises on an empty array, so a time slice without songs
    # only gets its header
    if len(song_topic_matrix) == 0:
        continue

    # Get the indices of the most influential songs per topic
    # Note: Need to adjust indexing since index resets to 0 for each new decade
    most_inf_idx = np.array(song_topic_matrix).argmax(axis=0) + sum(dim_model.time_slices[:idx])

    for topic_no, artist_idx in enumerate(most_inf_idx):
        artist_id, bow_dir = bow_path_by_artist[artist_idx][:2]
        # Fetch the artist's row once and reuse it for name and genre
        artist_row = artists[artists['id'] == artist_id].iloc[0]

        print("Topic: %d" % topic_no)
        # Lookup artist name
        print(artist_row['name'])
        # Lookup name of sample (first entry returned by the directory listing)
        print(os.listdir(bow_dir)[0])
        print(artist_row['main_genre'])
        print("")

1890
Topic: 0
Sergey Rachmaninov
0_Prelude in C Sharp Minor Op. 32.npy
Classical

Topic: 1
Camille Saint-Saëns
0_Aquarium.npy
Classical

Topic: 2
Ferruccio Busoni
0_La La Campanella.npy
Classical

Topic: 3
Paul Dukas
0_Fantasmic! [From Fantasmic! (Disneyland)].npy
Classical

Topic: 4
Ferruccio Busoni
0_La La Campanella.npy
Classical

Topic: 5
Ferruccio Busoni
0_La La Campanella.npy
Classical

Topic: 6
Paul Dukas
0_Fantasmic! [From Fantasmic! (Disneyland)].npy
Classical

Topic: 7
Ferruccio Busoni
0_La La Campanella.npy
Classical

Topic: 8
Ferruccio Busoni
0_La La Campanella.npy
Classical

Topic: 9
Ferruccio Busoni
0_La La Campanella.npy
Classical

1900
Topic: 0
George Burns
0_The Jack Benny Program.npy
Comedy/Spoken

Topic: 1
Bert Williams
0_The Moon Shines on the Moonshine.npy
Vocal

Topic: 2
George Burns
0_The Jack Benny Program.npy
Comedy/Spoken

Topic: 3
George Burns
0_The Jack Benny Program.npy
Comedy/Spoken

Topic: 4
Fisk University Jubilee Singers
0_God Bless the USA.npy
Religiou

# Correlation with AllMusic Influence Graph Degree

In [10]:
# Mean influence across topics for each row of the song-by-topic matrix
# NOTE(review): rows are songs; treating them as artists assumes one song per
# artist in the same order as bow_path_by_artist -- confirm.
mean_dim_influences = np.array(song_topic_inf_matrix).mean(axis=1)

# Get list of artist outdegrees in same order
artist_ids_ordered = [entry[0] for entry in bow_path_by_artist]

# Build the id -> outdegree lookup once instead of filtering the whole table
# per artist; drop_duplicates keeps the first row per id, matching the
# previous `.iloc[0]` choice. An id missing from the table raises KeyError.
outdegree_by_id = artists.drop_duplicates(subset='id').set_index('id')['outdegree'].to_dict()
outdegrees_ordered = [outdegree_by_id[artist_id] for artist_id in artist_ids_ordered]

In [11]:
# Calculate the Spearman rank correlation between the mean DIM influence
# values and the artist outdegrees (both sequences are in bow_path_by_artist
# order); the result is shown as the cell output.
spearmanr(mean_dim_influences, outdegrees_ordered)

SpearmanrResult(correlation=0.016283108903372225, pvalue=0.050110958352326096)

# TODO: Baseline

# TODO: Breakdown by Genre