In [1]:
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
import numpy as np
import pandas as pd
import os

In [2]:
# Path of the saved model file that is passed to DtmModel.load() below.
MODEL_PATH = '../models/dim_bow500_10topics_firsttrack_artistactivedate'

In [3]:
# Load model
# Restores the previously saved model object from MODEL_PATH. Later cells read
# its `time_slices`, `num_topics` and `influences_time` attributes.
# NOTE(review): gensim's load() is presumably pickle-based -- only point
# MODEL_PATH at files you trust; confirm against the gensim docs.
dim_model = DtmModel.load(MODEL_PATH)

In [4]:
# Load artist info
# Artist metadata table. Later cells rely on its 'id', 'active_start' and
# 'name' columns; 'active_start' may contain NaN (it is filtered on below).
artists = pd.read_csv('../data/allmusic/artists_cleaned.csv')

In [5]:
# Build the per-artist bookkeeping used to map model document indices back to
# artists:
#   bow_path_by_artist -- list of (artist_id, bow_dir_path, active_start)
#                         tuples, ordered by active_start
#   decade_counter     -- {active_start: number of entries in bow_path_by_artist}
#   time_slice_dict    -- {time-slice index: active_start}
#   time_seq           -- per-time-slice counts, in time-slice order

# Directory holding one bag-of-words sub-directory per artist id
BOW_DIR = '../data/features/bow_500/'

# Build the id -> active_start lookup once instead of re-filtering the whole
# DataFrame (and coercing a one-row Series with float()/int()) for every artist.
# Duplicate ids would make the lookup ambiguous, so fail loudly if there are any.
assert artists['id'].is_unique, "duplicate artist ids in artists table"
active_start_by_id = dict(zip(artists['id'], artists['active_start']))

# list of (artist_id, path, active_start) tuples
bow_path_by_artist = []

# NOTE: entries are visited in os.listdir() order and the sort below is stable,
# so the relative order of artists sharing an active_start is exactly the
# listdir order. Later cells index this list by model document index
# (presumably the order the model was trained on -- confirm), so do not
# reorder here.
for artist_id in os.listdir(BOW_DIR):
    # A directory whose id is absent from `artists` raises KeyError here
    active_start = active_start_by_id[int(artist_id)]

    # Skip artists whose active_start is missing
    if not np.isnan(active_start):
        # save (artist_id, path, active_start) tuple
        bow_path_by_artist.append((int(artist_id), BOW_DIR + artist_id + '/', int(active_start)))

# Order list by active period start for artist (already stored as element 2)
bow_path_by_artist.sort(key=lambda entry: entry[2])

# Create counter for number of songs for each distinct active_start value.
# Keys come from the whole artists table, so a value with no bow directory
# keeps a count of 0.
decade_counter = {int(start): 0 for start in np.unique(artists['active_start'].dropna())}

# `_` placeholders avoid shadowing the builtin `id` for the rest of the session
for _, _, year in bow_path_by_artist:
    decade_counter[year] += 1

# Lookup table for time_slice index v. decade
time_slice_dict = {idx: year for (idx, year) in enumerate(sorted(decade_counter))}
# List of counts for each time slice for DIM
time_seq = [decade_counter[key] for key in sorted(decade_counter.keys())]

# Get most influential songs for each topic

In [6]:
# Flatten the model's per-time-slice influence scores into a single
# (num_songs, num_topics) array. Rows are emitted slice by slice and, within a
# slice, song by song -- the same order as the nested loops they replace.
song_topic_inf_matrix = np.array([
    [
        dim_model.influences_time[slice_no][song_no][topic_no]
        for topic_no in range(dim_model.num_topics)
    ]
    for slice_no, slice_len in enumerate(dim_model.time_slices)
    for song_no in range(slice_len)
])

In [7]:
# Get the indices of the most influential songs per topic
# (argmax over axis 0 -> one row index per topic column).
most_inf_idx = song_topic_inf_matrix.argmax(axis=0)

# The print calls below are written so they are valid on both Python 2 and
# Python 3 and emit exactly the same text as the original Python 2 `print`
# statements (single pre-formatted argument, explicit empty string for the
# blank separator line).
for topic_no, artist_idx in enumerate(most_inf_idx):
    # NOTE(review): assumes row i of the influence matrix corresponds to
    # bow_path_by_artist[i], i.e. the model saw documents in this same
    # order -- confirm against the training notebook.
    artist_id, bow_dir, _ = bow_path_by_artist[artist_idx]

    print("Topic %d" % topic_no)
    # Lookup artist name
    print(artists[artists['id'] == artist_id]['name'])
    # Lookup name of sample (first entry returned by os.listdir for the artist)
    print(os.listdir(bow_dir)[0])

    print("")

Topic 0
991    Sergey Rachmaninov
Name: name, dtype: object
0_Prelude in C Sharp Minor Op. 32.npy

Topic 1
7863    Bert Williams
Name: name, dtype: object
0_The Moon Shines on the Moonshine.npy

Topic 2
2184    Ferruccio Busoni
Name: name, dtype: object
0_La La Campanella.npy

Topic 3
10160    Paul Dukas
Name: name, dtype: object
0_Fantasmic! [From Fantasmic! (Disneyland)].npy

Topic 4
8312    Fisk University Jubilee Singers
Name: name, dtype: object
0_God Bless the USA.npy

Topic 5
2184    Ferruccio Busoni
Name: name, dtype: object
0_La La Campanella.npy

Topic 6
8312    Fisk University Jubilee Singers
Name: name, dtype: object
0_God Bless the USA.npy

Topic 7
8312    Fisk University Jubilee Singers
Name: name, dtype: object
0_God Bless the USA.npy

Topic 8
2184    Ferruccio Busoni
Name: name, dtype: object
0_La La Campanella.npy

Topic 9
1850    Edgard Varèse
Name: name, dtype: object
0_Poème Électronique.npy



# Correlation with AllMusic Influence Graph Degree

# TODO: Baseline

# TODO: Breakdown by Genre