In [184]:
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
import numpy as np
import pandas as pd
import os
from copy import copy
from collections import OrderedDict
from scipy.stats import spearmanr

In [94]:
# Location of the saved DIM model file that the next cell passes to DtmModel.load().
MODEL_PATH = '../models/dim_bow500_5topics_firsttrack_artistactivedate'

In [95]:
# Load the trained model from MODEL_PATH.
# NOTE(review): gensim's load presumably unpickles the file, so only point
# MODEL_PATH at model files from a trusted source -- confirm.
dim_model = DtmModel.load(MODEL_PATH)

In [96]:
# Load artist info.
# The rest of the notebook reads the 'id', 'active_start', 'name',
# 'main_genre' and 'outdegree' columns of this table.
artists = pd.read_csv('../data/allmusic/artists_cleaned.csv')

In [97]:
# Root directory of the bag-of-words features; each entry is expected to be a
# sub-directory named after an artist id.
BOW_DIR = '../data/features/bow_500/'

# Index active_start by artist id once, instead of re-scanning the whole
# `artists` table with a boolean mask for every directory entry (and again for
# every element inside the sort key).
active_start_by_id = artists.set_index('id')['active_start']

# list of (artist_id, path, active_start) tuples
bow_path_by_artist = []

for artist_dir in os.listdir(BOW_DIR):
    try:
        artist_id = int(artist_dir)
    except ValueError:
        # Stray non-numeric entries (e.g. hidden files) are not artist directories
        continue

    # Raises KeyError if the artist id is absent from the artists table
    active_start = float(active_start_by_id.loc[artist_id])

    # Keep only artists whose active_start is known
    if np.isnan(active_start):
        continue

    bow_path_by_artist.append((artist_id, BOW_DIR + artist_dir + '/', int(active_start)))

# Order by the artist's active period start. list.sort is stable, so artists
# sharing the same active_start keep their os.listdir() order.
bow_path_by_artist.sort(key=lambda entry: entry[2])

# Counter of songs per active_start value. It is seeded with every non-missing
# active_start in the artists table, so values with no bag-of-words directory
# stay at a count of 0.
decade_counter = {int(year): 0 for year in artists['active_start'].dropna().unique()}

for artist_id, bow_path, year in bow_path_by_artist:
    decade_counter[year] += 1

decades_sorted = sorted(decade_counter)

# Lookup table for time_slice index v. decade
time_slice_dict = dict(enumerate(decades_sorted))
# List of counts for each time slice for DIM
time_seq = [decade_counter[year] for year in decades_sorted]

# Get most influential songs for each topic

In [115]:
# Build the (num_songs, num_topics) table of influence scores as a list of
# per-song rows: time slices are walked in order, then the songs inside each
# slice, then the topics for each song.
song_topic_inf_matrix = [
    [
        dim_model.influences_time[slice_no][song_no][topic_no]
        for topic_no in range(dim_model.num_topics)
    ]
    for slice_no, songs_in_slice in enumerate(dim_model.time_slices)
    for song_no in range(songs_in_slice)
]

In [155]:
# Row index (into song_topic_inf_matrix) of the highest-influence song for each topic
most_inf_idx = np.array(song_topic_inf_matrix).argmax(axis=0)

for topic_no, song_idx in enumerate(most_inf_idx):
    # bow_path_by_artist entries are (artist_id, bow_dir, active_start). The song
    # row index is used directly as an index into that list, which assumes one
    # song per artist in the same order -- TODO confirm against model training.
    artist_id, bow_dir = bow_path_by_artist[song_idx][:2]

    # Single-argument print(...) calls emit the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print("Topic: %s" % topic_no)
    # Lookup artist name
    print(artists[artists['id'] == artist_id]['name'].iloc[0])
    # Lookup name of sample (first entry returned by os.listdir for the directory)
    print(os.listdir(bow_dir)[0])

    print("")

Topic: 0
Gabriel Fauré
0_Sicilenne.npy

Topic: 1
Sergey Rachmaninov
0_Prelude in C Sharp Minor Op. 32.npy

Topic: 2
Al Jolson
0_You Made Me Love You.npy

Topic: 3
Gilbert & Sullivan
0_If You Want to Know Who We Are.npy

Topic: 4
Al Jolson
0_You Made Me Love You.npy



# Get most influential songs per topic per decade (time slice)

In [167]:
# Split the song-topic matrix into one sub-list of rows per time slice, in
# time-slice order. A running offset is used so the source list is neither
# copied nor destructively consumed from the front.
song_topic_inf_by_decade = []
slice_start = 0

for num_in_slice in dim_model.time_slices:
    slice_end = slice_start + num_in_slice
    song_topic_inf_by_decade.append(song_topic_inf_matrix[slice_start:slice_end])
    slice_start = slice_end

In [170]:
# For every time slice, report the most influential song per topic.
# Single-argument print(...) calls emit the same text under both the Python 2
# print statement and the Python 3 print function.
for idx, song_topic_matrix in enumerate(song_topic_inf_by_decade):
    print(time_slice_dict[idx])
    print("===============================")

    if len(song_topic_matrix) == 0:
        # argmax raises ValueError on an empty array; nothing to report for
        # a time slice that contains no songs
        print("")
        continue

    # Get the indices of the most influential songs per topic
    # Note: Need to adjust indexing since index resets to 0 for each new decade
    most_inf_idx = np.array(song_topic_matrix).argmax(axis=0) + sum(dim_model.time_slices[:idx])

    for topic_no, song_idx in enumerate(most_inf_idx):
        # bow_path_by_artist entries are (artist_id, bow_dir, active_start)
        artist_id, bow_dir = bow_path_by_artist[song_idx][:2]
        # Filter the artists table once and reuse the first matching row
        artist_row = artists[artists['id'] == artist_id].iloc[0]

        print("Topic: %s" % topic_no)
        # Lookup artist name
        print(artist_row['name'])
        # Lookup name of sample (first entry returned by os.listdir for the directory)
        print(os.listdir(bow_dir)[0])
        print(artist_row['main_genre'])
        print("")

1890
Topic: 0
Gabriel Fauré
0_Sicilenne.npy
Classical

Topic: 1
Sergey Rachmaninov
0_Prelude in C Sharp Minor Op. 32.npy
Classical

Topic: 2
Gabriel Fauré
0_Sicilenne.npy
Classical

Topic: 3
Gilbert & Sullivan
0_If You Want to Know Who We Are.npy
Stage & Screen

Topic: 4
Enrico Caruso
0_O Paradiso.npy
Classical

1900
Topic: 0
Jean Sibelius
0_Be Still My SoulWhat a Friend We Have in Jesus.npy
Classical

Topic: 1
Alphonse Picou
0_Playing Hot with Buddy Bolden.npy
Jazz

Topic: 2
Arturo Toscanini
0_Rhapsody In Blue (Intro & Theme) [From Manhattan].npy
Classical

Topic: 3
Bert Williams
0_The Moon Shines on the Moonshine.npy
Vocal

Topic: 4
Bert Williams
0_The Moon Shines on the Moonshine.npy
Vocal

1910
Topic: 0
Al Jolson
0_You Made Me Love You.npy
Vocal

Topic: 1
Frank Ferera
0_Melani Anu Ka Makani.npy
International

Topic: 2
Al Jolson
0_You Made Me Love You.npy
Vocal

Topic: 3
Al Jolson
0_You Made Me Love You.npy
Vocal

Topic: 4
Al Jolson
0_You Made Me Love You.npy
Vocal

1920
Topic: 0
Ja

# Correlation with AllMusic Influence Graph Degree

In [180]:
# Mean DIM influence for each row of song_topic_inf_matrix, averaged over its
# topics (axis=1). Rows are presumably one song per artist, in the same order
# as bow_path_by_artist -- TODO confirm.
mean_dim_influences = np.array(song_topic_inf_matrix).mean(axis=1)

# Get list of artist outdegrees in same order
artist_ids_ordered = [entry[0] for entry in bow_path_by_artist]

# Index outdegree by artist id once rather than scanning the table per artist.
# drop_duplicates keeps the first row per id, matching a first-match lookup.
outdegree_by_id = artists.drop_duplicates(subset='id').set_index('id')['outdegree']

# The loop variable is deliberately not called `id`, so the builtin id() is not
# shadowed for the rest of the kernel session.
outdegrees_ordered = [outdegree_by_id.loc[artist_id] for artist_id in artist_ids_ordered]

In [185]:
# Calculate the Spearman rank correlation between the mean DIM influence values
# and the outdegrees collected in the previous cell. The bare expression is the
# last statement in the cell, so the notebook displays the result.
spearmanr(mean_dim_influences, outdegrees_ordered)

SpearmanrResult(correlation=0.027736947349539173, pvalue=0.00084552732349784309)

# TODO: Baseline

# TODO: Breakdown by Genre