In [15]:
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
import numpy as np
import pandas as pd
import os
from copy import copy
from collections import OrderedDict
from scipy.stats import spearmanr
from tqdm import tqdm_notebook
import pickle

In [2]:
# Path of the saved DtmModel that is loaded below with DtmModel.load.
MODEL_PATH = '../models/dim_bow1000_10topics_firsttrack_songreleasedate'
# Local root directory of the per-artist bag-of-words folders; the pickled
# paths are re-rooted onto this directory after loading.
# NOTE(review): this points at bow_1000, while the pickle is named
# bow500_path_by_artist.pk and its stored paths contain bow_500 -- confirm
# that the two feature sets are really meant to be mixed.
BOW_DIR = '../data/features/bow_1000/'

In [3]:
# Load the saved DtmModel from MODEL_PATH. Later cells read its
# time_slices, num_topics and influences_time attributes.
dim_model = DtmModel.load(MODEL_PATH)

In [4]:
# Load artist info. Later cells look artists up by the 'id' column and read
# the 'name', 'main_genre' and 'outdegree' columns.
artists = pd.read_csv('../data/allmusic/artists_cleaned.csv')

In [5]:
# Load song info. Later cells use the 'artist_id' and 'year' columns to
# order the documents chronologically.
songs = pd.read_csv('../data/artist_song_list_years_cleaned.csv')

In [6]:
# Load bow_path_by_artist: a list of (artist_id, path, year) tuples.
# The file is opened in a `with` block so the handle is closed deterministically.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('../models/bow500_path_by_artist.pk', 'rb') as bow_paths_file:
    bow_path_by_artist = pickle.load(bow_paths_file)

In [7]:
# Re-root the pickled paths onto the local BOW_DIR by replacing the absolute
# prefix that is baked into them.
OLD_BOW_PREFIX = '/n/regal/rush_lab/xue/bow_500/'

# Paths that do not contain the old prefix are kept unchanged, so re-running
# this cell after the paths were already rewritten is a no-op instead of an
# IndexError. The loop variable is named artist_id (not `id`) so the builtin
# id() is not shadowed in the notebook namespace.
bow_path_by_artist = [
    (
        artist_id,
        (BOW_DIR + path.split(OLD_BOW_PREFIX)[1]) if OLD_BOW_PREFIX in path else path,
        year,
    )
    for (artist_id, path, year) in bow_path_by_artist
]

In [8]:
# # Add a column with a ".npy" extension
# songs['name_ext'] = songs['name_no_ext'].apply(lambda x: (x + '.npy')).str.decode('utf-8')

# # list of (artist_id, path, year) tuples
# bow_path_by_artist = []

# ids_in_songs = set(songs['artist_id'])

# for artist_id in tqdm_notebook(os.listdir(BOW_DIR)):
#     song_filename = os.listdir(BOW_DIR + artist_id)[0].decode('utf-8')
    
#     # Check if song year is missing or artist is missing from song df
#     if int(artist_id) in ids_in_songs and songs[songs['name_ext'] == song_filename]['year'].iloc[0] != 0:
#         # save (artist_id, path, year) tuple
#         bow_path_by_artist.append((int(artist_id), BOW_DIR + artist_id + '/', songs[songs['name_ext'] == song_filename]['year'].iloc[0]))

# print "Number of songs:", len(bow_path_by_artist)

# Order list by year.
# The sort key is the 'year' of the artist's first row in `songs`; the lookup
# table is built once instead of filtering the whole DataFrame for every tuple.
# NOTE(review): the per-year counts below use the year stored in each tuple
# (x[2]), not this lookup -- confirm the two always agree, otherwise time_seq
# does not describe the sorted order.
first_year_by_artist = (
    songs.drop_duplicates('artist_id').set_index('artist_id')['year'].to_dict()
)
bow_path_by_artist.sort(key=lambda x: first_year_by_artist[x[0]])

# Count the number of songs for each year (keys are plain ints).
year_counter = {}
for artist_id, path, year in bow_path_by_artist:
    year_counter[int(year)] = year_counter.get(int(year), 0) + 1

sorted_years = sorted(year_counter)

# Lookup table for time_slice index v. year
time_slice_dict = dict(enumerate(sorted_years))
# List of counts for each time slice for DIM
time_seq = [year_counter[slice_year] for slice_year in sorted_years]

# Single-argument print(...) produces the same output under Python 2 and 3.
print("Count of artists per each time slice:")
print(time_seq)

Count of artists per each time slice:
[1, 1, 1, 1, 3, 1, 1, 1, 5, 3, 2, 2, 1, 2, 2, 1, 1, 2, 2, 5, 2, 2, 7, 2, 5, 5, 5, 6, 4, 6, 11, 15, 30, 42, 37, 64, 44, 44, 42, 52, 62, 49, 84, 73, 100, 113, 117, 112, 94, 135, 103, 133, 126, 131, 112, 141, 152, 167, 155, 156, 160, 141, 164, 171, 172, 182, 175, 194, 211, 203, 266, 272, 324, 360, 361, 431, 361, 382, 410, 407, 444, 440, 407, 420, 406, 410, 318, 350, 272, 299, 377, 390, 277, 264, 188, 3]


# Get most influential songs for each topic

In [9]:
# Flatten the per-time-slice influence scores into one list of rows, giving a
# (num_songs, num_topics) nested list ordered by time slice, then by song.
song_topic_inf_matrix = [
    [
        dim_model.influences_time[slice_no][doc_no][topic_no]
        for topic_no in range(dim_model.num_topics)
    ]
    for slice_no in range(len(dim_model.time_slices))
    for doc_no in range(dim_model.time_slices[slice_no])
]

In [10]:
# Get the indices of the most influential songs per topic
# (argmax over the song axis of the (num_songs, num_topics) matrix).
most_inf_idx = np.array(song_topic_inf_matrix).argmax(axis=0)

for topic_no, doc_idx in enumerate(most_inf_idx):
    # Rows of the matrix are looked up positionally in bow_path_by_artist.
    artist_id, bow_dir = bow_path_by_artist[doc_idx][:2]

    print("Topic: {}".format(topic_no))
    # Lookup artist name
    print(artists[artists['id'] == artist_id]['name'].iloc[0])
    # Lookup name of sample. os.listdir returns entries in arbitrary order,
    # so sort to make the reported file deterministic.
    print(sorted(os.listdir(bow_dir))[0])

    print("")

Topic: 0
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy

Topic: 1
Sioux Falls
0_CopyPaste.npy

Topic: 2
Hound Dog Taylor & the Houserockers
0_The Sun Is Shining.npy

Topic: 3
Big Jaz
0_Jigga WhatFaint.npy

Topic: 4
Sioux Falls
0_CopyPaste.npy

Topic: 5
Big Jaz
0_Jigga WhatFaint.npy

Topic: 6
Warm Brew
0_Chiefing In the Streets.npy

Topic: 7
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy

Topic: 8
Carlinhos Brown
0_Passe Em Casa.npy

Topic: 9
Clyde Stubblefield
0_I Don't Want Nobody to Give Me Nothin'.npy



# Get most influential songs per topic per epoch

In [11]:
# Slice song-topic matrix into a list of submatrices, one per time slice.
# Running offsets are used instead of copying the matrix and repeatedly
# deleting its head, so no scratch copy is created and each slice is taken once.
song_topic_inf_by_year = []
slice_start = 0

for num_in_slice in dim_model.time_slices:
    slice_end = slice_start + num_in_slice
    song_topic_inf_by_year.append(song_topic_inf_matrix[slice_start:slice_end])
    slice_start = slice_end

In [12]:
# For every time slice, report the most influential song of each topic.
# slice_offset is the position of the slice's first song in bow_path_by_artist;
# it is accumulated instead of re-summing dim_model.time_slices[:idx] each time.
slice_offset = 0

for idx, song_topic_matrix in enumerate(song_topic_inf_by_year):
    print("{} : {} songs".format(time_slice_dict[idx], time_seq[idx]))
    print("===============================")

    # Get the indices of the most influential songs per topic
    # Note: Need to adjust indexing since index resets to 0 for each new year
    most_inf_idx = np.array(song_topic_matrix).argmax(axis=0) + slice_offset

    for topic_no, doc_idx in enumerate(most_inf_idx):
        artist_id, bow_dir = bow_path_by_artist[doc_idx][:2]
        # Filter the artists table once and reuse the row for name and genre.
        artist_row = artists[artists['id'] == artist_id].iloc[0]

        print("Topic: {}".format(topic_no))
        # Lookup artist name
        print(artist_row['name'])
        # Lookup name of sample. os.listdir returns entries in arbitrary
        # order, so sort to make the reported file deterministic.
        print(sorted(os.listdir(bow_dir))[0])
        print(artist_row['main_genre'])
        print("")

    slice_offset += len(song_topic_matrix)

1912 : 1 songs
Topic: 0
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 1
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 2
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 3
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 4
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 5
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 6
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 7
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 8
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

Topic: 9
Cex
0_I Am SoundboyWe Started This (Acapella)Everlasting Life.npy
Electronic

1916 : 1 songs
Topic: 0
Sioux Falls
0_CopyPaste.npy
Pop/Rock

Topic: 1
Sioux Falls
0_CopyPaste.npy
Pop/Rock

Topic:

Topic: 4
Richard Greene
0_Mary of the Wild Moors.npy
Country

Topic: 5
Richard Greene
0_Mary of the Wild Moors.npy
Country

Topic: 6
Richard Greene
0_Mary of the Wild Moors.npy
Country

Topic: 7
Richard Greene
0_Mary of the Wild Moors.npy
Country

Topic: 8
Richard Greene
0_Mary of the Wild Moors.npy
Country

Topic: 9
Richard Greene
0_Mary of the Wild Moors.npy
Country

1933 : 1 songs
Topic: 0
Eliot Lipp
0_Oneknot.npy
Electronic

Topic: 1
Eliot Lipp
0_Oneknot.npy
Electronic

Topic: 2
Eliot Lipp
0_Oneknot.npy
Electronic

Topic: 3
Eliot Lipp
0_Oneknot.npy
Electronic

Topic: 4
Eliot Lipp
0_Oneknot.npy
Electronic

Topic: 5
Eliot Lipp
0_Oneknot.npy
Electronic

Topic: 6
Eliot Lipp
0_Oneknot.npy
Electronic

Topic: 7
Eliot Lipp
0_Oneknot.npy
Electronic

Topic: 8
Eliot Lipp
0_Oneknot.npy
Electronic

Topic: 9
Eliot Lipp
0_Oneknot.npy
Electronic

1935 : 2 songs
Topic: 0
Hound Dog Taylor & the Houserockers
0_The Sun Is Shining.npy
Blues

Topic: 1
Amy Vachal
0_Blank Space.npy
Pop/Rock

Topic: 2
Houn

Pete Nice
0_Fallin Spiders.npy
Rap

Topic: 4
Little Mike & the Tornadoes
0_What About Love.npy
Blues

Topic: 5
Little Mike & the Tornadoes
0_What About Love.npy
Blues

Topic: 6
Stresmatic
0_Knockin' at the Light.npy
Rap

Topic: 7
Pete Nice
0_Fallin Spiders.npy
Rap

Topic: 8
The Intruders
0_Just the Lonely Talkin' Again.npy
R&B;

Topic: 9
Moonshine Bandits
0_Mud Digger Mega Remix.npy
Rap

1960 : 44 songs
Topic: 0
Chill Rob G.
0_PowerSign Off.npy
Rap

Topic: 1
The Melodians
0_Everybody Bawling.npy
Reggae

Topic: 2
The Melodians
0_Everybody Bawling.npy
Reggae

Topic: 3
Oliver Morgan
0_Trick Bag.npy
R&B;

Topic: 4
Chill Rob G.
0_PowerSign Off.npy
Rap

Topic: 5
Rita Lee
0_Refestança.npy
Latin

Topic: 6
The Royal Guardsmen
0_Imperial Bedroom.npy
Pop/Rock

Topic: 7
Chill Rob G.
0_PowerSign Off.npy
Rap

Topic: 8
Chill Rob G.
0_PowerSign Off.npy
Rap

Topic: 9
Hints
0_Foothills.npy
Pop/Rock

1961 : 42 songs
Topic: 0
Tito Nieves
0_Sonámbulo.npy
Latin

Topic: 1
Jordan Fisher
0_Gotta Be Me.npy
Po

Topic: 2
Michael Nesmith
0_Joanne.npy
Pop/Rock

Topic: 3
Shel Silverstein
0_A Front Row Seat to Hear Ole Johnny Sing.npy
Folk

Topic: 4
Michael Nesmith
0_Joanne.npy
Pop/Rock

Topic: 5
Jethro Tull
0_My God.npy
Pop/Rock

Topic: 6
Shel Silverstein
0_A Front Row Seat to Hear Ole Johnny Sing.npy
Folk

Topic: 7
Shel Silverstein
0_A Front Row Seat to Hear Ole Johnny Sing.npy
Folk

Topic: 8
Michael Nesmith
0_Joanne.npy
Pop/Rock

Topic: 9
Los del Rio
0_La Canto.npy
Latin

1978 : 141 songs
Topic: 0
Dallas Frazier
0_The Laying on of Hands.npy
Country

Topic: 1
Johnny Nash
0_Stir It Up.npy
R&B;

Topic: 2
Armando Peraza
0_Geechee Girl.npy
Jazz

Topic: 3
Son Seals
0_We Wish You a Merry Christmas.npy
Blues

Topic: 4
Paul Simon
0_Kodachrome.npy
Pop/Rock

Topic: 5
Thom Bell
0_You'll Never Get to Heaven (If You Break My Heart).npy
R&B;

Topic: 6
Johnny Nash
0_Stir It Up.npy
R&B;

Topic: 7
Thom Bell
0_You'll Never Get to Heaven (If You Break My Heart).npy
R&B;

Topic: 8
G. Love
0_Rainbow.npy
Pop/Rock

To

Topic: 2
Armando
0_Robotics.npy
Electronic

Topic: 3
Armando
0_Robotics.npy
Electronic

Topic: 4
Vibro Champs
0_Martian Trip.npy
Pop/Rock

Topic: 5
Peter Laughner
0_Amphetamine.npy
Pop/Rock

Topic: 6
Mike Stern
0_King of the Lobby.npy
Jazz

Topic: 7
Kid Frost
0_Rap Declares War.npy
Rap

Topic: 8
Armando
0_Robotics.npy
Electronic

Topic: 9
Lunachicks
0_Superstrong.npy
Pop/Rock

1996 : 360 songs
Topic: 0
Rampage
0_Flava in Ya Ear.npy
Rap

Topic: 1
Bushwick Bill
0_Stranded on Death Row.npy
Rap

Topic: 2
Tracy Byrd
0_Watermelon Crawl.npy
Country

Topic: 3
Mother Earth
0_Jesse.npy
Electronic

Topic: 4
Dinah Washington
0_Embraceable You.npy
Vocal

Topic: 5
Mother Earth
0_Jesse.npy
Electronic

Topic: 6
Wayman Tisdale
0_You.npy
Jazz

Topic: 7
Margaret Cho
0_[Untitled].npy
Comedy/Spoken

Topic: 8
Jon Secada
0_The Best Is Yet to Come.npy
R&B;

Topic: 9
Thee Headcoatees
0_Gotta Get The First Plane Home.npy
Pop/Rock

1997 : 361 songs
Topic: 0
MC Lyte
0_Cold Rock a Party.npy
Rap

Topic: 1
Eels
0_No

Topic: 6
Prefab Sprout
0_Devil Came a Calling.npy
Pop/Rock

Topic: 7
Lorn
0_Chhurch.npy
Electronic

Topic: 8
Mel Lewis Orchestra
0_Trouble in Mind.npy
Jazz

Topic: 9
Carl Thomas
0_One Name.npy
R&B;

2014 : 390 songs
Topic: 0
DJ Quik
0_Shine.npy
Rap

Topic: 1
July Talk
0_Paper Girl.npy
Pop/Rock

Topic: 2
Nels Cline
0_Lotus.npy
Jazz

Topic: 3
Kay Starr
0_(Everybody's Waitin' For) The Man With the Bag.npy
Vocal

Topic: 4
Copeland
0_Pin Your Wings.npy
Pop/Rock

Topic: 5
Kay Starr
0_(Everybody's Waitin' For) The Man With the Bag.npy
Vocal

Topic: 6
Lord RAJA
0_Gottfried Semper.npy
Electronic

Topic: 7
Shift K3Y
0_Make It Good.npy
Electronic

Topic: 8
Anderson Paak
0_Stand For Something.npy
R&B;

Topic: 9
DJ Quik
0_Shine.npy
Rap

2015 : 277 songs
Topic: 0
Micah P. Hinson
0_Beneath the Rose.npy
Pop/Rock

Topic: 1
Joe Hertler & the Rainbow Seekers
0_King is Dead.npy
Pop/Rock

Topic: 2
Tigercats
0_Junior Champion.npy
Pop/Rock

Topic: 3
The Internet
0_Girl.npy
R&B;

Topic: 4
Meghan Trainor
0_All

# Correlation with AllMusic Influence Graph Degree

In [13]:
# Calculate max DIM influence for each row of the song-topic matrix
# (maximum over topics, one value per entry of bow_path_by_artist).
max_dim_influences = np.array(song_topic_inf_matrix).max(axis=1)

# Get list of artist outdegrees in same order.
# The id -> outdegree table is built once (first row per id) instead of
# filtering the whole artists DataFrame for every entry, and the loop
# variable no longer shadows the builtin id().
artist_ids_ordered = [entry[0] for entry in bow_path_by_artist]
outdegree_by_artist = (
    artists.drop_duplicates('id').set_index('id')['outdegree'].to_dict()
)
outdegrees_ordered = [outdegree_by_artist[artist_id] for artist_id in artist_ids_ordered]

In [18]:
# Calculate max DIM influence for each row of the song-topic matrix
# (maximum over topics), then average those maxima per artist.
max_dim_influences = np.array(song_topic_inf_matrix).max(axis=1)
artist_ids_ordered = [entry[0] for entry in bow_path_by_artist]
# np.unique returns the ids sorted; every per-artist structure below follows
# this order so that the values line up with outdegrees_ordered.
unique_artist_ids = np.unique(artist_ids_ordered)

# artist id -> list of that artist's per-song maximum influences
max_influences_by_artist = OrderedDict(
    (artist_id, []) for artist_id in unique_artist_ids
)
for artist_id, inf in zip(artist_ids_ordered, max_dim_influences):
    max_influences_by_artist[artist_id].append(inf)

# artist id -> mean of the per-song maxima
avg_max_influences_by_artist = OrderedDict(
    (artist_id, np.mean(influences))
    for artist_id, influences in max_influences_by_artist.items()
)

# Outdegree per unique artist, in the same order as the averages above.
# The id -> outdegree table is built once (first row per id) instead of
# filtering the artists DataFrame for every artist.
outdegree_by_artist = (
    artists.drop_duplicates('id').set_index('id')['outdegree'].to_dict()
)
outdegrees_ordered = [outdegree_by_artist[artist_id] for artist_id in unique_artist_ids]

In [24]:
# Calculate the Spearman rank correlation between each artist's average
# max DIM influence and the artist's outdegree (both in unique_artist_ids order).
# list(...) guarantees a real sequence is passed even where dict.values()
# returns a view (Python 3); under Python 2 the result is unchanged.
spearmanr(list(avg_max_influences_by_artist.values()), outdegrees_ordered)

SpearmanrResult(correlation=0.056398945617498877, pvalue=6.5418264055223765e-11)

# TODO: Baseline

# TODO: Breakdown by Genre