In [17]:
# Start with raw artist data (id, name, [{terms, freq}])
# Create artist matrix (per city!)
# Apply tf-idf --> ready for venue/ML pipeline
import pickle
import pymysql as mdb
import numpy as np

##### Define a few functions ##########

def getIndex(element, my_list):
    """Return the position of the first item of ``my_list`` equal to ``element``.

    Items are compared with ``==`` in iteration order. When nothing matches
    (including when ``my_list`` is empty) the function returns ``None``.
    """
    for position, candidate in enumerate(my_list):
        if candidate == element:
            return position
    return None

# scales vector(list) so that entries sum to 1
def norm1(v):
    """Scale a vector so that its entries sum to 1.

    Args:
        v: iterable of numbers. It is copied into a list first, so one-shot
           iterables (generators) are handled as well.

    Returns:
        A new list with every entry divided by the sum of all entries.
        An empty input gives ``[]``. A vector whose entries are all zero has
        nothing to scale, so it is returned as a list of ``0.0`` instead of
        failing on a division by zero.

    Raises:
        ZeroDivisionError: for plain Python numbers, when the entries are not
        all zero but still sum to zero (no meaningful scaling exists).
    """
    values = list(v)
    if not any(values):
        # Covers both the empty vector and the all-zero vector.
        return [0.0] * len(values)
    total = float(sum(values))
    return [entry / total for entry in values]

##### Open up pickle files ######

def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes (or fails).
    """
    # NOTE: pickle.load can run arbitrary code from the file; only load
    # pickles that were produced by a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)

artist_store1 = _load_pickle("pickle_artists/artist_info_0_5k.pickle")
artist_store2 = _load_pickle("pickle_artists/artist_info_5k_10k.pickle")
artist_store3 = _load_pickle("pickle_artists/artist_info_10k_15k.pickle")
artist_store4 = _load_pickle("pickle_artists/artist_info_15k_20k.pickle")

# One combined list of artist records, in file order
artist_store_all = artist_store1 + artist_store2 + artist_store3 + artist_store4

#print artist_store_all[18]
##### Create master list of unique genre names #######
# master_genre_map lists every genre name once, in first-seen order; a genre's
# position in this list is its column in all genre vectors built later.
master_genre_map = []
# Companion set used only for the membership test, so each check is a hash
# lookup instead of a scan over the whole (growing) list.
_seen_genres = set()
for artist in artist_store_all:
    # Entries from index 2 onwards of an artist record are the term dicts
    for term in artist[2:]:
        genre_name = term['name']
        if genre_name not in _seen_genres:
            _seen_genres.add(genre_name)
            master_genre_map.append(genre_name)


master_artist_genre_freq = dict() # {'artist': np.array[]}

for i in range(len(artist_store_all)):
    # Initialize vector of zeroes for all genres
    genre_vec = [0] * len(master_genre_map)
    # Loop over genres, make vector of frequencies
    for term in artist_store_all[i][2:]:
        # Get index of genre in master_genre_map
        indx = getIndex(term['name'], master_genre_map)
        # Set corresponding vector entry to value of frequency
        genre_vec[indx] = term['frequency']
    # Set key to artist name, value is array of (normalized) freqs
    master_artist_genre_freq[artist_store_all[i][1]] = np.array(norm1(genre_vec))

#for key, val in master_artist_genre_freq.iteritems():
#    print key, val
print len(master_artist_genre_freq)


7316


In [32]:
##### Create artist vectors/tfidf for ust one metro region ####

# Create list of just artist in one metro region
regional_subset = dict()
regional_subset_id = dict()
region = '17835' # LA=17835, SF=26330, Chicago=9426, SD=11086, CIN=22040
city = 'LA'

con = mdb.connect('localhost', 'root', '', 'scenehash', autocommit=True, charset='utf8', use_unicode=True) 
cur = con.cursor()
with con:
    cur = con.cursor()
    cur.execute("SELECT artist_id, artist, metro_id FROM events WHERE metro_id = %s" % (region))
    all_events = cur.fetchall()
    for a_id, a_name, metro in all_events[:]:
        # Check for duplicates, then assign vector
        if a_name not in regional_subset:
            try:
                regional_subset[a_name] = master_artist_genre_freq[a_name]
                regional_subset_id[a_id] = master_artist_genre_freq[a_name]
            except Exception:
                continue

# OK, have dict of {artist: vector} for a metro region

print "Done."

Done.


In [33]:
import math

def tf(genre, artist):
    """Return the entry for ``genre`` in ``artist``'s vector.

    The vector is looked up in the module-level ``regional_subset`` and the
    genre's position comes from ``master_genre_map``. Raises ``KeyError`` when
    ``artist`` is not a key of ``regional_subset``.
    """
    artist_vector = regional_subset[artist]
    genre_column = getIndex(genre, master_genre_map)
    return artist_vector[genre_column]

def n_containing(genre, reg_list):
    """Count the entries of ``reg_list`` whose vector is positive for ``genre``.

    ``reg_list`` is a dict whose values are genre vectors; the column that is
    inspected is the position of ``genre`` in ``master_genre_map``.
    """
    column = getIndex(genre, master_genre_map)
    hits = 0
    for _artist, vector in reg_list.iteritems():
        if vector[column] > 0:
            hits += 1
    return hits

def idf(genre, reg_list):
    """Return the smoothed inverse document frequency of ``genre``.

    Computed as ``log(N / (1 + n))`` where ``N`` is the number of entries in
    ``reg_list`` and ``n`` is how many of them contain the genre (see
    ``n_containing``). Because of the ``1 +`` in the denominator the value is
    negative when every entry contains the genre.

    Raises ``ValueError`` (math domain error) when ``reg_list`` is empty,
    since the ratio is then 0.
    """
    corpus_size = float(len(reg_list))
    smoothed_count = 1 + n_containing(genre, reg_list)
    return math.log(corpus_size / smoothed_count)

idf_list = []
for gnr in master_genre_map:
    idf_list.append(idf(gnr, regional_subset))
master_genre_idf = np.array(idf_list)

# Let's update each artist freq with IDF:
for artist in regional_subset.iterkeys():
    regional_subset[artist] = regional_subset[artist]*master_genre_idf

# regional_subset is ready for the venues!
pickle.dump( master_genre_idf, open( city+"/master_genre_idf.pickle", "wb" ) )
print "Done."


Done.


In [34]:
# Create list of just artist in one metro region
regional_venue_set = dict()
#region = '26330' # LA=17835, SF=26330, Chicago=9426, SD=11086

# Storage stuff
venue_list = []
venue_count = dict()
venue_ids = []
locations = []
event_ids = []

con = mdb.connect('localhost', 'root', '', 'scenehash', autocommit=True, charset='utf8', use_unicode=True) 
cur = con.cursor()
with con:
    cur = con.cursor()
    cur.execute("SELECT evt_id, venue_name, venue_id, artist, latitude, longitude FROM events WHERE metro_id = %s" % (region))
    all_events = cur.fetchall()
    for evtid, ven_name, ven_id, artist_name, lat, lon in all_events:
        # Check for duplicates, create zeroes if doesn't exist
        if ven_name not in regional_venue_set:
            venue_list.append(ven_name)
            venue_count[ven_name] = 1
            venue_ids.append(ven_id)
            locations.append([lat,lon])
            event_ids.append(evtid)
            regional_venue_set[ven_name] = np.array([0.] * len(master_genre_map))
        try:
            regional_venue_set[ven_name] +=  regional_subset[artist_name]
            venue_count[ven_name] += 1
        except Exception:
            continue

# Let's keep only those venue with 2 > events
regional_good_venues = dict()
for key, val in regional_venue_set.iteritems():
    if venue_count[key] > 2:
        regional_good_venues[key] = val


#for key, val in regional_venue_set.iteritems():
#    print key, val

print "Total number of venue names =", len(venue_list)
print "Total number of venue ids =", len(venue_ids)
print "Total number of master venues =", len(regional_venue_set)
print "Total number of lat/lons =", len(locations)
print "Total number of genres =", len(master_genre_map)
print "Total number of artists =", len(regional_subset)
print "Total number of events =", len(event_ids)
print "Total number of good venues =", len(regional_good_venues)
pickle.dump( venue_list, open( city+"/venue_list.pickle", "wb" ) )
pickle.dump( venue_ids, open( city+"/venue_ids.pickle", "wb" ) )
pickle.dump( locations, open( city+"/locations.pickle", "wb" ) )
pickle.dump( regional_venue_set, open( city+"/regional_venue_set.pickle", "wb" ) )
pickle.dump( regional_subset, open( city+"/regional_subset.pickle", "wb" ) )
pickle.dump( regional_subset_id, open( city+"/regional_subset_id.pickle", "wb" ) )
pickle.dump( master_genre_map, open( city+"/master_genre_map.pickle", "wb" ) )
pickle.dump( event_ids, open( city+"/event_ids.pickle", "wb" ) )
pickle.dump( regional_good_venues, open( city+"/regional_good_venues.pickle", "wb" ) )

'''
cur.execute("CREATE TABLE events"
            "(pid INT PRIMARY KEY AUTO_INCREMENT, "
            "evt_id INT, "
            "evt_name VARCHAR(400), "
            "venue_name VARCHAR(150), "
            "venue_id INT, "
            "city_name VARCHAR(50), "
            "metro_id INT, "
            "latitude VARCHAR(20), "
            "longitude VARCHAR(20), "
            "evt_url VARCHAR(200), "
            "artist VARCHAR(200), "
            "artist_id INT, "
            "artist_billing VARCHAR(100) ) "
            )
'''

Total number of venue names = 667
Total number of venue ids = 667
Total number of master venues = 667
Total number of lat/lons = 667
Total number of genres = 1035
Total number of artists = 3620
Total number of events = 667
Total number of good venues = 317


'\ncur.execute("CREATE TABLE events"\n            "(pid INT PRIMARY KEY AUTO_INCREMENT, "\n            "evt_id INT, "\n            "evt_name VARCHAR(400), "\n            "venue_name VARCHAR(150), "\n            "venue_id INT, "\n            "city_name VARCHAR(50), "\n            "metro_id INT, "\n            "latitude VARCHAR(20), "\n            "longitude VARCHAR(20), "\n            "evt_url VARCHAR(200), "\n            "artist VARCHAR(200), "\n            "artist_id INT, "\n            "artist_billing VARCHAR(100) ) "\n            )\n'

In [None]:
#Plot events per venue

import matplotlib.pyplot as plt


# Per-venue counters, largest first
hold_vecs = sorted(venue_count.values(), reverse=True)

plt.bar(range(len(hold_vecs)), hold_vecs )

# Label the figure with the city the notebook is currently configured for,
# instead of a fixed city code that goes stale when `city` changes.
plt.title("Events per venue (%s)" % city)
plt.xlabel("Venue")
plt.ylabel("Number of Events")
plt.show()

In [None]:
# Plot magnitude of venues' vector 

# Euclidean length of the genre vector of every venue that passed the cut,
# largest first. (Iterate regional_venue_set instead to see all venues.)
hold_vecs = sorted(
    (np.sqrt(vector.dot(vector)) for _venue, vector in regional_good_venues.iteritems()),
    reverse=True)

venue_positions = range(len(hold_vecs))
plt.bar(venue_positions, hold_vecs )

plt.title("Genre Magnitude per Venue")
plt.xlabel("Venue")
plt.ylabel("Vector length")
plt.show()

In [None]:



###########
#########
########










# NOTE(review): this cell reads master_venue_vect and genre_map, which are not
# assigned in any of the cells above; it looks like a leftover from an earlier
# version of the notebook - confirm before running it on a fresh kernel.

# Report list sizes, then pickle the lists into the current working directory
# (unlike the cell above, the paths here are not prefixed with the city).
print "Total number of venue names =", len(venue_list)
print "Total number of venue ids =", len(venue_ids)
print "Total number of master venues =", len(master_venue_vect)
print "Total number of lat/lons =", len(locations)
# The files opened here are never closed explicitly.
pickle.dump( venue_list, open( "venue_list.pickle", "wb" ) )
pickle.dump( venue_ids, open( "venue_ids.pickle", "wb" ) )
pickle.dump( locations, open( "locations.pickle", "wb" ) )
pickle.dump( genre_map, open( "genre_map.pickle", "wb" ) )

# Drop the first entry of each list. This runs after the dumps above, so the
# pickled copies still contain that entry; running the cell again removes one
# more entry each time.
del venue_ids[0]
del venue_list[0] # oops
del locations[0]

# Second name for the same object (no copy is made), so the in-place
# normalization below is visible through `plotting` as well.
plotting = master_venue_vect
    
# Let's normalize all genre vectors for venues
# Each vector is divided, component by component, by its Euclidean length.
for i in range(len(master_venue_vect)):
    sum_sq = 0
    for component in master_venue_vect[i]:
        sum_sq += component*component
    denom = math.sqrt(sum_sq)
    # Now go back through and divide each by denom
    # NOTE(review): denom is 0 when every component is 0; there is no guard
    # for that case here.
    for j in range(len(master_venue_vect[i])):
        master_venue_vect[i][j] = master_venue_vect[i][j]/denom


# Store the normalized vectors
pickle.dump( master_venue_vect, open( "master_venues.pickle", "wb" ) )

In [None]:
# Sum every genre column over all venue vectors.
# NOTE(review): master_venue_vect is not assigned in the cells above - confirm
# where it comes from before running this cell on a fresh kernel.
hold_vecs = [0] * len(master_venue_vect[0])
for venue_idx in range(len(master_venue_vect)):
    venue_vec = master_venue_vect[venue_idx]
    for genre_idx in range(len(hold_vecs)):
        hold_vecs[genre_idx] += venue_vec[genre_idx]

# Largest totals first
hold_vecs.sort(reverse=True)

import numpy as np
import matplotlib.pyplot as plt

# Bar chart of the per-genre totals on a log scale
genre_positions = range(len(hold_vecs))
plt.bar(genre_positions, hold_vecs )

plt.title("Genre Presence")
plt.xlabel("Genre")
plt.yscale('log', nonposy='clip')
plt.ylabel("Net frequency")
plt.show()