Skip to content

Commit

Permalink
pep8 compliant
Browse files Browse the repository at this point in the history
  • Loading branch information
tbertinmahieux committed Mar 26, 2011
1 parent a5c6edf commit c577c80
Showing 1 changed file with 102 additions and 72 deletions.
174 changes: 102 additions & 72 deletions Tasks_Demos/SQLite/create_track_metadata_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def encode_string(s):
EXAMPLE:
That's my boy! -> 'That''s my boy!'
"""
return "'"+s.replace("'","''")+"'"
return "'" + s.replace("'", "''") + "'"


def create_db(filename):
Expand All @@ -57,9 +57,11 @@ def create_db(filename):
conn = sqlite3.connect(filename)
# add stuff
c = conn.cursor()
q = 'CREATE TABLE songs (track_id text PRIMARY KEY, title text, song_id text, '
q = 'CREATE TABLE songs (track_id text PRIMARY KEY, '
q += 'title text, song_id text, '
q += 'release text, artist_id text, artist_mbid text, artist_name text, '
q += 'duration real, artist_familiarity real, artist_hotttnesss real, year int, '
q += 'duration real, artist_familiarity real, '
q += 'artist_hotttnesss real, year int, '
q += 'track_7digitalid int, shs_perf int, shs_work int)'
c.execute(q)
# commit and close
Expand All @@ -68,8 +70,7 @@ def create_db(filename):
conn.close()



def fill_from_h5(conn,h5path,verbose=0):
def fill_from_h5(conn, h5path, verbose=0):
"""
Add a row with the information from this .h5 file
Doesn't commit, doesn't close conn at the end!
Expand All @@ -81,27 +82,27 @@ def fill_from_h5(conn,h5path,verbose=0):
track_id = get_track_id(h5)
q += encode_string(track_id)
title = get_title(h5)
q += ', '+encode_string(title)
q += ', ' + encode_string(title)
song_id = get_song_id(h5)
q += ', '+encode_string(song_id)
q += ', ' + encode_string(song_id)
release = get_release(h5)
q += ', '+encode_string(release)
q += ', ' + encode_string(release)
artist_id = get_artist_id(h5)
q += ', '+encode_string(artist_id)
q += ', ' + encode_string(artist_id)
artist_mbid = get_artist_mbid(h5)
q += ', '+encode_string(artist_mbid)
q += ', ' + encode_string(artist_mbid)
artist_name = get_artist_name(h5)
q += ', '+encode_string(artist_name)
q += ', ' + encode_string(artist_name)
duration = get_duration(h5)
q += ", "+str(duration) if not np.isnan(duration) else ",-1"
q += ", " + str(duration) if not np.isnan(duration) else ",-1"
familiarity = get_artist_familiarity(h5)
q += ", "+str(familiarity) if not np.isnan(familiarity) else ",-1"
q += ", " + str(familiarity) if not np.isnan(familiarity) else ",-1"
hotttnesss = get_artist_hotttnesss(h5)
q += ", "+str(hotttnesss) if not np.isnan(hotttnesss) else ",-1"
q += ", " + str(hotttnesss) if not np.isnan(hotttnesss) else ",-1"
year = get_year(h5)
q += ", "+str(year)
q += ", " + str(year)
track_7digitalid = get_track_7digitalid(h5)
q += ", "+str(track_7digitalid)
q += ", " + str(track_7digitalid)
# add empty fields for shs perf then work
q += ", -1, 0"
# query done, close h5, commit
Expand All @@ -114,70 +115,94 @@ def fill_from_h5(conn,h5path,verbose=0):
c.close()


def add_indices_to_db(conn,verbose=0):
def add_indices_to_db(conn, verbose=0):
"""
Since the db is considered final, we can add all sorts of indices
to make sure the retrieval time is as fast as possible.
Indices take up a little space, but they hurt performance only when
we modify the data (which should not happen)
This function commits its changes at the end
You might want to add your own indices if you do weird queries, e.g. on title
and artist musicbrainz ID.
Indices should be on the columns of the WHERE of your search, the goal is to
quickly find the few rows that match the query. The index does not care of the
field (column) you actually want, finding the row is the important step.
Indices should be on the columns of the WHERE of your search, the goal
is to quickly find the few rows that match the query. The index does not
care about the field (column) you actually want; finding the row is the
important step.
track_id is implicitly indexed as it is the PRIMARY KEY of the table.
Note: tutorial on MySQL (close enough to SQLite):
http://www.databasejournal.com/features/mysql/article.php/10897_1382791_1/Optimizing-MySQL-Queries-and-Indexes.htm
http://www.databasejournal.com/features/mysql/article.php/10897_1382791_1/
Optimizing-MySQL-Queries-and-Indexes.htm
"""
c = conn.cursor()
# index to search by (artist_id) or by (artist_id,release)
q = "CREATE INDEX idx_artist_id ON songs ('artist_id','release')"
if verbose > 0: print q
if verbose > 0:
print q
c.execute(q)
# index to search by (artist_mbid) or by (artist_mbid,release)
q = "CREATE INDEX idx_artist_mbid ON songs ('artist_mbid','release')"
if verbose > 0: print q
if verbose > 0:
print q
c.execute(q)
# index to search by (artist_familiarity) or by (artist_familiarity,artist_hotttnesss)
q = "CREATE INDEX idx_familiarity ON songs ('artist_familiarity','artist_hotttnesss')"
if verbose > 0: print q
# index to search by (artist_familiarity)
# or by (artist_familiarity,artist_hotttnesss)
q = "CREATE INDEX idx_familiarity ON songs "
q += "('artist_familiarity','artist_hotttnesss')"
if verbose > 0:
print q
c.execute(q)
# index to search by (artist_hotttnesss) or by (artist_hotttnesss,artist_familiarity)
q = "CREATE INDEX idx_hotttnesss ON songs ('artist_hotttnesss','artist_familiarity')"
if verbose > 0: print q
# index to search by (artist_hotttnesss)
# or by (artist_hotttnesss,artist_familiarity)
q = "CREATE INDEX idx_hotttnesss ON songs "
q += "('artist_hotttnesss','artist_familiarity')"
if verbose > 0:
print q
c.execute(q)
# index to search by (artist_name) or by (artist_name,title) or by (artist_name,title,release)
q = "CREATE INDEX idx_artist_name ON songs ('artist_name','title','release')"
if verbose > 0: print q
# index to search by (artist_name)
# or by (artist_name,title) or by (artist_name,title,release)
q = "CREATE INDEX idx_artist_name ON songs "
q += "('artist_name','title','release')"
if verbose > 0:
print q
c.execute(q)
# index to search by (title) or by (title,artist_name) or by (title,artist_name,release)
# index to search by (title)
# or by (title,artist_name) or by (title,artist_name,release)
q = "CREATE INDEX idx_title ON songs ('title','artist_name','release')"
if verbose > 0: print q
if verbose > 0:
print q
c.execute(q)
# index to search by (release) or by (release,artist_name) or by (release,artist_name,title)
# index to search by (release)
# or by (release,artist_name) or by (release,artist_name,title)
q = "CREATE INDEX idx_release ON songs ('release','artist_name','title')"
if verbose > 0: print q
# index to search by (duration) or by (duration,artist_id)
if verbose > 0:
print q
# index to search by (duration)
# or by (duration,artist_id)
q = "CREATE INDEX idx_duration ON songs ('duration','artist_id')"
if verbose > 0: print q
if verbose > 0:
print q
c.execute(q)
# index to search by (year) or by (year,artist_id) or by (year,artist_id,title)
# index to search by (year)
# or by (year,artist_id) or by (year,artist_id,title)
q = "CREATE INDEX idx_year ON songs ('year','artist_id','title')"
if verbose > 0: print q
if verbose > 0:
print q
c.execute(q)
# index to search by (year) or by (year,artist_name)
q = "CREATE INDEX idx_year2 ON songs ('year','artist_name')"
if verbose > 0: print q
if verbose > 0:
print q
c.execute(q)
# index to search by (shs_work)
q = "CREATE INDEX idx_shs_work ON songs ('shs_work')"
if verbose > 0: print q
if verbose > 0:
print q
c.execute(q)
# index to search by (shs_perf)
q = "CREATE INDEX idx_shs_perf ON songs ('shs_perf')"
if verbose > 0: print q
if verbose > 0:
print q
c.execute(q)
# done, commit
conn.commit()
Expand All @@ -187,12 +212,13 @@ def die_with_usage():
""" HELP MENU """
print 'Command to create the track_metadata SQLite database'
print 'to launch (it might take a while!):'
print ' python create_track_metadata_db.py [FLAGS] <MillionSong dir> <track_metadata.db>'
print ' python create_track_metadata_db.py [FLAGS] <MSD dir> <tmdb>'
print 'PARAMS'
print ' MillionSong dir - directory containing .h5 song files in sub dirs'
print ' track_metadata.db - filename for the database'
print ' MSD dir - directory containing .h5 song files in sub dirs'
print ' tmdb - filename for the database (track_metadata.db)'
print 'FLAGS'
print ' -shsdata f - file containing the SHS dataset (concatenate train and test)'
print ' -shsdata f - file containing the SHS dataset'
print ' (you can simply concatenate train and test)'
print ' -verbose - print every query'
sys.exit(0)

Expand All @@ -205,12 +231,13 @@ def die_with_usage():

# import HDF5 stuff
# yes, it is worth of a WTF like this last one:
# http://thedailywtf.com/Articles/CompareObjectAsIAlertDocumentOrNullIfNotCastable-and-More.aspx
# http://thedailywtf.com/
# Articles/CompareObjectAsIAlertDocumentOrNullIfNotCastable-and-More.aspx
# but I plan to buy some bad code offsets anyway
# http://codeoffsets.com/
pythonsrc = os.path.join(sys.argv[0],'../../../PythonSrc')
pythonsrc = os.path.abspath( pythonsrc )
sys.path.append( pythonsrc )
pythonsrc = os.path.join(sys.argv[0], '../../../PythonSrc')
pythonsrc = os.path.abspath(pythonsrc)
sys.path.append(pythonsrc)
from hdf5_getters import *

verbose = 0
Expand Down Expand Up @@ -247,21 +274,21 @@ def die_with_usage():

# open connection
conn = sqlite3.connect(dbfile)

# iterate HDF5 files
cnt_files = 0
for root, dirs, files in os.walk(maindir):
files = glob.glob(os.path.join(root,'*.h5'))
for f in files :
fill_from_h5(conn,f,verbose=verbose)
files = glob.glob(os.path.join(root, '*.h5'))
for f in files:
fill_from_h5(conn, f, verbose=verbose)
cnt_files += 1
if cnt_files % 50 == 0:
conn.commit() # we commit only every 50 files!
if cnt_files % 200 == 0:
conn.commit() # we commit only every 200 files!
conn.commit()
t2 = time.time()
stimelength = str(datetime.timedelta(seconds=t2-t1))
print 'added the content of',cnt_files,'files to database:',dbfile
print 'it took:',stimelength
stimelength = str(datetime.timedelta(seconds=t2 - t1))
print 'added the content of', cnt_files, 'files to database:', dbfile
print 'it took:', stimelength

# add SHS data
if shsdataset != '':
Expand All @@ -275,32 +302,36 @@ def die_with_usage():
continue
# work
if line[0] == '%':
works = map(lambda w: int(w), line[1:].split(' ')[0].split(',')[:-1])
works = map(lambda w: int(w),
line[1:].split(' ')[0].split(',')[:-1])
work = min(works)
continue
# regular line
tid,aid,perf = line.strip().split('<SEP>')
tid, aid, perf = line.strip().split('<SEP>')
q = "UPDATE songs SET shs_perf=" + perf + ", shs_work=" + str(work)
q += " WHERE track_id='" + tid + "'"
if verbose > 0: print q
if verbose > 0:
print q
conn.execute(q)
# iteration done
shs.close()
conn.commit()



# add indices
c = conn.cursor()
res = c.execute('SELECT Count(*) FROM songs')
nrows_before = res.fetchall()[0][0]
add_indices_to_db(conn,verbose=verbose)
add_indices_to_db(conn, verbose=verbose)
res = c.execute('SELECT Count(*) FROM songs')
nrows_after = res.fetchall()[0][0]
c.close()
assert nrows_before == nrows_after,'you lost rows during indexing???' # sanity check
# sanity check
assert nrows_before == nrows_after, 'Lost rows during indexing?'
if nrows_before != 1000000:
print 'we got',nrows_before,'rows, this is not the full MillionSongDataset, just checking...'
print '*********************************************************'
print 'We got', nrows_before, 'rows.'
print 'This is not the full MillionSongDataset! just checking...'
print '*********************************************************'

# close connection
conn.close()
Expand All @@ -309,7 +340,6 @@ def die_with_usage():
t3 = time.time()

# DONE
print 'done! (indices included) database:',dbfile
stimelength = str(datetime.timedelta(seconds=t3-t1))
print 'done! (indices included) database:', dbfile
stimelength = str(datetime.timedelta(seconds=t3 - t1))
print 'execution time:', stimelength

0 comments on commit c577c80

Please sign in to comment.