Skip to content

Commit

Permalink
pep8 compliant
Browse files Browse the repository at this point in the history
  • Loading branch information
tbertinmahieux committed Mar 26, 2011
1 parent a5c6edf commit c577c80
Showing 1 changed file with 102 additions and 72 deletions.
174 changes: 102 additions & 72 deletions Tasks_Demos/SQLite/create_track_metadata_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def encode_string(s):
EXAMPLE:
That's my boy! -> 'That''s my boy!'
"""
return "'"+s.replace("'","''")+"'"
return "'" + s.replace("'", "''") + "'"


def create_db(filename):
Expand All @@ -57,9 +57,11 @@ def create_db(filename):
conn = sqlite3.connect(filename)
# add stuff
c = conn.cursor()
q = 'CREATE TABLE songs (track_id text PRIMARY KEY, title text, song_id text, '
q = 'CREATE TABLE songs (track_id text PRIMARY KEY, '
q += 'title text, song_id text, '
q += 'release text, artist_id text, artist_mbid text, artist_name text, '
q += 'duration real, artist_familiarity real, artist_hotttnesss real, year int, '
q += 'duration real, artist_familiarity real, '
q += 'artist_hotttnesss real, year int, '
q += 'track_7digitalid int, shs_perf int, shs_work int)'
c.execute(q)
# commit and close
Expand All @@ -68,8 +70,7 @@ def create_db(filename):
conn.close()



def fill_from_h5(conn,h5path,verbose=0):
def fill_from_h5(conn, h5path, verbose=0):
"""
Add a row with the information from this .h5 file
Doesn't commit, doesn't close conn at the end!
Expand All @@ -81,27 +82,27 @@ def fill_from_h5(conn,h5path,verbose=0):
track_id = get_track_id(h5)
q += encode_string(track_id)
title = get_title(h5)
q += ', '+encode_string(title)
q += ', ' + encode_string(title)
song_id = get_song_id(h5)
q += ', '+encode_string(song_id)
q += ', ' + encode_string(song_id)
release = get_release(h5)
q += ', '+encode_string(release)
q += ', ' + encode_string(release)
artist_id = get_artist_id(h5)
q += ', '+encode_string(artist_id)
q += ', ' + encode_string(artist_id)
artist_mbid = get_artist_mbid(h5)
q += ', '+encode_string(artist_mbid)
q += ', ' + encode_string(artist_mbid)
artist_name = get_artist_name(h5)
q += ', '+encode_string(artist_name)
q += ', ' + encode_string(artist_name)
duration = get_duration(h5)
q += ", "+str(duration) if not np.isnan(duration) else ",-1"
q += ", " + str(duration) if not np.isnan(duration) else ",-1"
familiarity = get_artist_familiarity(h5)
q += ", "+str(familiarity) if not np.isnan(familiarity) else ",-1"
q += ", " + str(familiarity) if not np.isnan(familiarity) else ",-1"
hotttnesss = get_artist_hotttnesss(h5)
q += ", "+str(hotttnesss) if not np.isnan(hotttnesss) else ",-1"
q += ", " + str(hotttnesss) if not np.isnan(hotttnesss) else ",-1"
year = get_year(h5)
q += ", "+str(year)
q += ", " + str(year)
track_7digitalid = get_track_7digitalid(h5)
q += ", "+str(track_7digitalid)
q += ", " + str(track_7digitalid)
# add empty fields for shs perf then work
q += ", -1, 0"
# query done, close h5, commit
Expand All @@ -114,70 +115,94 @@ def fill_from_h5(conn,h5path,verbose=0):
c.close()


def add_indices_to_db(conn,verbose=0):
def add_indices_to_db(conn, verbose=0):
"""
Since the db is considered final, we can add all sorts of indices
to make sure the retrieval time is as fast as possible.
Indices take up a little space, but they hurt performance only when
we modify the data (which should not happen)
This function commits its changes at the end
You might want to add your own indices if you do weird queries, e.g. on title
and artist musicbrainz ID.
Indices should be on the columns of the WHERE of your search, the goal is to
quickly find the few rows that match the query. The index does not care of the
field (column) you actually want, finding the row is the important step.
Indices should be on the columns of the WHERE of your search, the goal
is to quickly find the few rows that match the query. The index does not
care about the field (column) you actually want; finding the row is the
important step.
track_id is implicitly indexed as it is the PRIMARY KEY of the table.
Note: tutorial on MySQL (close enough to SQLite):
http://www.databasejournal.com/features/mysql/article.php/10897_1382791_1/Optimizing-MySQL-Queries-and-Indexes.htm
http://www.databasejournal.com/features/mysql/article.php/10897_1382791_1/
Optimizing-MySQL-Queries-and-Indexes.htm
"""
c = conn.cursor()
# index to search by (artist_id) or by (artist_id,release)
q = "CREATE INDEX idx_artist_id ON songs ('artist_id','release')"
if verbose > 0: print q
if verbose > 0:
print q
c.execute(q)
# index to search by (artist_mbid) or by (artist_mbid,release)
q = "CREATE INDEX idx_artist_mbid ON songs ('artist_mbid','release')"
if verbose > 0: print q
if verbose > 0:
print q
c.execute(q)
# index to search by (artist_familiarity) or by (artist_familiarity,artist_hotttnesss)
q = "CREATE INDEX idx_familiarity ON songs ('artist_familiarity','artist_hotttnesss')"
if verbose > 0: print q
# index to search by (artist_familiarity)
# or by (artist_familiarity,artist_hotttnesss)
q = "CREATE INDEX idx_familiarity ON songs "
q += "('artist_familiarity','artist_hotttnesss')"
if verbose > 0:
print q
c.execute(q)
# index to search by (artist_hotttnesss) or by (artist_hotttnesss,artist_familiarity)
q = "CREATE INDEX idx_hotttnesss ON songs ('artist_hotttnesss','artist_familiarity')"
if verbose > 0: print q
# index to search by (artist_hotttnesss)
# or by (artist_hotttnesss,artist_familiarity)
q = "CREATE INDEX idx_hotttnesss ON songs "
q += "('artist_hotttnesss','artist_familiarity')"
if verbose > 0:
print q
c.execute(q)
# index to search by (artist_name) or by (artist_name,title) or by (artist_name,title,release)
q = "CREATE INDEX idx_artist_name ON songs ('artist_name','title','release')"
if verbose > 0: print q
# index to search by (artist_name)
# or by (artist_name,title) or by (artist_name,title,release)
q = "CREATE INDEX idx_artist_name ON songs "
q += "('artist_name','title','release')"
if verbose > 0:
print q
c.execute(q)
# index to search by (title) or by (title,artist_name) or by (title,artist_name,release)
# index to search by (title)
# or by (title,artist_name) or by (title,artist_name,release)
q = "CREATE INDEX idx_title ON songs ('title','artist_name','release')"
if verbose > 0: print q
if verbose > 0:
print q
c.execute(q)
# index to search by (release) or by (release,artist_name) or by (release,artist_name,title)
# index to search by (release)
# or by (release,artist_name) or by (release,artist_name,title)
q = "CREATE INDEX idx_release ON songs ('release','artist_name','title')"
if verbose > 0: print q
# index to search by (duration) or by (duration,artist_id)
if verbose > 0:
print q
# index to search by (duration)
# or by (duration,artist_id)
q = "CREATE INDEX idx_duration ON songs ('duration','artist_id')"
if verbose > 0: print q
if verbose > 0:
print q
c.execute(q)
# index to search by (year) or by (year,artist_id) or by (year,artist_id,title)
# index to search by (year)
# or by (year,artist_id) or by (year,artist_id,title)
q = "CREATE INDEX idx_year ON songs ('year','artist_id','title')"
if verbose > 0: print q
if verbose > 0:
print q
c.execute(q)
# index to search by (year) or by (year,artist_name)
q = "CREATE INDEX idx_year2 ON songs ('year','artist_name')"
if verbose > 0: print q
if verbose > 0:
print q
c.execute(q)
# index to search by (shs_work)
q = "CREATE INDEX idx_shs_work ON songs ('shs_work')"
if verbose > 0: print q
if verbose > 0:
print q
c.execute(q)
# index to search by (shs_perf)
q = "CREATE INDEX idx_shs_perf ON songs ('shs_perf')"
if verbose > 0: print q
if verbose > 0:
print q
c.execute(q)
# done, commit
conn.commit()
Expand All @@ -187,12 +212,13 @@ def die_with_usage():
""" HELP MENU """
print 'Command to create the track_metadata SQLite database'
print 'to launch (it might take a while!):'
print ' python create_track_metadata_db.py [FLAGS] <MillionSong dir> <track_metadata.db>'
print ' python create_track_metadata_db.py [FLAGS] <MSD dir> <tmdb>'
print 'PARAMS'
print ' MillionSong dir - directory containing .h5 song files in sub dirs'
print ' track_metadata.db - filename for the database'
print ' MSD dir - directory containing .h5 song files in sub dirs'
print ' tmdb - filename for the database (track_metadata.db)'
print 'FLAGS'
print ' -shsdata f - file containing the SHS dataset (concatenate train and test)'
print ' -shsdata f - file containing the SHS dataset'
print ' (you can simply concatenate train and test)'
print ' -verbose - print every query'
sys.exit(0)

Expand All @@ -205,12 +231,13 @@ def die_with_usage():

# import HDF5 stuff
# yes, it is worth of a WTF like this last one:
# http://thedailywtf.com/Articles/CompareObjectAsIAlertDocumentOrNullIfNotCastable-and-More.aspx
# http://thedailywtf.com/
# Articles/CompareObjectAsIAlertDocumentOrNullIfNotCastable-and-More.aspx
# but I plan to buy some bad code offsets anyway
# http://codeoffsets.com/
pythonsrc = os.path.join(sys.argv[0],'../../../PythonSrc')
pythonsrc = os.path.abspath( pythonsrc )
sys.path.append( pythonsrc )
pythonsrc = os.path.join(sys.argv[0], '../../../PythonSrc')
pythonsrc = os.path.abspath(pythonsrc)
sys.path.append(pythonsrc)
from hdf5_getters import *

verbose = 0
Expand Down Expand Up @@ -247,21 +274,21 @@ def die_with_usage():

# open connection
conn = sqlite3.connect(dbfile)

# iterate HDF5 files
cnt_files = 0
for root, dirs, files in os.walk(maindir):
files = glob.glob(os.path.join(root,'*.h5'))
for f in files :
fill_from_h5(conn,f,verbose=verbose)
files = glob.glob(os.path.join(root, '*.h5'))
for f in files:
fill_from_h5(conn, f, verbose=verbose)
cnt_files += 1
if cnt_files % 50 == 0:
conn.commit() # we commit only every 50 files!
if cnt_files % 200 == 0:
conn.commit() # we commit only every 200 files!
conn.commit()
t2 = time.time()
stimelength = str(datetime.timedelta(seconds=t2-t1))
print 'added the content of',cnt_files,'files to database:',dbfile
print 'it took:',stimelength
stimelength = str(datetime.timedelta(seconds=t2 - t1))
print 'added the content of', cnt_files, 'files to database:', dbfile
print 'it took:', stimelength

# add SHS data
if shsdataset != '':
Expand All @@ -275,32 +302,36 @@ def die_with_usage():
continue
# work
if line[0] == '%':
works = map(lambda w: int(w), line[1:].split(' ')[0].split(',')[:-1])
works = map(lambda w: int(w),
line[1:].split(' ')[0].split(',')[:-1])
work = min(works)
continue
# regular line
tid,aid,perf = line.strip().split('<SEP>')
tid, aid, perf = line.strip().split('<SEP>')
q = "UPDATE songs SET shs_perf=" + perf + ", shs_work=" + str(work)
q += " WHERE track_id='" + tid + "'"
if verbose > 0: print q
if verbose > 0:
print q
conn.execute(q)
# iteration done
shs.close()
conn.commit()



# add indices
c = conn.cursor()
res = c.execute('SELECT Count(*) FROM songs')
nrows_before = res.fetchall()[0][0]
add_indices_to_db(conn,verbose=verbose)
add_indices_to_db(conn, verbose=verbose)
res = c.execute('SELECT Count(*) FROM songs')
nrows_after = res.fetchall()[0][0]
c.close()
assert nrows_before == nrows_after,'you lost rows during indexing???' # sanity check
# sanity check
assert nrows_before == nrows_after, 'Lost rows during indexing?'
if nrows_before != 1000000:
print 'we got',nrows_before,'rows, this is not the full MillionSongDataset, just checking...'
print '*********************************************************'
print 'We got', nrows_before, 'rows.'
print 'This is not the full MillionSongDataset! just checking...'
print '*********************************************************'

# close connection
conn.close()
Expand All @@ -309,7 +340,6 @@ def die_with_usage():
t3 = time.time()

# DONE
print 'done! (indices included) database:',dbfile
stimelength = str(datetime.timedelta(seconds=t3-t1))
print 'done! (indices included) database:', dbfile
stimelength = str(datetime.timedelta(seconds=t3 - t1))
print 'execution time:', stimelength

0 comments on commit c577c80

Please sign in to comment.