In [811]:
import time
import hdf5_getters

from ml_pipeline import *
from model_common import *

# Code for processing Million Song dataset follows

In [None]:
# Sample sizes handed to the model nodes through their "nsamples_list"
# property (see the setprop calls in the model cells further down).
# NOTE(review): presumably the training-set sizes for a learning curve -- confirm
# against the model node implementation.
nsamples_list = [
    5, 10, 20, 50, 100, 250, 500,
    1000, 5000, 10000, 20000, 30000, 40000, 50000, 75000, 100000,
]

class CollectH5Files(MLRaw):
    """Pipeline node that collects the paths of usable HDF5 song files.

    A visited file is kept when its MIME type equals ``self.mime_type`` and the
    year read through ``hdf5_getters.get_year`` lies in the range 1800-2100
    (inclusive). Kept paths accumulate in ``self.h5_files``.

    Tunable attributes (assigned after construction; presumably through the
    pipeline's ``setprop`` mechanism -- confirm against the framework):
        nsongs: maximum number of files to collect; ``None`` means no limit.
        max_process: maximum number of files to visit; ``None`` means no limit.
    """

    def __init__(self):
        super(CollectH5Files, self).__init__()
        self.class_name = "CollectH5Files"
        self.mime_type = 'application/x-hdf'
        self.nsongs = None
        self.max_process = None
        self.h5_files = []

        # JSON descriptions of the keys this node consumes and produces.
        self.istr_jsons = json.dumps({"final_loc" : ""})
        self.ostr_jsons = json.dumps({"h5_files" : ""})

    def do_walk(self, filepath, input_data):
        """Inspect one file and record it if it is a usable song file.

        Args:
            filepath: path of the file currently being visited.
            input_data: shared dict; its "count" entry is incremented for
                every visited file, whatever its type.

        Returns:
            ``{"stop": True}`` once either limit (``nsongs`` collected files or
            ``max_process`` visited files) is reached, ``{"stop": False}``
            otherwise.
        """
        input_data["count"] += 1
        if get_file_type(filepath, mime=True) != self.mime_type:
            return {"stop" : False}

        # Enforce the optional limits; a limit left at None is not applied.
        if self.nsongs is not None and len(self.h5_files) >= self.nsongs:
            return {"stop" : True}

        if self.max_process is not None and input_data["count"] > self.max_process:
            return {"stop" : True}

        h5 = hdf5_getters.open_h5_file_read(filepath)
        try:
            song_year = int(hdf5_getters.get_year(h5).item())
        finally:
            # Close the handle even when reading the year fails.
            h5.close()

        # Keep only songs with a plausible year; this also drops year == 0.
        if 1800 <= song_year <= 2100:
            self.h5_files.append(filepath)

        return {"stop" : False}

    def do_run(self, input_data, traversal):
        """Walk ``input_data['final_loc']`` and publish the collected paths.

        On the "POST" traversal the node only guarantees that an "h5_files"
        key exists. Otherwise it walks the tree via ``self.walk_files`` and
        stores the collected list under ``input_data['h5_files']``.

        Args:
            input_data: shared pipeline dict; must contain "final_loc" when
                ``traversal`` is not "POST".
            traversal: traversal phase name; only "POST" is special-cased.

        Returns:
            The same ``input_data`` dict, updated in place.
        """
        if traversal == "POST":
            if "h5_files" not in input_data:
                input_data["h5_files"] = ""
            return input_data

        self.final_loc = input_data['final_loc']

        # "count" is a scratch counter used by do_walk; never leave it behind,
        # even if the walk raises.
        input_data["count"] = 0
        try:
            self.walk_files(self.final_loc, input_data)
        finally:
            del input_data["count"]

        # NOTE(review): self.h5_files is not reset here, so running the same
        # instance twice appends to the earlier result -- confirm this is intended.
        input_data["final_loc"] = self.final_loc
        input_data['h5_files'] = self.h5_files

        return input_data
        
class ExtractTrackData(MLDerive):
    """Pipeline node that turns collected HDF5 song files into train/test arrays.

    Per-track fields are read with ``hdf5_getters``; the feature matrix and
    label vector are built by ``self.extract_X_y``. The year label is converted
    to a boolean (``year > 2000``) and the data is split into a leading
    training part and a trailing test part according to ``self.train_frac``.

    Tunable attributes (assigned after construction; presumably through the
    pipeline's ``setprop`` mechanism -- confirm against the framework):
        train_frac: fraction of the examples placed in the training split.
    """

    def __init__(self):
        super(ExtractTrackData, self).__init__()
        self.class_name = "ExtractTrackData"
        self.mime_type = "application/x-hdf"
        self.h5_files = None
        self.nsongs = None
        self.train_frac = None

        # JSON descriptions of the keys this node consumes and produces.
        self.istr_jsons = json.dumps({"final_loc" : "", "h5_files" : ""})
        self.ostr_jsons = json.dumps({"Xtrain" : "", "ytrain" : "", "Xtest" : "", "ytest" : ""})

    def get_track_data(self, filepath):
        """Read the fields of interest from one HDF5 song file.

        Args:
            filepath: path of the HDF5 file to read.

        Returns:
            dict with the keys 'year', 'danceability', 'loudness',
            'track_7digitalid', 'energy', 'tempo', 'end_fade_in' and
            'start_fade_out', each holding the scalar from the matching getter.
        """
        h5 = hdf5_getters.open_h5_file_read(filepath)
        try:
            return {
                'year': hdf5_getters.get_year(h5).item(),
                'danceability': hdf5_getters.get_danceability(h5).item(),
                'loudness': hdf5_getters.get_loudness(h5).item(),
                'track_7digitalid': hdf5_getters.get_track_7digitalid(h5).item(),
                'energy': hdf5_getters.get_energy(h5).item(),
                'tempo': hdf5_getters.get_tempo(h5).item(),
                'end_fade_in': hdf5_getters.get_end_of_fade_in(h5).item(),
                'start_fade_out': hdf5_getters.get_start_of_fade_out(h5).item(),
            }
        finally:
            # Close the handle even when one of the getters fails.
            h5.close()

    def do_run(self, input_data, traversal):
        """Build the train/test split from ``input_data['h5_files']``.

        On the "POST" traversal the node only guarantees that the four output
        keys exist. Otherwise it extracts X and y, binarises y and stores
        "Xtrain", "ytrain", "Xtest", "ytest" (plus "nsongs") in ``input_data``.

        Args:
            input_data: shared pipeline dict; must contain "h5_files" when
                ``traversal`` is not "POST".
            traversal: traversal phase name; only "POST" is special-cased.

        Returns:
            The same ``input_data`` dict, updated in place.
        """
        if traversal == "POST":
            for key in ("Xtrain", "ytrain", "Xtest", "ytest"):
                input_data.setdefault(key, "")
            return input_data

        self.h5_files = input_data['h5_files']

        self.nsongs = input_data['nsongs'] = len(self.h5_files)

        X, y = self.extract_X_y(self.get_track_data, input_data, self.h5_files)

        # y is returned as a 2D array to support multilabel prediction. In our case we only have 1 label per example
        # so reshape into 1D array
        y = y.reshape((y.shape[0]))

        # We are interested in classifying songs by release year: the label is
        # True for years strictly after 2000 and False otherwise.
        dprint(DPRINT_DEBUG, "Before transform: y.shape=" + str(y.shape))
        y = (y > 2000)
        dprint(DPRINT_DEBUG, "After transform: y.shape=" + str(y.shape))

        # The split point is derived from the number of input files.
        # NOTE(review): assumes extract_X_y yields one row per file -- confirm.
        train_limit = int(self.train_frac * self.nsongs)

        # Let the parent add the useless "features" key
        input_data["Xtrain"], input_data["ytrain"] = X[0:train_limit], y[0:train_limit]
        input_data["Xtest"], input_data["ytest"] = X[train_limit:], y[train_limit:]

        dprint(DPRINT_INFO,
            "Xtrain.shape=" + str(input_data["Xtrain"].shape) + ", ytrain.shape=" + str(input_data["ytrain"].shape))
        dprint(DPRINT_INFO,
            "Xtest.shape=" + str(input_data["Xtest"].shape) + ", ytest.shape=" + str(input_data["ytest"].shape))

        return input_data

# Gaussian Naive Bayes on Million Song Dataset

In [None]:
# `time` is imported once in the notebook's first cell; no per-cell import needed.
start_time = time.time()

feature_list = ['danceability', 'loudness', 'energy', 'tempo', 'end_fade_in', 'start_fade_out']
label_list = ['year']
myML = MLRoot()
myML.mount(mount_spec = "./millsong_GaussNB_mounts.json")
try:
    myML.print_tree()
    # Compile against an input description with an empty placeholder value,
    # then fill in the real location for the run.
    input_data = {}
    input_data["remote_loc"] = ""
    myML.compile(json.dumps(input_data))
    input_data["remote_loc"] = "/MillionSong"
    # Set maximum size of data we want to process: 100 GB
    myML.setprop("/root/fetch", {"max_size" : 100})
    # Maximum number of files to process: 300,000
    myML.setprop("/root/fetch/raw/collect_h5", {"max_process" : 300000})
    # Maximum number of examples (training+test) to collect: 150,000
    myML.setprop("/root/fetch/raw/collect_h5", {"nsongs" : 150000})
    myML.setprop("/root/derive/millsong_extract", {"train_frac" : 0.7})
    myML.setprop("/root/derive/millsong_extract", {"X_map" : feature_list})
    myML.setprop("/root/derive/millsong_extract", {"y_map" : label_list})
    myML.setprop("/root/model/gauss_nb", {"nsamples_list" : nsamples_list})
    myML.run(input_data)
finally:
    # Pair every successful mount with an umount, even when a step above fails.
    myML.umount()
dprint(DPRINT_INFO, "Total Time taken: " + str(time.time() - start_time))

# Logistic Regression on Million Song Dataset

In [None]:
# `time` is imported once in the notebook's first cell; no per-cell import needed.
start_time = time.time()

feature_list = ['danceability', 'loudness', 'energy', 'tempo', 'end_fade_in', 'start_fade_out']
label_list = ['year']
myML = MLRoot()
myML.mount(mount_spec = "./millsong_logistic_mounts.json")
try:
    myML.print_tree()
    # Compile against an input description with an empty placeholder value,
    # then fill in the real location for the run.
    input_data = {}
    input_data["remote_loc"] = ""
    myML.compile(json.dumps(input_data))
    input_data["remote_loc"] = "/MillionSong"
    # Set maximum size of data we want to process: 100 GB
    myML.setprop("/root/fetch", {"max_size" : 100})
    # Maximum number of files to process: 300,000
    myML.setprop("/root/fetch/raw/collect_h5", {"max_process" : 300000})
    # Maximum number of examples (training+test) to collect: 150,000
    myML.setprop("/root/fetch/raw/collect_h5", {"nsongs" : 150000})
    myML.setprop("/root/derive/millsong_extract", {"train_frac" : 0.7})
    myML.setprop("/root/derive/millsong_extract", {"X_map" : feature_list})
    myML.setprop("/root/derive/millsong_extract", {"y_map" : label_list})
    myML.setprop("/root/model/logistic", {"nsamples_list" : nsamples_list})
    myML.run(input_data)
finally:
    # Pair every successful mount with an umount, even when a step above fails.
    myML.umount()
dprint(DPRINT_INFO, "Total Time taken: " + str(time.time() - start_time))