In [249]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

class DataReaderRecSys2018:
    """Load the interactions / tracks / playlists CSV files as matrices and arrays.

    The files are tab-separated and are located by plain string concatenation
    ``path_csv + <file name>``; ``path_csv`` must therefore be empty or end
    with a path separator.

    Every getter accepts ``keep_in_memory`` to cache its result on the
    instance and ``verbose`` to print progress messages. All sparse matrices
    are returned in CSR format.

    Any failure while reading a CSV file (missing file, missing column,
    unparsable value, ...) is reported as ``FileNotFoundError``; the original
    exception is attached as ``__cause__``.
    """

    separator = '\t'
    dtype_int = np.int32
    dtype_str = str
    dtype_bool = np.bool_
    interactions_file = 'interactions.csv'
    tracks_file = 'tracks.csv'
    playlists_file = 'playlists.csv'

    def __init__(self, evaluation = True, path_csv = '', verbose = True):
        """Store the reader configuration; no file is read here.

        Args:
            evaluation: stored on the instance; not used by any method of this class.
            path_csv: prefix prepended verbatim to every CSV file name.
            verbose: if truthy, print a reminder about the matrix format.
        """
        if verbose: print('IMPORTANT: all the returned matrices are in CSR format!!!')
        self.evaluation = evaluation
        self.path_csv = path_csv

    # ------------------------------------------------------------------ helpers

    def _read_csv(self, file_name, dtypes, what, done_msg, verbose):
        """Read the columns listed in ``dtypes`` (column -> dtype) from one CSV file.

        Raises:
            FileNotFoundError: on any read/parse failure (see class docstring).
        """
        try:
            df = pd.read_csv(filepath_or_buffer=self.path_csv + file_name,
                             sep=self.separator, header=0,
                             usecols=list(dtypes), dtype=dtypes)
        except Exception as exc:
            # Same exception type and message as before, but the real cause is
            # chained and KeyboardInterrupt/SystemExit are no longer swallowed.
            raise FileNotFoundError('Error reading %s file, check path' % what) from exc
        if verbose: print(done_msg)
        return df

    def _free(self, *attributes):
        """Drop the given cached attributes from the instance, ignoring absent ones."""
        for attribute in attributes:
            self.__dict__.pop(attribute, None)

    # ------------------------------------------------------------------ matrices

    def getURM(self, keep_in_memory = False, verbose = True):
        """Build the playlists x tracks interaction matrix.

        A cell holds the number of rows of the interactions file with that
        (pid, tid) pair, so it can be greater than 1 (see ``info``).

        Returns:
            scipy.sparse.csr_matrix of shape (max(pid)+1, max(tid)+1).

        Raises:
            FileNotFoundError: if the interactions file cannot be read.
        """
        # if matrix already in memory return it
        if hasattr(self, 'urm'):
            if verbose: print('URM retreived from memory\t(%dx%d)'%(self.urm.shape))
            return self.urm
        int_t = self.dtype_int
        df = self._read_csv(self.interactions_file,
                            {'pid': int_t, 'tid': int_t, 'pos': int_t},
                            'interactions', 'DF interactions readed', verbose)
        playlists = df['pid'].values
        tracks = df['tid'].values
        n_interactions = tracks.size
        n_playlists = int(playlists.max()) + 1  # ids start from 0
        n_tracks = int(tracks.max()) + 1        # ids start from 0
        # duplicate (pid, tid) pairs are summed by the COO -> CSR conversion
        urm = sp.csr_matrix((np.ones(n_interactions), (playlists, tracks)),
                            shape=(n_playlists, n_tracks), dtype=int_t)
        if verbose: print('URM created (%dx%d) - %d interactions'%(n_playlists,n_tracks,n_interactions))
        if keep_in_memory:
            self.urm = urm
            if verbose: print('URM saved in memory')
        return urm

    def getICM(self, arid = False, alid = False, keep_in_memory = False, verbose = True):
        """Build the tracks x features matrix from artist and/or album ids.

        With both flags set the artist columns come first, followed by the
        album columns.

        Returns:
            scipy.sparse.csr_matrix with max(tid)+1 rows.

        Raises:
            ValueError: if neither ``arid`` nor ``alid`` is selected.
            FileNotFoundError: if the tracks file cannot be read.
        """
        # if no feature selected
        if not arid and not alid:
            raise ValueError('ERROR: no feature selected in ICM!!!')
        # the cached matrix is reused only if it was built with the same features
        if hasattr(self,'icm') and arid==self.flag_icm_arid and alid == self.flag_icm_alid:
            if verbose: print('ICM retreived from memory\t(%dx%d)'%(self.icm.shape))
            return self.icm
        int_t = self.dtype_int
        df = self._read_csv(self.tracks_file,
                            {'tid': int_t, 'arid': int_t, 'alid': int_t},
                            'tracks', 'DF tracks readed', verbose)
        tracks = df['tid'].values
        n_tracks = int(tracks.max()) + 1  # ids start from 0
        # One entry per file row: the data vector must match the index vectors
        # in length, which differs from n_tracks when some track id is missing.
        ones = np.ones(tracks.size)
        parts = []
        if arid:
            artists = df['arid'].values
            n_artists = int(artists.max()) + 1  # ids start from 0
            icm_ar = sp.csr_matrix((ones, (tracks, artists)), shape=(n_tracks, n_artists), dtype=int_t)
            if verbose: print('ICM artists created \t(%dx%d)'%(icm_ar.shape))
            parts.append(icm_ar)
        if alid:
            albums = df['alid'].values
            n_albums = int(albums.max()) + 1  # ids start from 0
            icm_al = sp.csr_matrix((ones, (tracks, albums)), shape=(n_tracks, n_albums), dtype=int_t)
            if verbose: print('ICM albums created \t(%dx%d)'%(icm_al.shape))
            parts.append(icm_al)
        if len(parts) == 2:
            # ask for CSR explicitly: without `format` the result type of
            # hstack depends on the SciPy version
            icm = sp.hstack(parts, format='csr')
            if verbose: print('ICM total created \t(%dx%d)'%(icm.shape))
        else:
            icm = parts[0]
        if keep_in_memory:
            self.icm = icm
            self.flag_icm_arid = arid
            self.flag_icm_alid = alid
            if verbose: print('ICM saved in memory')
        return icm

    # ------------------------------------------------------------------ info tables

    def getTracksInfo(self, name = False, duration = False, keep_in_memory = False, verbose = True):
        """Return a 2-D array with one row per track.

        Column 0 is the track id, followed by the track name (if ``name``)
        and the duration in ms (if ``duration``), in that order.

        Raises:
            FileNotFoundError: if the tracks file cannot be read.
        """
        name, duration = bool(name), bool(duration)
        # the cached array is reused only if it holds the same columns
        if (hasattr(self, 'tracks_info')
                and getattr(self, 'flag_tracks_name', None) == name
                and getattr(self, 'flag_tracks_duration', None) == duration):
            if verbose: print('Tracks info retreived from memory')
            return self.tracks_info
        df = self._read_csv(self.tracks_file,
                            {'tid': self.dtype_int, 'track_name': self.dtype_str,
                             'duration_ms': self.dtype_int},
                            'tracks', 'DF tracks readed', verbose)
        columns = ['tid']
        if name: columns.append('track_name')
        if duration: columns.append('duration_ms')
        t_info = np.array([df[col].values for col in columns]).T
        if keep_in_memory:
            self.tracks_info = t_info
            self.flag_tracks_name = name
            self.flag_tracks_duration = duration
            if verbose: print('Tracks info saved in memory')
        return t_info

    def getPlaylistsInfo(self, name = False, description = False, num_followers = False,collaborative = False,
                         num_albums = False, num_artists = False,modified_at = False, num_edits = False, duration_ms = False,
                         keep_in_memory = False, verbose = True):
        """Return a 2-D array with one row per playlist.

        Column 0 is the playlist id; every flag set to True appends the
        column of the same name, in the order of the signature. When
        ``verbose`` the resulting column order is printed.

        With ``keep_in_memory`` the source DataFrame (not the returned array)
        is cached, so later calls may select different columns without
        re-reading the file.

        Raises:
            FileNotFoundError: if the playlists file cannot be read.
        """
        if hasattr(self, 'df_playlists_info'):
            if verbose: print('Playlists info retreived from memory')
            df = self.df_playlists_info
        else:
            int_t = self.dtype_int
            dtypes = {'pid': int_t, 'name': self.dtype_str, 'description': self.dtype_str,
                      'num_followers': int_t, 'collaborative': self.dtype_bool,
                      'num_albums': int_t, 'num_artists': int_t, 'modified_at': int_t,
                      'num_edits': int_t, 'duration_ms': int_t}
            df = self._read_csv(self.playlists_file, dtypes,
                                'playlists', 'DF playlsits readed', verbose)
        # building info: pid first, then the requested columns in signature order
        requested = (('name', name), ('description', description),
                     ('num_followers', num_followers), ('collaborative', collaborative),
                     ('num_albums', num_albums), ('num_artists', num_artists),
                     ('modified_at', modified_at), ('num_edits', num_edits),
                     ('duration_ms', duration_ms))
        columns = ['pid'] + [col for col, wanted in requested if wanted]
        p_info = np.array([df[col].values for col in columns]).T
        if verbose:
            order = ', '.join('%d-%s' % (idx, col) for idx, col in enumerate(columns))
            print('Index order: '+ order)
        if keep_in_memory and not hasattr(self, 'df_playlists_info'):
            self.df_playlists_info = df
            if verbose: print('Playlists info saved in memory')
        return p_info

    # ------------------------------------------------------------------ cache management

    def freeMemoryURM(self):
        """Forget the cached URM, if any."""
        self._free('urm')

    def freeMemoryICM(self):
        """Forget the cached ICM (and the flags describing it), if any."""
        self._free('icm', 'flag_icm_arid', 'flag_icm_alid')

    def freeMemoryTracksInfo(self):
        """Forget the cached tracks info (and the flags describing it), if any."""
        self._free('tracks_info', 'flag_tracks_name', 'flag_tracks_duration')

    def freeMemoryPlaylistsInfo(self):
        """Forget the cached playlists DataFrame, if any."""
        self._free('df_playlists_info')

    # ------------------------------------------------------------------ placeholders

    def getArtistInfo(self):
        """Placeholder: not implemented, returns None."""
        pass

    def getAlbumsInfo(self):
        """Placeholder: not implemented, returns None."""
        pass

    def info(self):
        """Print usage notes about the returned matrices."""
        print('!!! all the returned matrices are in CSR format !!!')
        print('!!! URM, playlist contains dupilcate tracks (so cell value != 1), adjust this value in preprocessing !!!')

In [250]:
# Reader for the CSV files in the current directory (path_csv is an empty prefix).
# `verbose` is a boolean flag: pass True, not the string 'True'.
dr = DataReaderRecSys2018(evaluation=True, path_csv='', verbose=True)

IMPORTANT: all the returned matrices are in CSR format!!!


In [247]:
# One row per playlist: [pid, name, description]; the source DataFrame is cached on `dr`.
p_info = dr.getPlaylistsInfo(
    name=True,
    description=True,
    keep_in_memory=True,
)

DF playlsits readed
Index order: 0-pid, 1-name, 2-description
Playlists info saved in memory


In [248]:
# First playlist row, in the column order printed by getPlaylistsInfo above.
p_info[0]

array([0, 'Throwbacks', nan], dtype=object)

In [47]:
# One row per track: [tid, track_name, duration_ms]; the array is cached on `dr`.
t_info = dr.getTracksInfo(
    name=True,
    duration=True,
    keep_in_memory=True,
)

DF tracks readed
Tracks info saved in memory


In [68]:
# Tracks x (artists + albums) feature matrix, artist columns first; cached on `dr`.
icm = dr.getICM(
    arid=True,
    alid=True,
    keep_in_memory=True,
)

DF tracks readed
ICM artists created 	(2262292x295860)
ICM albums created 	(2262292x734684)
ICM total created 	(2262292x1030544)
ICM kept in memory


In [4]:
# Drop the 'num_tracks.1' column (presumably pandas' name for a repeated
# `num_tracks` header - verify against the raw file) and write the result to a
# new tab-separated file; the input file is left untouched.
df_playlists = (
    pd.read_csv("playlists.csv", sep='\t')
    .drop(columns=['num_tracks.1'])
)
df_playlists.to_csv("playlists2.csv", sep='\t', index=False)