In [5]:
import numpy as np
import pickle as pickle
import scipy.sparse as sparse
import pandas as pd

from IPython import get_ipython
# Soft-reset the interactive namespace without asking for confirmation
# (-s: soft, -f: force) so the notebook starts from a clean state. Everything
# imported above this line is wiped, which is why the imports are repeated
# right after. `run_line_magic` is the supported API; the older `.magic()`
# shortcut is deprecated.
get_ipython().run_line_magic('reset', '-sf')

import numpy as np
import pandas as pd
import os
import scipy.sparse as sparse
import pickle as pickle
import sqlite3

# Input files, relative to the notebook's working directory.
# `movie_file` is read as a '|'-separated, header-less, iso-8859-1 table;
# `data_file` is read as a tab-separated table of (uid, sid, count, time).
movie_file = 'ml-100k/u.item'
data_file = 'ml-100k/u.data'


In [6]:
def get_count(tp, id):
    """Return the number of rows of `tp` for each distinct value of column `id`.

    `tp` must contain both the `id` column and a 'count' column. The result is
    a Series indexed by the distinct `id` values whose entries are group sizes
    (row counts, not sums of 'count').
    """
    # Keep only the grouping key and the 'count' column before grouping.
    key_and_count = tp[[id, 'count']]
    grouped = key_and_count.groupby(id)
    return grouped.size()

def filter_triplets(tp, min_uc, min_sc):
    """Drop rare items, then rare users, and return the filtered frame with counts.

    Items ('sid') with fewer than `min_sc` rows are removed first; users ('uid')
    with fewer than `min_uc` rows in the remaining data are removed next. The
    filter is applied once in that order and is not iterated, so some items may
    fall below `min_sc` again after the user pass.

    Returns `(tp, usercount, songcount)`, where both counts are recomputed on
    the filtered frame.
    """
    # Pass 1: keep only items with enough rows.
    rows_per_item = get_count(tp, 'sid')
    kept_items = rows_per_item.index[rows_per_item >= min_sc]
    tp = tp[tp['sid'].isin(kept_items)]

    # Pass 2: keep only users with enough rows among the surviving items.
    rows_per_user = get_count(tp, 'uid')
    kept_users = rows_per_user.index[rows_per_user >= min_uc]
    tp = tp[tp['uid'].isin(kept_users)]

    # Final per-user / per-item row counts on the filtered data.
    usercount = get_count(tp, 'uid')
    songcount = get_count(tp, 'sid')
    return tp, usercount, songcount

def make_csr(tp, shape, row_index, col_index):
    """Assemble a CSR sparse matrix of the given `shape` from the rows of `tp`.

    The entry at (tp[row_index], tp[col_index]) receives tp['count'].
    `row_index` and `col_index` are the names of the columns holding the
    integer row and column positions.
    """
    values = np.array(tp['count'])
    coords = (np.array(tp[row_index]), np.array(tp[col_index]))
    return sparse.csr_matrix((values, coords), shape=shape)

In [7]:
def _sample_ids(counts, size, column):
    """Draw `size` distinct ids from `counts.index`, weighted by their counts.

    Uses NumPy's global random state, so reproducibility is controlled by the
    caller through `np.random.seed`. Returns a one-column DataFrame named
    `column` that can be inner-merged with the ratings to keep the sampled ids.
    """
    weights = counts / counts.sum()
    chosen = np.random.choice(counts.index, size=size, replace=False,
                              p=weights.tolist())
    return pd.DataFrame(chosen, columns=[column])


def process(seed,
            min_user_count, min_song_count,
            U=None, I=None):
    """Build a user x item count matrix from the ratings file and pickle it.

    Pipeline: read `data_file`, double the ratings and cast them to int, filter
    with `filter_triplets`, optionally sub-sample users and/or items (with
    probability proportional to their number of ratings), re-index users and
    items with contiguous integers, build the sparse matrix with `make_csr`,
    join the item metadata read from `movie_file`, and pickle the results.

    Parameters
    ----------
    seed : int
        Seed for NumPy's global random state; also part of the output filename.
    min_user_count, min_song_count : int
        Thresholds forwarded to `filter_triplets` (users / items).
    U, I : int or None
        Number of users / items to sample before the filters are re-applied.
        `None` disables the corresponding sub-sampling. Because the filters run
        again after sampling, the final matrix may have fewer rows / columns
        than requested.

    Returns
    -------
    tuple
        `(Y_listen, song, movies_metadata, user, saved_args)`: the CSR matrix,
        the sid -> song_index table, that table joined with the item metadata,
        the uid -> user_index table, and the arguments this call received.

    Side effects
    ------------
    Reseeds NumPy's global RNG and writes a pickle named
    `ml_<seed>_U<..>_I<..>_min_uc<..>_sc<..>` in the working directory.
    """
    # Explicit snapshot of the call arguments (same keys and order as the
    # parameters) rather than relying on the interpreter-specific behavior of
    # `locals()`.
    saved_args = {'seed': seed,
                  'min_user_count': min_user_count,
                  'min_song_count': min_song_count,
                  'U': U,
                  'I': I}
    np.random.seed(seed)

    ###########################
    #%% RATINGS
    ###########################
    # `header=None` matters: without it the first rating row is consumed as a
    # header line and silently lost when the columns are renamed. The item
    # metadata below is read header-less for the same reason.
    tp = pd.read_table(data_file, sep='\t', header=None,
                       names=['uid', 'sid', 'count', 'time'])

    # NOTE(review): the original author asked why the rating is doubled here.
    # Presumably inherited from a pipeline that handles half-step ratings
    # (doubling would make them integers) -- TODO confirm it is wanted for this
    # dataset. Behavior kept as is.
    tp['count'] = (2 * tp['count']).astype(int)

    tp, usercount, songcount = filter_triplets(tp, min_user_count, min_song_count)

    #############################
    #%% OPTIONAL SUB-SAMPLING
    #############################
    # Both draws use the counts from the first filtering pass; users are drawn
    # before items so the random stream is consumed in a fixed order.
    if U is not None:
        tp = tp.merge(_sample_ids(usercount, U, 'uid'), on='uid')

    if I is not None:
        tp = tp.merge(_sample_ids(songcount, I, 'sid'), on='sid')

    if U is not None or I is not None:
        # Sub-sampling can push users/items back under the thresholds.
        tp, usercount, songcount = filter_triplets(tp, min_user_count, min_song_count)

    #########################
    #%% CREATE MATRICES
    #########################
    # Contiguous 0-based indices, in order of first appearance in `tp`.
    unique_user = tp.uid.unique()
    n_users = len(unique_user)
    user = pd.DataFrame({'uid': unique_user, 'user_index': range(n_users)})

    unique_song = tp.sid.unique()
    n_items = len(unique_song)
    song = pd.DataFrame({'sid': unique_song, 'song_index': range(n_items)})

    tp = tp.merge(user, on='uid')
    tp = tp.merge(song, on='sid')

    Y_listen = make_csr(tp, (n_users, n_items), 'user_index', 'song_index')

    #########################
    #%% METADATA
    #########################
    metadata_original = pd.read_table(movie_file, sep='|', header=None, encoding="iso-8859-1")

    # Named aliases for the first two positional columns (id and title).
    metadata_original['movieId'] = metadata_original[0]
    metadata_original['title'] = metadata_original[1]
    movies_metadata = song.merge(metadata_original, left_on='sid', right_on='movieId')

    #########################
    #%% SAVE
    #########################
    # Filename format kept unchanged so existing consumers can still find it.
    filename = 'ml_' + str(seed) + \
                '_U%.2e'%n_users + '_I%.2e'%n_items + \
                '_min_uc%d_sc%d' % (min_user_count,min_song_count)
    with open(filename, 'wb') as handle:
        pickle.dump({'Y_listen':Y_listen,'movie':song, 'movies_metadata':movies_metadata,
                     'user':user,'input':saved_args}, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return Y_listen, song, movies_metadata, user, saved_args

In [8]:
# Build the dataset for seed 145: at least 20 ratings per user and per item,
# 135 users sampled, no item sub-sampling.
res = process(
    seed=145,
    min_user_count=20,
    min_song_count=20,
    U=135,
    I=None,
)

In [9]:
# Show the tuple returned by process():
# (Y_listen, song, movies_metadata, user, saved_args).
res

(<129x472 sparse matrix of type '<class 'numpy.int64'>'
 	with 18505 stored elements in Compressed Sparse Row format>,
       sid  song_index
 0     474           0
 1     317           1
 2     281           2
 3     486           3
 4     181           4
 ..    ...         ...
 467   307         467
 468  1101         468
 469  1014         469
 470   343         470
 471   724         471
 
 [472 rows x 2 columns],
       sid  song_index     0  \
 0     474           0   474   
 1     317           1   317   
 2     281           2   281   
 3     486           3   486   
 4     181           4   181   
 ..    ...         ...   ...   
 467   307         467   307   
 468  1101         468  1101   
 469  1014         469  1014   
 470   343         470   343   
 471   724         471   724   
 
                                                      1            2   3  \
 0    Dr. Strangelove or: How I Learned to Stop Worr...  01-Jan-1963 NaN   
 1                     In the Name of th

In [28]:
# Persist the sparse matrix (first element of `res`) in SciPy's .npz format.
sparse.save_npz(file='./Y_listen.npz', matrix=res[0])