In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
def get_threholds_and_sizes(tracks, number_tracks, parts=(1, 0)):
    """Turn a list of quantile cut points into popularity buckets and bucket sizes.

    `parts` is read as consecutive pairs ``(parts[i], parts[i+1])``; each pair
    describes one bucket.

    Parameters
    ----------
    tracks : object with a ``popular`` attribute exposing ``quantile(q=...)``
        (e.g. a DataFrame with a ``popular`` column).
    number_tracks : number
        Total number of tracks to distribute over the buckets.
    parts : sequence of numbers
        Quantile levels delimiting the buckets, upper level first.

    Returns
    -------
    (threholds, sizes)
        ``threholds`` is a list of ``(upper, lower)`` rounded popularity
        bounds, one per bucket; the lower bound of the last bucket is always 0.
        ``sizes`` is a list with ``round(number_tracks * (parts[i] - parts[i+1]))``
        for each bucket. Because every size is rounded independently, the
        sizes are not guaranteed to add up to exactly ``number_tracks``.
        Both lists are empty when ``parts`` has fewer than two entries.
    """
    # Upper bound of each bucket. Every quantile is evaluated exactly once;
    # the quantile for the final entry of `parts` is never needed because the
    # last lower bound is pinned to 0 below.
    upper_bounds = [round(tracks.popular.quantile(q=part)) for part in parts[:-1]]
    # A bucket's lower bound is the next bucket's upper bound.
    lower_bounds = upper_bounds[1:] + [0]
    threholds = list(zip(upper_bounds, lower_bounds))
    sizes = [round(number_tracks * (high - low))
             for high, low in zip(parts, parts[1:])]
    return threholds, sizes

In [3]:
def get_tracks_ids(tracks, sizes, threholds):
    """Pick track ids bucket by bucket according to popularity bounds.

    Parameters
    ----------
    tracks : DataFrame with ``popular`` and ``trackid`` columns.
    sizes : sequence of int
        Number of ids to take from each bucket.
    threholds : sequence of (upper, lower)
        Popularity bounds per bucket; a row belongs to a bucket when
        ``lower < popular <= upper``.

    Returns
    -------
    numpy.ndarray
        Ids from all buckets, concatenated in bucket order. The array keeps
        the dtype of the ``trackid`` column (integer ids are no longer coerced
        to float). An empty array is returned when there are no buckets.

    Notes
    -----
    The first bucket is not sampled: its rows are sorted by ``popular``
    descending and the first ``size`` ids are taken. Every later bucket is
    sampled without replacement through the global NumPy RNG, so
    ``np.random.choice`` raises ``ValueError`` if such a bucket holds fewer
    rows than requested.
    """
    picked = []
    for position, (thr, size) in enumerate(zip(threholds, sizes)):
        upper, lower = thr[0], thr[1]
        bucket = tracks[(tracks.popular <= upper) & (tracks.popular > lower)]
        if position == 0:
            # Head bucket: take the top of the popularity ranking.
            chosen = bucket.sort_values(by='popular',
                                        ascending=False).trackid.values[:size]
        else:
            chosen = np.random.choice(bucket.trackid.values, size=size, replace=False)
        picked.append(chosen)
    if not picked:
        return np.array([])
    # Stacking only the per-bucket arrays (no float placeholder) preserves the id dtype.
    return np.hstack(picked)

In [4]:
def get_playlists_ids(playlists, size):
    """Sample `size` entries of the ``pid`` column without replacement.

    The draw goes through the global NumPy random state.
    """
    candidate_pids = playlists['pid'].values
    return np.random.choice(candidate_pids, size, replace=False)

In [5]:
def sample_ids(tracks, playlists, transactions,
               number_tracks, number_playlists,
               parts=[1, 0], seed=0):
    """Sample playlist ids, then choose track ids by popularity bucket.

    Steps:
      1. seed the global NumPy RNG with `seed`;
      2. draw `number_playlists` playlist ids;
      3. collect the tracks appearing in those playlists and derive the
         popularity thresholds / bucket sizes from them (using `parts`);
      4. print the thresholds and select track ids with them.

    Returns a ``(playlists_ids, tracks_ids)`` pair of arrays.
    """
    np.random.seed(seed)
    playlists_ids = get_playlists_ids(playlists, number_playlists)

    # Tracks that occur in at least one of the sampled playlists.
    sampled_transactions = transactions.query('pid in @playlists_ids')
    start_tracks_ids = sampled_transactions.trackid.unique()
    start_tracks = tracks.query('trackid in @start_tracks_ids')

    bucket_bounds, bucket_sizes = get_threholds_and_sizes(start_tracks, number_tracks, parts)
    print(bucket_bounds)

    # NOTE(review): thresholds are derived from `start_tracks`, but the ids are
    # selected from the full `tracks` table — confirm this is intended.
    tracks_ids = get_tracks_ids(tracks, bucket_sizes, bucket_bounds)
    return playlists_ids, tracks_ids

In [6]:
def get_data_by_ids(tracks, playlists, transactions, artists, albums, p_ids, t_ids):
    """Restrict every table to the given playlist ids and track ids.

    Tracks are kept by `t_ids`, playlists by `p_ids`, and transactions must
    match both. Artists and albums are kept when referenced by a kept track.
    The transaction shape is printed after the playlist filter and again after
    the track filter.

    Returns ``[tracks, playlists, transactions, artists, albums]`` (filtered).
    """
    kept_tracks = tracks.query('trackid in @t_ids')
    kept_playlists = playlists.query('pid in @p_ids')

    # Artists / albums follow from whichever tracks survived.
    artist_ids = kept_tracks.artistid.unique()
    album_ids = kept_tracks.albumid.unique()
    kept_artists = artists.query('artistid in @artist_ids')
    kept_albums = albums.query('albumid in @album_ids')

    # Two-stage transaction filter so both intermediate sizes can be reported.
    by_playlist = transactions.query('pid in @p_ids')
    print(by_playlist.shape)
    by_playlist_and_track = by_playlist.query('trackid in @t_ids')
    print(by_playlist_and_track.shape)

    return [kept_tracks, kept_playlists, by_playlist_and_track, kept_artists, kept_albums]

In [7]:
# Directory the input CSVs are read from; the sampled subset is later written
# to its `random_data` subfolder.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
data_dir = '/home/vadim/playlist_generation/data'

In [8]:
# Load the five source tables from `data_dir`; the first CSV column of each
# file becomes the DataFrame index (index_col=0).
tracks = pd.read_csv('{}/tracks.csv'.format(data_dir), index_col=0)
artists = pd.read_csv('{}/artists.csv'.format(data_dir), index_col=0)
albums = pd.read_csv('{}/albums.csv'.format(data_dir), index_col=0)
transactions = pd.read_csv('{}/transactions.csv'.format(data_dir), index_col=0)
playlists = pd.read_csv('{}/playlists.csv'.format(data_dir), index_col=0)

  mask |= (ar1 == a)


In [9]:
# Seed handed to sample_ids, which passes it to np.random.seed.
seed = 0

In [10]:
# Sample 10000 playlists and pick track ids across four popularity buckets
# delimited by the 100%/90%/80%/50%/0% quantile levels in `parts`.
p_ids, t_ids = sample_ids(tracks, playlists, transactions, 
           number_tracks=22000, number_playlists=10000, 
           parts=[1, 0.90, 0.8, 0.50, 0], seed=seed)

[(13977, 170), (170, 73), (73, 15), (15, 0)]


In [11]:
# Filter every table down to the sampled ids.
# `datas` is ordered [tracks, playlists, transactions, artists, albums].
datas = get_data_by_ids(tracks, playlists, transactions, artists, albums, p_ids, t_ids)

(654202, 2)
(215579, 2)


In [12]:
# datas[2] is the filtered transactions table; group its rows by playlist.
grouped = datas[2].groupby('pid')

In [13]:
# Playlists with at least 5 (non-null) track rows left after sampling.
# The per-playlist count is computed once and reused for the mask.
tracks_per_playlist = grouped.count()['trackid']
pids = tracks_per_playlist[tracks_per_playlist >= 5].index.values

In [14]:
# Keep only the transactions of the playlists selected in `pids`.
# NOTE(review): only the transactions entry of `datas` is filtered here; the
# tracks/playlists/artists/albums entries are left as returned by
# get_data_by_ids — confirm that is intended.
datas[2] = datas[2].query('pid in @pids')

In [15]:
# Shape of the transactions table after the playlist filter.
datas[2].shape

(210950, 2)

In [16]:
# Number of distinct playlists and distinct tracks left in the transactions.
datas[2].pid.unique().shape, datas[2].trackid.unique().shape

((7657,), (8560,))

In [17]:
# Repeated display of the filtered transactions shape (same expression as the earlier cell).
datas[2].shape

(210950, 2)

In [18]:
# Write each sampled table to <data_dir>/random_data/<name>.csv.
# `names` must stay in the same order as the list returned by get_data_by_ids.
names = ['tracks', 'playlists', 'transactions', 'artists', 'albums']
# Create the output folder if needed so the export works on a fresh checkout.
os.makedirs('{}/random_data'.format(data_dir), exist_ok=True)
for df, name in zip(datas, names):
    df.to_csv('{}/random_data/{}.csv'.format(data_dir, name))