# Requirements

In [2]:
import os
import random
from datetime import datetime

import numpy as np
import pandas as pd

# Functions

## Transactional data

In [3]:
def create_df_modeling(start_date, end_date,
                       low_user, high_user, dau,
                       low_track, high_track, 
                       min_plays, max_plays,
                       min_n_tracks, max_n_tracks
                      ):
    """Generate a synthetic long-format table of daily plays per user and track.

    For every day between ``start_date`` and ``end_date`` a fixed number of
    user ids is drawn at random *with replacement* (the same user can therefore
    show up more than once on a given day). Every drawn (day, user) pair then
    gets a random number of tracks and one random play count per track.

    Parameters
    ----------
    start_date, end_date
        Range limits, forwarded as-is to ``pd.date_range``.
    low_user, high_user : int
        Bounds for the user ids. They are drawn with ``np.random.randint``, so
        ``low_user`` is inclusive and ``high_user`` is exclusive.
    dau : float
        Fraction of the user id span drawn each day; the number of draws per
        day is ``1 + int((high_user - low_user) * dau)``.
    low_track, high_track : int
        Bounds for the track ids (inclusive / exclusive, as for users).
    min_plays, max_plays : int
        Bounds for the play count of one track (inclusive / exclusive).
    min_n_tracks, max_n_tracks : int
        Bounds for the number of tracks attached to one (day, user) draw.
        Drawn with ``random.randint``, so both ends are inclusive.

    Returns
    -------
    pd.DataFrame
        Columns ``id_date``, ``user_id``, ``id_tracks`` and ``plays``, one row
        per (day, drawn user, track). An empty frame with these columns is
        returned when the date range contains no days.
    """
    column_names = ['id_date', 'user_id', 'id_tracks', 'plays']
    # creates a range of dates
    date_range = pd.date_range(start_date, end_date)
    # an empty range would otherwise fail in np.concatenate (nothing to join)
    if len(date_range) == 0:
        return pd.DataFrame(columns=column_names)
    # number of random listeners drawn per day (dau)
    length_user = 1 + int((high_user - low_user) * dau)
    # per-day arrays of user ids and of the matching date, joined after the loop
    user_array_acum = []
    date_array_acum = []
    for current_date in date_range:
        user_array_acum.append(np.random.randint(low_user, high_user, size = length_user))
        date_array_acum.append(np.full(length_user, current_date))
    user_array_final = np.concatenate(user_array_acum)
    date_array_final = np.concatenate(date_array_acum)
    # one array of tracks and one array of plays per (day, user) draw
    tracks_array_acum = []
    plays_array_acum = []
    for _ in range(len(user_array_final)):
        # how many tracks this listener plays on this day (both bounds inclusive)
        length_size = random.randint(min_n_tracks, max_n_tracks)
        # random track ids for this listener
        tracks_array_acum.append(np.random.randint(low_track, high_track, size = length_size))
        # random play count for each of those tracks
        plays_array_acum.append(np.random.randint(min_plays, max_plays, size = length_size))
    # wide frame: one row per (day, user) draw, tracks/plays still nested
    df = pd.DataFrame({'id_date': date_array_final,
                       'user_id': user_array_final,
                       'id_tracks': tracks_array_acum,
                       'plays': plays_array_acum
                      })
    # explodes the nested columns to create a vertical transactional table
    df = df.explode(['id_tracks', 'plays']).reset_index(drop=True)
    # explode leaves object columns behind; restore proper numeric dtypes
    return df.infer_objects()

## Dimensional content

In [4]:
def create_dim_content(n_tracks, min_tracks, max_tracks, n_artists, n_genres):
    """Generate a synthetic dimension table mapping artists to tracks and genres.

    Track ids ``1..n_tracks`` are handed out to artists ``1..n_artists`` without
    reuse: each artist gets a random number of tracks and every track gets a
    random genre id in ``1..n_genres``.

    Parameters
    ----------
    n_tracks : int
        Size of the track id pool (ids ``1..n_tracks``).
    min_tracks, max_tracks : number
        Bounds for the number of tracks per artist. They are truncated to
        integers and passed to ``np.random.randint``, so ``min_tracks`` is
        inclusive and ``max_tracks`` is exclusive.
    n_artists : int
        Number of artists (ids ``1..n_artists``).
    n_genres : int
        Number of genres (ids ``1..n_genres``, both ends inclusive).

    Returns
    -------
    pd.DataFrame
        Columns ``id_artist``, ``id_tracks`` and ``id_genre``, one row per
        track. An artist whose requested number of tracks exceeds what is left
        in the pool is left out of the table.
    """
    # genre ids available for assignment (n_genres itself is included)
    id_genres = list(range(1, n_genres + 1))
    # number of tracks requested for each artist; bounds made explicit integers
    artist_array_sizes = np.random.randint(int(min_tracks), int(max_tracks),
                                           size = n_artists).tolist()
    # pool of track ids that have not been assigned yet
    remaining_tracks = list(range(1, n_tracks + 1))
    # the three lists grow together so they always have the same length
    artist_ids_acum = []
    sampled_arrays_acum = []
    sampled_arrays_genre_acum = []
    for id_artist, size in enumerate(artist_array_sizes, start = 1):
        # not enough tracks left for this artist: skip it entirely
        if size > len(remaining_tracks):
            continue
        sampled_tracks = random.sample(remaining_tracks, size)
        sampled_genres = random.choices(id_genres, k = size)
        artist_ids_acum.append(id_artist)
        sampled_arrays_acum.append(sampled_tracks)
        sampled_arrays_genre_acum.append(sampled_genres)
        # remove the assigned tracks from the pool (set lookup, order preserved)
        taken = set(sampled_tracks)
        remaining_tracks = [item for item in remaining_tracks if item not in taken]
    # wide frame: one row per artist, tracks/genres still nested
    df = pd.DataFrame({'id_artist': artist_ids_acum,
                       'id_tracks': sampled_arrays_acum,
                       'id_genre': sampled_arrays_genre_acum
                      })
    # explodes the nested columns to create a vertical dimensional table
    df = df.explode(['id_tracks', 'id_genre']).reset_index(drop = True)
    # explode leaves object columns behind; restore proper numeric dtypes
    return df.infer_objects()

## Features for artists

In [5]:
def creates_features(n_features,n_artists,std_max, min_mean, max_mean):
    """Build a table of integer-valued synthetic features, one row per artist.

    Each feature gets its own centre (a random integer between ``min_mean`` and
    ``max_mean``, both inclusive) and its own relative spread (a random float
    between 0 and ``std_max``). The values of a feature are drawn from a normal
    distribution with that centre and a standard deviation of
    ``spread * centre``, then truncated to integers.

    Returns a DataFrame with an ``id_artist`` column (1..n_artists) followed by
    ``Feature1`` .. ``Feature<n_features>``.
    """
    # relative spreads are drawn before the centres
    spreads = [random.uniform(0, std_max) for _ in range(n_features)]
    centres = [random.randint(min_mean, max_mean) for _ in range(n_features)]
    # one truncated normal sample of length n_artists per feature
    feature_columns = [
        np.random.normal(centre, spread * centre, n_artists).astype(int)
        for centre, spread in zip(centres, spreads)
    ]
    if feature_columns:
        feature_block = np.column_stack(feature_columns).astype(int)
    else:
        # no features requested: zero-width placeholder with the right row count
        feature_block = np.empty((n_artists, 0))
    # artist ids go in the first column
    artist_ids = list(range(1, n_artists + 1))
    table = np.column_stack((artist_ids, feature_block))
    # header: id column followed by Feature1..FeatureN
    header = ['id_artist'] + ['Feature' + str(k) for k in range(1, n_features + 1)]
    return pd.DataFrame(table, columns = header)

# Pipeline

## Defining global parameters

In [6]:
########################################################################################################
######################### reproducibility ##############################################################
# both the stdlib `random` module and numpy's global generator are used by the
# generator functions, so both are seeded to make a full re-run reproducible
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)


########################################################################################################
######################### input parameters for the create_df_modeling function #########################
# Define the range of dates for the transactional dataset
start_date = datetime(2023, 1, 1).strftime('%Y-%m-%d')
end_date = datetime(2023, 1, 31).strftime('%Y-%m-%d')
# defines the minimum id_user to consider (inclusive)
low_user = 1
# defines the upper bound for id_user (exclusive: ids are drawn with np.random.randint)
high_user = 50
# defines the daily audience (the percentage of the sample size of users) for each id_date
dau = 0.1
# defines the minimum id_track available (inclusive)
low_track = 1
# defines the upper bound for id_track (exclusive: ids are drawn with np.random.randint)
high_track = 4000
# defines the minimum number of plays a listener can hit play every day (inclusive)
min_plays = 1
# defines the upper bound for the number of plays (exclusive: drawn with np.random.randint)
max_plays = 20
# defines the minimum number of tracks a listener can listen to every day
min_n_tracks = 2
# defines the maximum number of tracks a listener can listen to every day (inclusive)
max_n_tracks = 5


########################################################################################################
################ input parameters for the create_dim_content function ##################################
# defines the total number of tracks
n_tracks = high_track
# defines the total number of id_artist
n_artists = 500
# defines the total number of genres
n_genres = 5
# defines the minimum number of tracks by id_artist
# (truncated to an integer because it is used as an integer sampling bound)
min_tracks = int(2 * round((n_tracks/n_artists),0) * 0.2)
# defines the upper bound (exclusive) for the number of tracks by id_artist
max_tracks = int(2 * round((n_tracks/n_artists),0) * 0.8)


########################################################################################################
################ input parameters for the creates_features function ##################################
# number of features that are going to appear on the final dataframe
n_features = 10
# maximum standard deviation value for each of the feature
std_max = 1
# minimum value for the mean value for each of the features
min_mean = 100
# maximum value for the mean value for each of the features
max_mean = 1500

## Creates df transactional data

In [7]:
#
# Build the transactional plays table from the global parameters.
df = create_df_modeling(
    start_date=start_date,
    end_date=end_date,
    low_user=low_user,
    high_user=high_user,
    dau=dau,
    low_track=low_track,
    high_track=high_track,
    min_plays=min_plays,
    max_plays=max_plays,
    min_n_tracks=min_n_tracks,
    max_n_tracks=max_n_tracks,
)

## Creates df dimensional content

In [8]:
# Build the artist / track / genre dimension table from the global parameters.
dc = create_dim_content(
    n_tracks=n_tracks,
    min_tracks=min_tracks,
    max_tracks=max_tracks,
    n_artists=n_artists,
    n_genres=n_genres,
)

## Creates df with features by artist

In [9]:
# Build the per-artist feature table from the global parameters.
df_feat = creates_features(
    n_features=n_features,
    n_artists=n_artists,
    std_max=std_max,
    min_mean=min_mean,
    max_mean=max_mean,
)

# Artefacts

## Transactional data source (reference modeling dataframe)

In [10]:
# Display the transactional table returned by create_df_modeling.
df

Unnamed: 0,id_date,user_id,id_tracks,plays
0,2023-01-01,26,2194,15
1,2023-01-01,26,2897,3
2,2023-01-01,26,3097,11
3,2023-01-01,26,829,18
4,2023-01-01,26,3319,11
...,...,...,...,...
560,2023-01-31,42,2239,6
561,2023-01-31,42,2474,17
562,2023-01-31,42,1301,19
563,2023-01-31,15,1117,18


## Dimensional data source with musical genre for each track

In [11]:
# Display the dimensional table returned by create_dim_content.
dc

Unnamed: 0,id_artist,id_tracks,id_genre
0,1,941,1
1,1,892,3
2,1,2008,3
3,1,2298,3
4,2,2968,1
...,...,...,...
3597,500,3407,4
3598,500,1560,1
3599,500,2290,4
3600,500,3549,1


## Data source with feature values for each artist

In [12]:
# Display the per-artist feature table returned by creates_features.
df_feat

Unnamed: 0,id_artist,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,Feature10
0,1,4,394,113,578,572,394,1099,1263,1456,-50
1,2,116,298,199,60,679,247,1108,1307,2131,791
2,3,10,232,228,1044,296,625,1206,1313,159,293
3,4,179,457,200,279,227,309,1190,1274,555,1522
4,5,145,320,130,409,220,261,1174,1224,493,806
...,...,...,...,...,...,...,...,...,...,...,...
495,496,74,28,280,1190,86,576,1187,1294,1303,657
496,497,114,416,214,1217,785,179,1156,1232,15,685
497,498,188,324,321,1547,1017,457,1202,1257,609,482
498,499,85,240,291,768,1240,420,1206,1332,-105,535


In [16]:
# to_csv does not create missing folders, so make sure the target directory exists
os.makedirs('./data', exist_ok=True)
# write the transactional table without the pandas index column
df.to_csv('./data/df_transactional.csv', index=False)

In [17]:
# write the dimensional table without the pandas index column
dc.to_csv(path_or_buf='./data/df_dimensional.csv', index=False)

In [18]:
# write the per-artist feature table without the pandas index column
df_feat.to_csv(path_or_buf='./data/df_artist_features.csv', index=False)