In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 500)

# import dask
# import dask.dataframe as dd

import keggler as kg

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import gc
gc.enable()

import warnings
warnings.simplefilter(action='ignore', category=Warning)

import os, psutil

# Set up a logger to dump messages to both log file and notebook
import logging as logging
def ini_log(filename):
    """Return this module's logger, attaching console and file output on first use.

    Parameters
    ----------
    filename : str
        Path of the log file; it is opened in append mode. It is only used the
        first time handlers are attached -- if the logger already has handlers,
        the argument is ignored and the existing configuration is kept.

    Returns
    -------
    logging.Logger
        The logger registered under ``__name__``.
    """
    log_obj = logging.getLogger(__name__)

    # Loggers are process-wide singletons, so handlers survive cell re-runs.
    # Returning early here avoids every message being emitted several times.
    if log_obj.handlers:
        return log_obj

    log_obj.setLevel(logging.DEBUG)

    # One handler for the default stream (shown in the notebook), one for the file.
    targets = (logging.StreamHandler(None), logging.FileHandler(filename, 'a'))
    layout = logging.Formatter('%(asctime)-15s: %(levelname)s  %(message)s')
    for target in targets:
        target.setFormatter(layout)
        log_obj.addHandler(target)

    return log_obj
        
# Logger that writes to both the notebook output and the file 'out.log'
log = ini_log('out.log')

# Root directory of the dataset. Later paths are built by plain string
# concatenation (PATH + '...'), so the trailing slash is required.
PATH='data_mini/'

# Sanity check: list what is available under PATH
print(os.listdir(PATH))


['track_features', 'training_set']


# Read in training data

In [2]:
# Load the session log, keeping columns 0-20 except column 16.
# NOTE(review): column 16 is presumably the `date` column of the raw log -- confirm against the CSV header.
df_trn = pd.read_csv(PATH+'training_set/log_mini.csv.gz', usecols=[i for i in range(21) if i != 16])

In [3]:
# Per-column memory footprint in MiB; deep=True also counts the Python string objects
df_trn.memory_usage(deep=True) / 1024**2

Index                               0.000076
session_id                         15.209770
session_position                    1.280823
session_length                      1.280823
track_id_clean                     15.209770
skip_1                              0.160103
skip_2                              0.160103
skip_3                              0.160103
not_skipped                         0.160103
context_switch                      1.280823
no_pause_before_play                1.280823
short_pause_before_play             1.280823
long_pause_before_play              1.280823
hist_user_behavior_n_seekfwd        1.280823
hist_user_behavior_n_seekback       1.280823
hist_user_behavior_is_shuffle       0.160103
hour_of_day                         1.280823
premium                             0.160103
context_type                       11.132299
hist_user_behavior_reason_start    10.292255
hist_user_behavior_reason_end      10.278235
dtype: float64

In [4]:
# Column dtypes as inferred by read_csv (no explicit dtype was passed on load)
df_trn.dtypes

session_id                         object
session_position                    int64
session_length                      int64
track_id_clean                     object
skip_1                               bool
skip_2                               bool
skip_3                               bool
not_skipped                          bool
context_switch                      int64
no_pause_before_play                int64
short_pause_before_play             int64
long_pause_before_play              int64
hist_user_behavior_n_seekfwd        int64
hist_user_behavior_n_seekback       int64
hist_user_behavior_is_shuffle        bool
hour_of_day                         int64
premium                              bool
context_type                       object
hist_user_behavior_reason_start    object
hist_user_behavior_reason_end      object
dtype: object

In [5]:
# Peek at the first 10 rows of the raw log
df_trn.head(10)

Unnamed: 0,session_id,session_position,session_length,track_id_clean,skip_1,skip_2,skip_3,not_skipped,context_switch,no_pause_before_play,short_pause_before_play,long_pause_before_play,hist_user_behavior_n_seekfwd,hist_user_behavior_n_seekback,hist_user_behavior_is_shuffle,hour_of_day,premium,context_type,hist_user_behavior_reason_start,hist_user_behavior_reason_end
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,1,20,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,False,False,False,True,0,0,0,0,0,0,True,16,True,editorial_playlist,trackdone,trackdone
1,0_00006f66-33e5-4de7-a324-2d18e439fc1e,2,20,t_9099cd7b-c238-47b7-9381-f23f2c1d1043,False,False,False,True,0,1,0,0,0,0,True,16,True,editorial_playlist,trackdone,trackdone
2,0_00006f66-33e5-4de7-a324-2d18e439fc1e,3,20,t_fc5df5ba-5396-49a7-8b29-35d0d28249e0,False,False,False,True,0,1,0,0,0,0,True,16,True,editorial_playlist,trackdone,trackdone
3,0_00006f66-33e5-4de7-a324-2d18e439fc1e,4,20,t_23cff8d6-d874-4b20-83dc-94e450e8aa20,False,False,False,True,0,1,0,0,0,0,True,16,True,editorial_playlist,trackdone,trackdone
4,0_00006f66-33e5-4de7-a324-2d18e439fc1e,5,20,t_64f3743c-f624-46bb-a579-0f3f9a07a123,False,False,False,True,0,1,0,0,0,0,True,16,True,editorial_playlist,trackdone,trackdone
5,0_00006f66-33e5-4de7-a324-2d18e439fc1e,6,20,t_c815228b-3212-4f9e-9d4f-9cb19b248184,False,False,True,False,0,1,0,0,0,0,True,16,True,editorial_playlist,trackdone,fwdbtn
6,0_00006f66-33e5-4de7-a324-2d18e439fc1e,7,20,t_e23c19f5-4c32-4557-aa44-81372c2e3705,True,True,True,False,0,1,0,0,0,0,True,16,True,editorial_playlist,fwdbtn,fwdbtn
7,0_00006f66-33e5-4de7-a324-2d18e439fc1e,8,20,t_0be6eced-f56f-48bd-8086-f2e0b760fdee,True,True,True,False,0,1,0,0,0,0,True,16,True,editorial_playlist,fwdbtn,fwdbtn
8,0_00006f66-33e5-4de7-a324-2d18e439fc1e,9,20,t_f3ecbd3b-9e8e-4557-b8e0-39cfcd7e65dd,False,True,True,False,0,1,0,0,0,0,True,16,True,editorial_playlist,fwdbtn,fwdbtn
9,0_00006f66-33e5-4de7-a324-2d18e439fc1e,10,20,t_2af4dfa0-7df3-4b7e-b7ab-353ba48237f9,True,True,True,False,0,1,0,0,0,0,True,16,True,editorial_playlist,fwdbtn,fwdbtn


## Reduce memory footprint

In [6]:
# Fixed string -> integer codes for the three text columns of the session log.
# Codes start at 1; 0 is used for anything absent from the mapping (NaN included),
# so every encoded value fits in an unsigned 8-bit integer.
enc = {
    'hist_user_behavior_reason_start': {'trackdone': 1, 'fwdbtn': 2, 'trackerror': 8, 'remote': 7, 'clickrow': 4, 'backbtn': 3, 'playbtn': 6, 'appload': 5, 'endplay': 9},
    'context_type': {'radio': 3, 'personalized_playlist': 4, 'charts': 6, 'user_collection': 2, 'editorial_playlist': 1, 'catalog': 5},
    'hist_user_behavior_reason_end': {'trackdone': 1, 'fwdbtn': 2, 'logout': 5, 'clickrow': 7, 'backbtn': 3, 'endplay': 4, 'remote': 6}
}

for c in enc:
    # Make the cell safe to re-run: a column that is already uint8 has been
    # encoded. Mapping it again would look up integers in a string-keyed dict,
    # produce NaN everywhere and silently overwrite the column with zeros.
    if df_trn[c].dtype == np.uint8:
        continue
    df_trn[c] = df_trn[c].map(enc[c]).fillna(0).astype(np.uint8)

In [7]:
# Total memory footprint of df_trn in MiB
df_trn.memory_usage(deep=True).sum()/1024**2

43.38794708251953

In [8]:
# Per-column footprint in MiB, to compare the three encoded columns with the first listing
df_trn.memory_usage(deep=True)/1024**2

Index                               0.000076
session_id                         15.209770
session_position                    1.280823
session_length                      1.280823
track_id_clean                     15.209770
skip_1                              0.160103
skip_2                              0.160103
skip_3                              0.160103
not_skipped                         0.160103
context_switch                      1.280823
no_pause_before_play                1.280823
short_pause_before_play             1.280823
long_pause_before_play              1.280823
hist_user_behavior_n_seekfwd        1.280823
hist_user_behavior_n_seekback       1.280823
hist_user_behavior_is_shuffle       0.160103
hour_of_day                         1.280823
premium                             0.160103
context_type                        0.160103
hist_user_behavior_reason_start     0.160103
hist_user_behavior_reason_end       0.160103
dtype: float64

In [9]:
# Check dtypes: the three encoded columns should now be uint8
df_trn.dtypes

session_id                         object
session_position                    int64
session_length                      int64
track_id_clean                     object
skip_1                               bool
skip_2                               bool
skip_3                               bool
not_skipped                          bool
context_switch                      int64
no_pause_before_play                int64
short_pause_before_play             int64
long_pause_before_play              int64
hist_user_behavior_n_seekfwd        int64
hist_user_behavior_n_seekback       int64
hist_user_behavior_is_shuffle        bool
hour_of_day                         int64
premium                              bool
context_type                        uint8
hist_user_behavior_reason_start     uint8
hist_user_behavior_reason_end       uint8
dtype: object

In [10]:
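# Code that was used to generate the `enc` mapping dictionary above.
# It must run on the raw string columns (before they are encoded); codes are
# 1-based and follow the order in which values are returned by `.unique()`.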

# enc = {}
# for c in ['context_type', 'hist_user_behavior_reason_start', 'hist_user_behavior_reason_end']:
#     enc[c] = {}
#     for i,f in enumerate(df_trn[c].unique()):
#         enc[c][f] = i+1
# print(enc)

In [11]:
# Shrink df_trn with the keggler helper; df_trn is rebound to the returned frame.
# NOTE(review): the helper is opaque here -- judging by the dtypes displayed afterwards it
# converts string columns to category and downcasts integers/bools; confirm against keggler.
# The later split by session relies on session_id being categorical (`.cat.categories`).
df_trn = kg.reduce_mem_usage(df_trn)

Memory usage of dataframe is 43.39 MB
Memory usage after optimization is: 12.31 MB
Decreased by 71.6%


In [12]:
# Inspect the dtypes chosen by the memory-reduction step
df_trn.dtypes

session_id                         category
session_position                       int8
session_length                         int8
track_id_clean                     category
skip_1                                uint8
skip_2                                uint8
skip_3                                uint8
not_skipped                           uint8
context_switch                         int8
no_pause_before_play                   int8
short_pause_before_play                int8
long_pause_before_play                 int8
hist_user_behavior_n_seekfwd           int8
hist_user_behavior_n_seekback         int16
hist_user_behavior_is_shuffle         uint8
hour_of_day                            int8
premium                               uint8
context_type                          uint8
hist_user_behavior_reason_start       uint8
hist_user_behavior_reason_end         uint8
dtype: object

# Read in track features

In [13]:
# Confirm the track-features file is present (shell command; the path is spelled out here rather than built from PATH)
!ls data_mini/track_features/tf_mini.csv

data_mini/track_features/tf_mini.csv


In [52]:
# Load the first four columns of the track-features table, reading release_year
# as an unsigned 32-bit integer.
# The result is assigned directly as a DataFrame (not wrapped in a list): the
# following cells call DataFrame methods such as .memory_usage() and .head() on
# df_trk and merge it with the session log.
df_trk = pd.read_csv(PATH+'track_features/tf_mini.csv', usecols=range(4), dtype={'release_year': np.uint32})

In [53]:
# Per-column memory footprint of the track features in MiB
df_trk.memory_usage(deep=True)/1024**2

Index                     0.000076
track_id                  4.593735
duration                  0.386841
release_year              0.193420
us_popularity_estimate    0.386841
dtype: float64

In [54]:
# Shrink df_trk with the same keggler helper that was applied to df_trn.
# Note the printed report: total memory goes UP here (5.56 MB -> 7.58 MB). The per-column
# listings show the numeric columns shrinking while track_id grows.
# NOTE(review): presumably track_id is converted to category, which costs extra when
# (nearly) every value is unique -- confirm, and consider leaving that column as-is.
df_trk = kg.reduce_mem_usage(df_trk)

Memory usage of dataframe is 5.56 MB
Memory usage after optimization is: 7.58 MB
Decreased by -36.3%


In [55]:
# Peek at the first rows of the track features after the memory-reduction step
df_trk.head()

Unnamed: 0,track_id,duration,release_year,us_popularity_estimate
0,t_a540e552-16d4-42f8-a185-232bd650ea7d,109.6875,1950,100.0
1,t_67965da0-132b-4b1e-8a69-0ef99b32287c,187.75,1950,100.0
2,t_0614ecd3-a7d5-40a1-816e-156d5872a467,160.875,1951,99.625
3,t_070a63a0-744a-434e-9913-a97b02926a29,175.375,1951,99.6875
4,t_d6990e17-9c31-4b01-8559-47d9ce476df1,369.5,1951,100.0


In [56]:
# Per-column footprint in MiB after reduction, to see which column drives the change
df_trk.memory_usage(deep=True)/1024**2

Index                     0.000076
track_id                  7.287155
duration                  0.096710
release_year              0.096710
us_popularity_estimate    0.096710
dtype: float64

# Merge training data with track features

In [63]:
# Left-join the track features onto every log row (track_id_clean == track_id),
# then drop both key columns so only session-level and track-level features remain.
# how='left' keeps all log rows; a track without a match would get NaN features.
x = df_trn.merge(df_trk, how='left', right_on='track_id', left_on='track_id_clean').drop(['track_id_clean', 'track_id'], axis=1)

In [64]:
# Check the dtypes of the merged frame
x.dtypes

session_id                         category
session_position                       int8
session_length                         int8
skip_1                                uint8
skip_2                                uint8
skip_3                                uint8
not_skipped                           uint8
context_switch                         int8
no_pause_before_play                   int8
short_pause_before_play                int8
long_pause_before_play                 int8
hist_user_behavior_n_seekfwd           int8
hist_user_behavior_n_seekback         int16
hist_user_behavior_is_shuffle         uint8
hour_of_day                            int8
premium                               uint8
context_type                          uint8
hist_user_behavior_reason_start       uint8
hist_user_behavior_reason_end         uint8
duration                            float16
release_year                         uint16
us_popularity_estimate              float16
dtype: object

In [65]:
# Total memory footprint of the merged frame in MiB
x.memory_usage(deep=True).sum()/1024**2

6.822090148925781

In [70]:
# Category labels of session_id, used below to split the log by session.
# Requires session_id to have the category dtype (see the dtypes after the memory-reduction step).
id_trn = df_trn['session_id'].cat.categories

In [72]:
# Split the session ids into two groups: the first 5000 labels and all remaining ones
id_trn_1 = id_trn[:5000]
id_trn_2 = id_trn[5000:]

In [75]:
# Select the log rows of each session group; inside query(), @name refers to a Python variable
df_trn_1 = df_trn.query('session_id in @id_trn_1')
df_trn_2 = df_trn.query('session_id in @id_trn_2')

In [81]:
# Write the first part as a gzip-compressed CSV with a header row and without the index.
# NOTE(review): df_trn_1 comes from the encoded/downcast df_trn, so the file holds integer
# codes instead of the original strings and lacks the column skipped on load -- confirm intended.
df_trn_1.to_csv(PATH+'training_set/log_mini_1.csv.gz', index=False, compression='gzip')

In [82]:
# Write the second part the same way, except header=False: this file has no column-name row.
# NOTE(review): confirm the missing header is intentional (e.g. the file is meant to be
# appended to part 1); otherwise anyone reading it must supply the column names explicitly.
df_trn_2.to_csv(PATH+'training_set/log_mini_2.csv.gz', index=False, header=False, compression='gzip')