https://www.kaggle.com/c/kkbox-music-recommendation-challenge/data

In [1]:
import pickle as pkl

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from src.data_loader import *

# Load data

## Load song and user IDs

In [2]:
# Load the song-ID lookup and preview its first entries; the rendered output
# below shows an `id` index and a `song_id` column.
# NOTE(review): load_song_ids is presumably provided by the star import from
# src.data_loader -- confirm.
song_ids = load_song_ids()
song_ids.head()

Unnamed: 0_level_0,song_id
id,Unnamed: 1_level_1
1,SOAAADD12AB018A9DD
2,SOAAADE12A6D4F80CC
3,SOAAADF12A8C13DF62
4,SOAAADZ12A8C1334FB
5,SOAAAFI12A6D4F9C66


In [3]:
# Load the user-ID lookup and preview its first entries; the rendered output
# below shows a single `user_id` column of hash-like strings.
# NOTE(review): load_user_ids is presumably provided by the star import from
# src.data_loader -- confirm.
user_ids = load_user_ids()
user_ids.head()

Unnamed: 0,user_id
0,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d
1,d7083f5e1d50c264277d624340edaaf3dc16095b
2,d68dc6fc25248234590d7668a11e3335534ae4b4
3,9be82340a8b5ef32357fe5af957ccd54736ece95
4,841b2394ae3a9febbd6b06497b4a8ee8eb24b7f8


## Load song-to-track data

In [4]:
# Load the song -> track mapping (with a progress bar while loading) and
# preview it; the rendered output below shows a `song_id` column and a
# `track_ids` column holding a list per song.
# NOTE(review): load_song_to_track_data is presumably provided by the star
# import from src.data_loader -- confirm.
song_to_track_data = load_song_to_track_data(progress_bar=True)
song_to_track_data.head()

100%|██████████| 386213/386213 [00:00<00:00, 462008.39it/s]


Unnamed: 0,song_id,track_ids
0,SOAAADD12AB018A9DD,[TRNCENP12903C9EF3A]
1,SOAAADE12A6D4F80CC,[TRSKKFK128F148B615]
2,SOAAADF12A8C13DF62,[TRCQMSP128F428A6F7]
3,SOAAADZ12A8C1334FB,[TRMDNZY128F425A532]
4,SOAAAFI12A6D4F9C66,[TRZEXLQ128F1491D17]


## Load train triplets

In [5]:
# Build the loader for the training triplets. The stats loop further down
# iterates over this object and reads 'user_id', 'song_id' and 'num_plays'
# from every row it yields.
# NOTE(review): DataLoaderDF is presumably provided by the star import from
# src.data_loader -- confirm.
train_data = DataLoaderDF(which='train')

# Data exploration

## Gather some basic stats

For each user, determine:
* the number of songs the user has played
* the user's average number of plays per song

For each song, determine:
* the total number of users who have played the song
* the average number of plays per user

In [6]:
# Single pass over the training triplets that builds two dictionaries:
#   user_stats[user_id] -> {'num_songs_played', 'avg_plays_per_song'}
#   song_stats[song_id] -> {'num_users_played_by', 'avg_plays_per_user'}
# Every row increments the relevant count by one, so the counts equal the
# number of distinct songs/users only if each (user, song) pair occurs at most
# once in train_data -- TODO confirm against the data loader.
save_frequency = 100000  # write a checkpoint every this many rows

# Checkpoint files (overwritten at every checkpoint) and final output files
user_save_path_tmp = './data/user_stats_TEMP.pkl'
song_save_path_tmp = './data/song_stats_TEMP.pkl'
user_save_path = './data/user_stats.pkl'
song_save_path = './data/song_stats.pkl'

# ====================================================================

user_stats = dict()
song_stats = dict()

for r, row in tqdm(enumerate(train_data)):
    user = row['user_id']
    song = row['song_id']
    num_plays = row['num_plays']

    # Make new entries if we are seeing a new song and/or user
    if user not in user_stats:
        user_stats[user] = {'num_songs_played': 0, 'avg_plays_per_song': 0}

    if song not in song_stats:
        song_stats[song] = {'num_users_played_by': 0, 'avg_plays_per_user': 0}

    # =================================================
    u_stats = user_stats[user]
    prev_n = u_stats['num_songs_played']
    u_stats['num_songs_played'] += 1

    # Update the running mean of plays per song:
    #   new_mean = (old_mean * old_count + num_plays) / new_count
    u_stats['avg_plays_per_song'] *= prev_n
    u_stats['avg_plays_per_song'] += num_plays
    u_stats['avg_plays_per_song'] /= u_stats['num_songs_played']
    # =======================

    s_stats = song_stats[song]
    prev_n = s_stats['num_users_played_by']
    s_stats['num_users_played_by'] += 1

    # Update the running mean of plays per user (same recurrence as above)
    s_stats['avg_plays_per_user'] *= prev_n
    s_stats['avg_plays_per_user'] += num_plays
    s_stats['avg_plays_per_user'] /= s_stats['num_users_played_by']

    # =================================================

    # Save intermediate results every so often
    if (r != 0) and (r % save_frequency) == 0:
        # The checkpoint files carry a 'num_processed' entry holding r, the
        # 0-based index of the last row folded in (i.e. r + 1 rows in total).
        user_stats['num_processed'] = r
        song_stats['num_processed'] = r
        try:
            with open(user_save_path_tmp, 'wb') as f:
                pkl.dump(user_stats, f)

            with open(song_save_path_tmp, 'wb') as f:
                pkl.dump(song_stats, f)
        finally:
            # Strip the bookkeeping key straight away so it never lingers in
            # the live dictionaries, even if the run is interrupted mid-save.
            user_stats.pop('num_processed', None)
            song_stats.pop('num_processed', None)

# ====================================================================

# Make sure the 'num_processed' bookkeeping entry is absent from BOTH
# dictionaries before the final save, so it cannot be mistaken for an ID.
user_stats.pop('num_processed', None)
song_stats.pop('num_processed', None)

# Save final results
with open(user_save_path, 'wb') as f:
    pkl.dump(user_stats, f)

with open(song_save_path, 'wb') as f:
    pkl.dump(song_stats, f)

2134206it [08:09, 4236.39it/s]

KeyboardInterrupt: 