#1. Mount and set directory & import packages

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#Importing libraries
import numpy as np
import pandas as pd
import lightgbm as lgb
import datetime
import math
import gc
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import random
import matplotlib.pyplot as plt
import graphviz
import warnings
warnings.filterwarnings('ignore')

#2. Data pipeline from kaggle (First time ONLY)

In [None]:
!pip install kaggle
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/ #save kaggle json key in my drive
!ls ~/.kaggle
!mkdir -p ./kaggle/
!chmod 600 /root/.kaggle/kaggle.json #600 --> only the owner of the file has full read and write access to it.
!kaggle  competitions download kkbox-music-recommendation-challenge
!unzip /content/kkbox-music-recommendation-challenge.zip

#Create folder
!mkdir kaggle/working
!mkdir kaggle/working/train
!mkdir kaggle/working/train/data
#Unzip
!apt-get install p7zip
!apt-get install p7zip-full 
!7za e members.csv.7z 
!7za e songs.csv.7z 
!7za e song_extra_info.csv.7z 
!7za e train.csv.7z 
!7za e sample_submission.csv.7z 
!7za e test.csv.7z 
#Moving file
!mv *.csv kaggle/working/train/data

kaggle.json
Downloading kkbox-music-recommendation-challenge.zip to /content
 95% 329M/345M [00:02<00:00, 95.5MB/s]
100% 345M/345M [00:03<00:00, 121MB/s] 
Archive:  /content/kkbox-music-recommendation-challenge.zip
  inflating: members.csv.7z          
  inflating: sample_submission.csv.7z  
  inflating: song_extra_info.csv.7z  
  inflating: songs.csv.7z            
  inflating: test.csv.7z             
  inflating: train.csv.7z            
Reading package lists... Done
Building dependency tree       
Reading state information... Done
p7zip is already the newest version (16.02+dfsg-6).
p7zip set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
p7zip-full is already the newest version (16.02+dfsg-6).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.

7-Zip (a) [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 

In [None]:
#Read from Kaggle dir
root = './kaggle/working/train/data/'
train_df = pd.read_csv(root+ "train.csv")
test_df = pd.read_csv(root+ "test.csv")
song_df = pd.read_csv(root+ "songs.csv")
song_extra_df = pd.read_csv(root+ "song_extra_info.csv")
members_df = pd.read_csv(root+ "members.csv")

In [None]:
#Save to my drive

path1 = '/content/drive/My Drive/Dataset/kkbox/'

with open(path1+'train.csv', 'w', encoding = 'utf-8-sig') as f:
  train_df.to_csv(f, index=False)

with open(path1+'test.csv', 'w', encoding = 'utf-8-sig') as f:
  test_df.to_csv(f, index=False)

with open(path1+'members.csv', 'w', encoding = 'utf-8-sig') as f:
  members_df.to_csv(f, index=False)

with open(path1+'songs.csv', 'w', encoding = 'utf-8-sig') as f:
  song_df.to_csv(f, index=False)

with open(path1+'song_extra_info.csv', 'w', encoding = 'utf-8-sig') as f:
  song_extra_df.to_csv(f, index=False)

#3. Data processing


## 3.1 Data Transformation

In [None]:
print('Loading data...')
# Read data 
root = '/content/drive/MyDrive/Dataset/kkbox/'


train = pd.read_csv(root+'train.csv', dtype={'msno' : 'object',
                                                 'source_system_tab' : 'object',
                                                 'source_screen_name' : 'object',
                                                 'source_type' : 'object',
                                                 'target' : np.uint8,
                                                 'song_id' : 'object'})
test = pd.read_csv(root+'test.csv', dtype={'msno' : 'object',
                                               'source_system_tab' : 'object',
                                               'source_screen_name' : 'object',
                                               'source_type' : 'object',
                                               'song_id' : 'object'})
songs = pd.read_csv(root+'songs.csv',dtype={'genre_ids': 'object',
                                                    'language' : 'object',
                                                    'artist_name' : 'object',
                                                    'composer' : 'object',
                                                    'lyricist' : 'object',
                                                    'song_id' : 'object'})
members = pd.read_csv(root+'members.csv',dtype={'city' : 'object',
                                                    'bd' : np.uint8,
                                                    'gender' : 'object',
                                                    'registered_via' : 'object'},
                     parse_dates=['registration_init_time','expiration_date'])
songs_extra = pd.read_csv(root+'song_extra_info.csv')
print('Done loading...')

Loading data...
Done loading...


In [None]:
songs_extra.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2295971 entries, 0 to 2295970
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   song_id  object
 1   name     object
 2   isrc     object
dtypes: object(3)
memory usage: 52.6+ MB


In [None]:
def object2cat(df):
    object_cols = list(df.select_dtypes(include=['object']).columns)
    for col in object_cols:
        df[col]=df[col].astype('category')
object2cat(train)
object2cat(test)
object2cat(songs)
object2cat(members)
object2cat(songs_extra)

In [None]:
#Remove outlier of age by replacing null
members['bd'].loc[(members['bd']<=0) | (members['bd']>=100)]= np.nan

# months and years from expiratin and registration time 
members['membership_days'] = members['expiration_date'].subtract(
    members['registration_init_time']).dt.days.astype(int)

members['registration_year'] = members['registration_init_time'].dt.year
members['registration_month'] = members['registration_init_time'].dt.month
members['registration_date'] = members['registration_init_time'].dt.day

members['expiration_year'] = members['expiration_date'].dt.year
members['expiration_month'] = members['expiration_date'].dt.month
members['expiration_date'] = members['expiration_date'].dt.day
members = members.drop(['registration_init_time'], axis=1)

In [None]:
# Convert isrc id to year
def isrc_to_year(isrc):
    if type(isrc) == str:
        if int(isrc[5:7]) > 17:
            return 1900 + int(isrc[5:7])
        else:
            return 2000 + int(isrc[5:7])
    else:
        return np.nan

#extracting the year of the song from isrc        
songs_extra['song_year'] = songs_extra['isrc'].apply(isrc_to_year)

#Dropping isrc and name from songs_extra
songs_extra.drop(['isrc', 'name'], axis = 1, inplace = True)

In [None]:
songs_extra.head()

Unnamed: 0,song_id,song_year
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=,2012.0
1,ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=,2016.0
2,u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=,2008.0
3,92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=,2013.0
4,0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=,2013.0


In [None]:
songs.song_length.isnull().sum()

0

In [None]:
#Fill NA (song length)
songs.song_length.fillna(200000,inplace=True)
songs.song_length = songs.song_length.astype(np.uint32)
'''songs.song_id = songs.song_id.astype('category')
test.song_length.fillna(200000,inplace=True)
test.song_length = test.song_length.astype(np.uint32)
test.song_id = test.song_id.astype('category')'''

"songs.song_id = songs.song_id.astype('category')\ntest.song_length.fillna(200000,inplace=True)\ntest.song_length = test.song_length.astype(np.uint32)\ntest.song_id = test.song_id.astype('category')"

## 3.2 Data Merging

In [None]:
# Merging song with train and test dataframes
train = train.merge(songs, on='song_id', how='left')
test = test.merge(songs, on='song_id', how='left')
train = train.merge(members, on='msno', how='left')
test = test.merge(members, on='msno', how='left')
train = train.merge(songs_extra, on = 'song_id', how = 'left')
test = test.merge(songs_extra, on = 'song_id', how = 'left')

del members, songs; gc.collect() # release memory
object2cat(train)



## 3.3 Feature extraction

In [None]:
#counting genre_id
def genre_id_count(x):
    if x == 'no_genre_id':
        return 0
    else:
        return x.count('|') + 1


#filling NA in place of null values
train['genre_ids'].cat.add_categories('no_genre_id').fillna('no_genre_id',inplace=True)
test['genre_ids'].cat.add_categories('no_genre_id').fillna('no_genre_id',inplace=True)
train['genre_ids_count'] = train['genre_ids'].apply(genre_id_count).astype(float)
test['genre_ids_count'] = test['genre_ids'].apply(genre_id_count).astype(float)


# Splitting the lyricists by ['|', '/', '\\', ';'] and counting the number of Lyricists
def lyricist_count(x):
    if x == 'no_lyricist':
        return 0
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1
    return sum(map(x.count, ['|', '/', '\\', ';']))

train['lyricist'].cat.add_categories('no_lyricist').fillna('no_lyricist',inplace=True)
test['lyricist'].cat.add_categories('no_lyricist').fillna('no_lyricist',inplace=True)
train['lyricists_count'] = train['lyricist'].apply(lyricist_count).astype(float)
test['lyricists_count'] = test['lyricist'].apply(lyricist_count).astype(float)


# Splitting the composer by ['|', '/', '\\', ';'] and counting the number of Lyricists
def composer_count(x):
    if x == 'no_composer':
        return 0
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

train['composer'].cat.add_categories('no_composer').fillna('no_composer',inplace=True)
test['composer'].cat.add_categories('no_composer').fillna('no_composer',inplace=True)
train['composer_count'] = train['composer'].apply(composer_count).astype(float)
test['composer_count'] = test['composer'].apply(composer_count).astype(float)

# Checking for feat in the column value
def is_featured(x):
    if 'feat' in str(x) :
        return 1
    return 0

train['artist_name'].cat.add_categories('no_artist').fillna('no_artist',inplace=True)
test['artist_name'].cat.add_categories('no_artist').fillna('no_artist',inplace=True)
train['is_featured'] = train['artist_name'].apply(is_featured).astype(float)
test['is_featured'] = test['artist_name'].apply(is_featured).astype(float)

# Splitting the artists by [and, ',', feat, &] and counting the number of artists
def artist_count(x):
    if x == 'no_artist':
        return 0
    else:
        return x.count('and') + x.count(',') + x.count('feat') + x.count('&')

train['artist_count'] = train['artist_name'].apply(artist_count).astype(float)
test['artist_count'] = test['artist_name'].apply(artist_count).astype(float)

# if artist is same as composer
train['artist_composer'] = (train['artist_name'].astype("object") == train['composer']
                            .astype("object")).astype(np.int8)
test['artist_composer'] = (test['artist_name'].astype("object") == test['composer']
                           .astype("object")).astype(np.int8)

# if artist, lyricist and composer are all three same
train['artist_composer_lyricist'] = ((train['artist_name'].astype("object") 
                                      == train['composer'].astype("object")) 
                                     & (train['artist_name'].astype("object")
                                        == train['lyricist'].astype("object"))
                                     & (train['composer'].astype("object")
                                        == train['lyricist'].astype("object"))
                                    ).astype(np.int8)
test['artist_composer_lyricist'] = ((test['artist_name'].astype("object") 
                                     == test['composer'].astype("object")) 
                                    & (test['artist_name'].astype("object")
                                       == test['lyricist'].astype("object"))
                                    & (test['composer'].astype("object") 
                                       == test['lyricist'].astype("object"))
                                   ).astype(np.int8)

# is song language 17 or 45. 
def song_lang_boolean(x):
    if '17.0' in str(x) or '45.0' in str(x):
        return 1
    return 0

train['song_lang_boolean'] = train['language'].apply(song_lang_boolean).astype(float)
test['song_lang_boolean'] = test['language'].apply(song_lang_boolean).astype(float)

_mean_song_length = np.mean(train['song_length'])
def smaller_song(x):
    if x < _mean_song_length:
        return 1
    return 0

train['smaller_song'] = train['song_length'].apply(smaller_song).astype(float)
test['smaller_song'] = test['song_length'].apply(smaller_song).astype(float)

# number of times a song has been played before
_dict_count_song_played_train = {k: v for k, v in train['song_id']
                                 .value_counts().iteritems()}
_dict_count_song_played_test = {k: v for k, v in test['song_id']
                                .value_counts().iteritems()}

def count_song_played(x):
    try:
        return _dict_count_song_played_train[x]
    except KeyError:
        try:
            return _dict_count_song_played_test[x]
        except KeyError:
            return 0
    
train['count_song_played'] = train['song_id'].apply(count_song_played).astype(float)
test['count_song_played'] = test['song_id'].apply(count_song_played).astype(float)

# number of times the artist has been played
_dict_count_artist_played_train = {k: v for k, v in train['artist_name']
                                   .value_counts().iteritems()}
_dict_count_artist_played_test = {k: v for k, v in test['artist_name']
                                  .value_counts().iteritems()}

def count_artist_played(x):
    try:
        return _dict_count_artist_played_train[x]
    except KeyError:
        try:
            return _dict_count_artist_played_test[x]
        except KeyError:
            return 0

train['count_artist_played'] = train['artist_name'].apply(count_artist_played).astype(float)
test['count_artist_played'] = test['artist_name'].apply(count_artist_played).astype(float)


print("Done adding features")

Done adding features


# Save processed data

In [None]:
path1 = '/content/drive/My Drive/Dataset/kkbox/'

with open(path1+'processed_kkbox.csv', 'w', encoding = 'utf-8-sig') as f:
  train.to_csv(f, index=False)

with open(path1+'test_kkbox.csv', 'w', encoding = 'utf-8-sig') as f:
  test.to_csv(f, index=False)
