Task 7 - LightGBM on KKBox's Music Recommendation System

In [3]:
pip install Lightgbm

Collecting Lightgbm
  Downloading lightgbm-3.3.2-py3-none-win_amd64.whl (1.0 MB)
Installing collected packages: Lightgbm
Successfully installed Lightgbm-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
#Importing libraries
import numpy as np
import pandas as pd
import lightgbm as lgb
import datetime
import math
import gc
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import random
import matplotlib.pyplot as plt
import graphviz
import warnings
warnings.filterwarnings('ignore')

In [2]:

# Columns that the train and test interaction logs have in common are read
# as plain strings ('object').
_log_dtypes = dict.fromkeys(
    ['msno', 'source_system_tab', 'source_screen_name', 'source_type',
     'song_id'],
    'object')

# Only the train read declares a dtype for the 'target' column.
train = pd.read_csv('train.csv', dtype={**_log_dtypes, 'target': np.uint8})
test = pd.read_csv('test.csv', dtype=_log_dtypes)

# Song metadata: every listed column is kept as a string.
_song_dtypes = dict.fromkeys(
    ['genre_ids', 'language', 'artist_name', 'composer', 'lyricist',
     'song_id'],
    'object')
songs = pd.read_csv('songs.csv', dtype=_song_dtypes)

# Member profiles: the two date columns are parsed while reading.
_member_dtypes = {'city': 'object',
                  'bd': np.uint8,
                  'gender': 'object',
                  'registered_via': 'object'}
members = pd.read_csv('members.csv',
                      dtype=_member_dtypes,
                      parse_dates=['registration_init_time', 'expiration_date'])

# Extra song information is read with pandas' default dtype inference.
songs_extra = pd.read_csv('song_extra_info.csv')


In [3]:
def object2cat(df):
    """Convert every object-dtype column of ``df`` to ``category``, in place.

    The frame is modified directly and nothing is returned.
    """
    for name in df.select_dtypes(include=['object']).columns.tolist():
        df[name] = df[name].astype('category')
# Turn the string columns of all four raw tables into categoricals.
for _table in (train, test, songs, members):
    object2cat(_table)

In [4]:
# Sanity check: one representative string column per table is now categorical.
for _table, _column in ((train, 'msno'),
                        (test, 'source_system_tab'),
                        (songs, 'language'),
                        (members, 'city')):
    assert _table[_column].dtype == 'category'

In [5]:
# Attach the song metadata to every interaction row (left join on song_id).
train = pd.merge(train, songs, how='left', on='song_id')
test = pd.merge(test, songs, how='left', on='song_id')

# Derive calendar features from the registration and expiration timestamps.
_registered = members['registration_init_time']
_expires = members['expiration_date']

# Whole days between registration and expiration.
members['membership_days'] = (_expires - _registered).dt.days.astype(int)

members['registration_year'] = _registered.dt.year
members['registration_month'] = _registered.dt.month
members['registration_date'] = _registered.dt.day

members['expiration_year'] = _expires.dt.year
members['expiration_month'] = _expires.dt.month
# From here on 'expiration_date' holds only the day of the month.
members['expiration_date'] = _expires.dt.day
members = members.drop(columns=['registration_init_time'])


# Converting isrc id to year
def isrc_to_year(isrc, pivot=17):
    """Derive a four-digit year from the two-digit year field of an ISRC.

    Parameters
    ----------
    isrc : object
        ISRC code. Characters 5-6 (0-based) are read as the two-digit year.
        Anything that is not a string (e.g. NaN) is treated as missing.
    pivot : int, default 17
        Two-digit years greater than ``pivot`` are mapped to 19xx, all
        others to 20xx.

    Returns
    -------
    int or float
        The year, or ``np.nan`` when ``isrc`` is not a string or its year
        field cannot be parsed as an integer.
    """
    if not isinstance(isrc, str):
        return np.nan
    try:
        two_digit = int(isrc[5:7])
    except ValueError:
        # Too short or non-numeric year field: report the year as missing
        # instead of aborting the whole ``.apply`` call.
        return np.nan
    century = 1900 if two_digit > pivot else 2000
    return century + two_digit

# Extract the year of each song from its ISRC code.
songs_extra['song_year'] = songs_extra['isrc'].apply(isrc_to_year)

# 'isrc' and 'name' are dropped from songs_extra before the merge.
songs_extra = songs_extra.drop(columns=['isrc', 'name'])


def _add_member_and_song_info(df):
    """Left-join the member and extra-song columns onto ``df``.

    Returns a new frame in which 'msno' and 'song_id' are categorical and
    'song_length' is a gap-free ``uint32`` column.
    """
    df = df.merge(members, on='msno', how='left')
    df['msno'] = df['msno'].astype('category')
    df = df.merge(songs_extra, on='song_id', how='left')
    # Assign the filled column back explicitly. A chained
    # ``df.song_length.fillna(..., inplace=True)`` acts on a temporary Series
    # under pandas Copy-on-Write and leaves the frame untouched, so the
    # integer cast would then meet the NaN values that are still there.
    # 200000 is the placeholder used for songs without a known length
    # (unit presumably milliseconds - TODO confirm).
    df['song_length'] = df['song_length'].fillna(200000).astype(np.uint32)
    df['song_id'] = df['song_id'].astype('category')
    return df


train = _add_member_and_song_info(train)
test = _add_member_and_song_info(test)

# Release the tables that have now been merged in.
del members, songs; gc.collect();


In [6]:
# The derived 'song_year' column must contain at least one row.
assert not songs_extra['song_year'].empty

In [7]:
def genre_id_count(x):
    """Count the '|'-separated genre ids in ``x``.

    The placeholder 'no_genre_id' yields 0; any other string yields the
    number of '|' characters plus one.
    """
    return 0 if x == 'no_genre_id' else 1 + x.count('|')


# Replace missing genre ids with an explicit 'no_genre_id' category and count
# the genres per row.
# The filled Series has to be assigned back: ``cat.add_categories`` returns a
# new Series, so an in-place ``fillna`` on that result never reaches the frame.
for _frame in (train, test):
    _genres = _frame['genre_ids']
    # Adding a category that already exists raises, so guard for re-runs.
    if 'no_genre_id' not in _genres.cat.categories:
        _genres = _genres.cat.add_categories('no_genre_id')
    _frame['genre_ids'] = _genres.fillna('no_genre_id')
    _frame['genre_ids_count'] = (_frame['genre_ids']
                                 .apply(genre_id_count).astype(object))


# Splitting the lyricists by ['|', '/', '\\', ';'] and counting the number of Lyricists
def lyricist_count(x):
    """Count the lyricists named in ``x``.

    Names are taken to be separated by '|', '/', a backslash or ';', so the
    result is the number of separator characters plus one. The placeholder
    'no_lyricist' yields 0.
    """
    if x == 'no_lyricist':
        return 0
    return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

# Replace missing lyricists with an explicit 'no_lyricist' category and count
# the lyricists per row.
# The filled Series has to be assigned back: ``cat.add_categories`` returns a
# new Series, so an in-place ``fillna`` on that result never reaches the frame.
for _frame in (train, test):
    _lyricists = _frame['lyricist']
    # Adding a category that already exists raises, so guard for re-runs.
    if 'no_lyricist' not in _lyricists.cat.categories:
        _lyricists = _lyricists.cat.add_categories('no_lyricist')
    _frame['lyricist'] = _lyricists.fillna('no_lyricist')
    _frame['lyricists_count'] = (_frame['lyricist']
                                 .apply(lyricist_count).astype(object))


# Splitting the composer by ['|', '/', '\\', ';'] and counting the number of composers
def composer_count(x):
    """Count the composers named in ``x``.

    The placeholder 'no_composer' yields 0; otherwise the result is the
    number of '|', '/', backslash and ';' characters plus one.
    """
    if x != 'no_composer':
        separators = ('|', '/', '\\', ';')
        return 1 + sum(x.count(sep) for sep in separators)
    return 0


# Replace missing composers with an explicit 'no_composer' category and count
# the composers per row.
# The filled Series has to be assigned back: ``cat.add_categories`` returns a
# new Series, so an in-place ``fillna`` on that result never reaches the frame.
for _frame in (train, test):
    _composers = _frame['composer']
    # Adding a category that already exists raises, so guard for re-runs.
    if 'no_composer' not in _composers.cat.categories:
        _composers = _composers.cat.add_categories('no_composer')
    _frame['composer'] = _composers.fillna('no_composer')
    _frame['composer_count'] = (_frame['composer']
                                .apply(composer_count).astype(object))

# Checking for feat in the column value
def is_featured(x):
    """Return 1 when the text form of ``x`` contains 'feat', otherwise 0."""
    return 1 if 'feat' in str(x) else 0

# Replace missing artist names with an explicit 'no_artist' category and flag
# rows whose artist name contains 'feat'.
# The filled Series has to be assigned back: ``cat.add_categories`` returns a
# new Series, so an in-place ``fillna`` on that result never reaches the frame.
for _frame in (train, test):
    _artists = _frame['artist_name']
    # Adding a category that already exists raises, so guard for re-runs.
    if 'no_artist' not in _artists.cat.categories:
        _artists = _artists.cat.add_categories('no_artist')
    _frame['artist_name'] = _artists.fillna('no_artist')
    _frame['is_featured'] = (_frame['artist_name']
                             .apply(is_featured).astype(object))

# Splitting the artists by [and, ',', feat, &] and counting the number of artists
def artist_count(x):
    """Count the artists named in ``x``.

    The placeholder 'no_artist' yields 0. Otherwise the result is the number
    of 'and', ',', 'feat' and '&' occurrences plus one, so that a single
    artist counts as 1 (in line with lyricist_count and composer_count)
    rather than being indistinguishable from a missing artist.
    """
    if x == 'no_artist':
        return 0
    # NOTE(review): these are plain substring counts, so 'and' or 'feat'
    # inside a longer word is also counted as a separator.
    separators = ('and', ',', 'feat', '&')
    return sum(x.count(sep) for sep in separators) + 1

# Add the per-row artist count to both frames.
for _frame in (train, test):
    _frame['artist_count'] = (_frame['artist_name']
                              .apply(artist_count).astype(object))

In [8]:
# Every engineered count/flag column must exist and be non-empty in BOTH
# frames. Looping over the frames keeps the test-set check from being left
# out for any column.
for _column in ('genre_ids_count',    # genre_id_count
                'composer_count',     # composer_count
                'lyricists_count',    # lyricist_count
                'is_featured',        # is_featured
                'artist_count'):      # artist_count
    assert len(train[_column]) != 0
    assert len(test[_column]) != 0

In [10]:
def _flag_shared_credits(df):
    """Add two boolean (object-dtype) columns comparing the credit fields.

    'artist_composer' is True where artist_name equals composer.
    'artist_composer_lyricist' is True where artist_name, composer and
    lyricist are all equal. ``df`` is modified in place.
    """
    artist = df['artist_name'].astype("object")
    composer = df['composer'].astype("object")
    lyricist = df['lyricist'].astype("object")

    same_as_composer = artist == composer
    df['artist_composer'] = same_as_composer.astype(object)

    # if artist, lyricist and composer are all three same
    all_three_equal = (same_as_composer
                       & (artist == lyricist)
                       & (composer == lyricist))
    df['artist_composer_lyricist'] = all_three_equal.astype(object)


_flag_shared_credits(train)
_flag_shared_credits(test)

# is song language 17 or 45. 
def song_lang_boolean(x):
    """Return 1 if the text form of ``x`` contains '17.0' or '45.0', else 0."""
    text = str(x)
    return int(any(code in text for code in ('17.0', '45.0')))

# Flag rows whose song language is one of the two codes checked above.
for _frame in (train, test):
    _frame['song_lang_boolean'] = (_frame['language']
                                   .apply(song_lang_boolean).astype(object))

# Threshold: mean song length over the training rows.
_mean_song_length = np.mean(train['song_length'])


def smaller_song(x):
    """Return 1 when ``x`` is below the training-set mean song length, else 0."""
    return 1 if x < _mean_song_length else 0


# The same train-derived threshold is applied to both frames.
for _frame in (train, test):
    _frame['smaller_song'] = (_frame['song_length']
                              .apply(smaller_song).astype(object))

# number of times a song has been played before
# ``Series.iteritems`` was removed in pandas 2.0; ``Series.items`` yields the
# same (value, count) pairs.
_dict_count_song_played_train = dict(train['song_id'].value_counts().items())
_dict_count_song_played_test = dict(test['song_id'].value_counts().items())


def count_song_played(x):
    """Return how often song id ``x`` occurs.

    The count from the train frame is used when ``x`` is a key there;
    otherwise the count from the test frame; otherwise 0.
    """
    try:
        return _dict_count_song_played_train[x]
    except KeyError:
        return _dict_count_song_played_test.get(x, 0)


train['count_song_played'] = train['song_id'].apply(count_song_played).astype(object)
test['count_song_played'] = test['song_id'].apply(count_song_played).astype(object)

# number of times the artist has been played
# ``Series.iteritems`` was removed in pandas 2.0; ``Series.items`` yields the
# same (value, count) pairs.
_dict_count_artist_played_train = dict(train['artist_name'].value_counts().items())
_dict_count_artist_played_test = dict(test['artist_name'].value_counts().items())


def count_artist_played(x):
    """Return how often artist name ``x`` occurs.

    The count from the train frame is used when ``x`` is a key there;
    otherwise the count from the test frame; otherwise 0.
    """
    try:
        return _dict_count_artist_played_train[x]
    except KeyError:
        return _dict_count_artist_played_test.get(x, 0)


train['count_artist_played'] = train['artist_name'].apply(count_artist_played).astype(object)
test['count_artist_played'] = test['artist_name'].apply(count_artist_played).astype(object)


In [11]:
# song_lang_boolean: spot-check the rows labelled 0 and 58 of the train frame.
assert train['song_lang_boolean'][0] == 0
assert train['song_lang_boolean'][58] == 1

# smaller_song: the column is non-empty in both frames.
for _frame in (test, train):
    assert len(_frame['smaller_song']) != 0

# count_song_played exists and count_artist_played is non-empty in both frames.
for _frame in (train, test):
    assert 'count_song_played' in _frame
    assert len(_frame['count_artist_played']) != 0

In [12]:
# Save both engineered frames to local CSV files.
for _frame, _path in ((train, "processed_train_1.csv"),
                      (test, "test_1.csv")):
    _frame.to_csv(_path)

In [14]:
# Run a garbage-collection pass; its return value is shown as the cell output.
gc.collect()

30

In [4]:
# Read the engineered training data back from the CSV file saved earlier.
data = pd.read_csv("processed_train_1.csv")

In [16]:
# Discard the first column of the reloaded file.
train = data.drop(columns=data.columns[0])

In [17]:
# Turn every object-dtype column of the reloaded frame into a categorical.
for _name, _dtype in train.dtypes.items():
    if _dtype == object:
        train[_name] = train[_name].astype('category')

# Separate the features from the 'target' label.
X_train = train.drop(columns=['target'])
y_train = train['target'].values

In [18]:
# X_train and y_train were derived from the combined frame above, so the name
# `train` is removed and a garbage-collection pass is run.
del train
gc.collect()

45

In [19]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the train/validation/test partition, and every score
# computed from it, is reproducible across kernel restarts.
_SPLIT_SEED = 42

# Hold out 20% of all rows as the test set ...
X_train_all, X_test, y_train_all, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=_SPLIT_SEED)

# ... then 20% of the remaining rows as the validation set.
# NOTE: X_train / y_train are rebound to the training part here, so this cell
# is not meant to be run twice without re-running the cell that builds them.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_all, y_train_all, test_size=0.2, random_state=_SPLIT_SEED)

In [20]:
# Draw a reproducible random subset of the training rows.
_SAMPLE_SIZE = 500000
_sample_rng = random.Random(42)  # dedicated, seeded generator

# Cap the request at the number of available rows: ``sample`` raises
# ValueError when asked for more items than the population holds.
_n_rows = X_train.shape[0]
idx = _sample_rng.sample(range(_n_rows), min(_SAMPLE_SIZE, _n_rows))

y_train = pd.DataFrame(y_train)
X_train_sampled = X_train.iloc[idx]
y_train_sampled = y_train.iloc[idx]

In [1]:
# Gradient-boosted trees of depth 2 for the binary target.
# ``eval_metric`` is an argument of ``fit``, not a model parameter; the
# model-level parameter for the metric is ``metric``. ``boosting_type`` is the
# constructor's own name for the boosting mode.
model = lgb.LGBMClassifier(objective='binary',
                           metric='binary_logloss',
                           boosting_type='gbdt',
                           learning_rate=0.3,
                           verbose=0,
                           max_depth=2)

# ``y_train`` may be a one-column DataFrame by this point; flatten it so the
# estimator receives a 1-D label array.
model.fit(X_train, np.ravel(y_train))

NameError: name 'lgb' is not defined

In [None]:
# Score the fitted classifier on the validation split. The estimator fitted
# in this notebook is bound to the name ``model``.
predicted = model.predict(X_val)
accuracy = accuracy_score(y_val, predicted)
print(f'Mean accuracy score: {accuracy:.3}')

In [None]:
def lgb_feat_importance(m, df):
    """Tabulate the feature importances of ``m`` against the columns of ``df``.

    Returns a DataFrame with the columns 'cols' (taken from ``df.columns``)
    and 'imp' (taken from ``m.feature_importances_``), sorted by 'imp' in
    descending order.
    """
    importance = pd.DataFrame({'cols': df.columns,
                               'imp': m.feature_importances_})
    return importance.sort_values('imp', ascending=False)

# Rank the features of the fitted classifier (bound to ``model`` in this
# notebook) and show the ten most important ones.
fi = lgb_feat_importance(model, X_train_sampled)
print(fi[:10])

def plot_fi(fi):
    """Plot the 'imp' column of ``fi`` against 'cols' as a horizontal bar chart.

    Returns whatever ``fi.plot`` returns.
    """
    return fi.plot('cols', 'imp', 'barh', figsize=(12, 7), legend=False)
# Plot the first 30 rows of the importance table and render the figure.
plot_fi(fi[:30])
plt.show()

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score

# ``predicted`` holds predictions for the validation split, which has a
# different number of rows than ``y_test``. The test labels therefore need
# predictions made on the held-out test split itself.
test_predicted = model.predict(X_test)
precision, recall, fscore, support = score(y_test, test_predicted)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))