In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['train.csv', 'sample_submission.csv', 'members.csv', 'song_extra_info.csv', 'test.csv', 'songs.csv']


In [2]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import gc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import datetime
import math

In [3]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Column handling, by NumPy dtype kind:

    * object columns are converted to ``category``;
    * signed / unsigned integer columns are cast to the narrowest signed /
      unsigned integer type whose range contains the column's min and max
      (bounds are inclusive, so e.g. 127 still fits ``int8``);
    * float columns are cast to ``float16`` or ``float32`` when the value
      range fits, otherwise ``float64``;
    * every other dtype (bool, datetime, timedelta, categorical and other
      pandas extension dtypes) is left untouched.

    NOTE: the float downcast is chosen from the value *range* only. float16
    keeps roughly three significant decimal digits, so float values may be
    rounded by this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy dtypes are downcast; extension dtypes have no
        # reliable min/max-to-narrower-type mapping here and are skipped.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind == 'O':
            df[col] = df[col].astype('category')
        elif kind in ('i', 'u'):
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # An empty column yields NaN bounds, every comparison is
                # False and the column keeps its current dtype.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            target = np.float64  # fallback, also used for all-NaN columns
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [4]:
# Interaction logs: read each CSV, then downcast it immediately.
train = pd.read_csv('../input/train.csv')
train = reduce_mem_usage(train)
test = pd.read_csv('../input/test.csv')
test = reduce_mem_usage(test)

# Lookup tables are kept with their default dtypes; the two member date
# columns are parsed to datetimes at read time.
songs = pd.read_csv('../input/songs.csv')
sei = pd.read_csv('../input/song_extra_info.csv')
members = pd.read_csv(
    '../input/members.csv',
    parse_dates=['registration_init_time', 'expiration_date'],
)

Memory usage of dataframe is 337.71 MB
Memory usage after optimization is: 84.59 MB
Decreased by 75.0%
Memory usage of dataframe is 117.04 MB
Memory usage after optimization is: 44.23 MB
Decreased by 62.2%


In [5]:
# Report (rows, columns) for every table that was loaded.
print('Shape of train is -> {}'.format(train.shape))
print('Shape of test is -> {}'.format(test.shape))
print('Shape of Song Extra Info is -> {}'.format(sei.shape))
print('Shape of Members is -> {}'.format(members.shape))
print('Shape of Songs is -> {}'.format(songs.shape))

Shape of train is -> (7377418, 6)
Shape of test is -> (2556790, 6)
Shape of Song Extra Info is -> (2295971, 3)
Shape of Members is -> (34403, 7)
Shape of Songs is -> (2296320, 7)


In [6]:
# def get_codes(isrc):
#     if isrc!= isrc:
#         return np.nan 
#     else:
#         return [str(isrc)[0:2] , str(isrc)[2:5]  , str(isrc)[5:7] , str(isrc)[7:]]
# sei_null = sei[sei['isrc'].isnull()]
# sei = sei.dropna(subset=['isrc'])
# sei['first_code'] = sei['code'].apply(lambda x: x[0])
# sei['forth_code'] = sei['code'].apply(lambda x: x[3])
# sei['second_code'] = sei['code'].apply(lambda x: x[1])
# sei['third_code'] = sei['code'].apply(lambda x: x[2])
# sei_null['first_code'] = np.nan 
# sei_null['second_code'] = np.nan 
# sei_null['third_code'] = np.nan 
# sei_null['forth_code'] = np.nan 
# sei = pd.concat([sei_null  ,sei] , axis = 0)
# sei.drop(columns = ['code' , 'isrc','name'] , inplace = True)
# sei = sei.sample(frac = 1  , random_state = 98)
# del sei_null

In [7]:
def get_codes(isrc, latest_year=17):
    """Return the four-digit year encoded in an ISRC, or NaN if unavailable.

    The two characters at positions 5-6 of the code are read as a two-digit
    year. Values greater than ``latest_year`` are placed in the 1900s, all
    others in the 2000s.

    Parameters
    ----------
    isrc : str or NaN/None
        ISRC code such as ``'TWA211126405'``.
    latest_year : int, default 17
        Largest two-digit year that is still mapped to the 2000s.

    Returns
    -------
    int or float
        The four-digit year, or ``np.nan`` when ``isrc`` is missing or its
        year field is not exactly two decimal digits.
    """
    if pd.isnull(isrc):
        return np.nan

    # Slice the string form once so both century branches parse the same text.
    year_digits = str(isrc)[5:7]
    # A truncated or non-numeric year field is treated as missing rather than
    # raising and aborting a whole-column apply().
    if len(year_digits) != 2 or not year_digits.isdecimal():
        return np.nan

    two_digit_year = int(year_digits)
    century = 1900 if two_digit_year > latest_year else 2000
    return century + two_digit_year

In [8]:
# Derive a year per song from its ISRC via get_codes, then eyeball a random sample.
sei['year'] = sei['isrc'].apply(get_codes)
sei.sample(10)

Unnamed: 0,song_id,name,isrc,year
2033060,EDmVC/pQ8a1dNXSQ04bo1UM+SBfl4OyHKRNLJyRC4cc=,換日線,TWA211126405,2011.0
984192,rsIqj87MuloXPiJMfFHsU0+zsb9iWz2quEDBvXc8uME=,萬福瑪利亞,HKH589500030,1995.0
1911304,uHeXge1lSQovH7I/HQ4HbcO5N+8z8+zBdgwxKUOrBBI=,Take off (Intro),TCACG1591465,2015.0
1898232,MgPoPFk4VasCH8tsM9p9Ui+/SBACPB7q+phhBfbJva4=,Nightime,USVI20900114,2009.0
1091305,+UybwXNqKr+eOOGPuvMboKu+GKthzoiB+utpPYdz7Ek=,Guitar Concerto in D Major| RV 93: II. Largo,QMFMG1420212,2014.0
703277,TuUlhKz6iKXr4WrFMSR8w+fSd7UmBE4D6PQ+4mK1x0o=,She's Not There (Santana),USSM17700463,1977.0
589673,bpuLMb2H7fj8DiJrzdx7uWMzzrqEZVsbtBrMGJERI8Y=,Compass,QMFME1405817,2014.0
1258130,hGv/iWf54IRajZLLY5zY/CzkBGlQM2PLC7lUyNa2md0=,Milkshake (113 BPM),GBPS81514831,2015.0
1298550,LzhBmS3Ht1F4MMArJ7GoWKav1UHCyyMc8zyInhVqQvk=,Father's Day,USX9P1071332,2010.0
735428,b0l9x29NW8CioXA9fGA713sDpiNeu/kNOIFi/g6PV88=,Vous,FRS630400059,2004.0


In [9]:
# Replace the two raw date columns with three derived features:
# membership length in days, and the registration / expiration years.
members = members.assign(
    membership_days=lambda m: (m['expiration_date'] - m['registration_init_time']).dt.days.astype(int),
    registration_year=lambda m: m['registration_init_time'].dt.year,
    expiration_year=lambda m: m['expiration_date'].dt.year,
).drop(columns=['registration_init_time', 'expiration_date'])
members.head()

Unnamed: 0,msno,city,bd,gender,registered_via,membership_days,registration_year,expiration_year
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,,7,2223,2011,2017
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,0,,7,725,2015,2017
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,0,,4,457,2016,2017
3,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,1,0,,9,1,2015,2015
4,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=,1,0,,4,138,2017,2017


In [10]:
# Extending columns
# merging the database
# Left-join song metadata, member attributes and extra song info onto each
# interaction table so every train/test row is kept.
train = (
    train
    .merge(songs, on='song_id', how='left')
    .merge(members, on='msno', how='left')
    .merge(sei, on='song_id', how='left')
)
test = (
    test
    .merge(songs, on='song_id', how='left')
    .merge(members, on='msno', how='left')
    .merge(sei, on='song_id', how='left')
)
# The lookup tables are no longer needed once merged; release their memory.
del sei, members, songs
gc.collect()

182

In [11]:
# Impute the few missing song lengths (column mean) and languages (column
# mode), then store both columns as compact integers.
#
# The filled Series is assigned back explicitly: calling
# `df[col].fillna(..., inplace=True)` is chained assignment, which under
# pandas Copy-on-Write only updates a temporary and would leave NaNs in place
# for the integer casts below to fail on.
print(train['song_length'].isnull().value_counts()/train.shape[0])
train['song_length'] = train['song_length'].fillna(train['song_length'].mean()).astype(np.uint32)
print(train['language'].isnull().value_counts()/train.shape[0])
train['language'] = train['language'].fillna(train['language'].mode().values[0]).astype(np.int8)

# Test is imputed with its own mean / mode, mirroring the train treatment.
test['song_length'] = test['song_length'].fillna(test['song_length'].mean()).astype(np.uint32)
test['language'] = test['language'].fillna(test['language'].mode().values[0]).astype(np.int8)

False    0.999985
True     0.000015
Name: song_length, dtype: float64
False    0.99998
True     0.00002
Name: language, dtype: float64


In [12]:
def genre_count(genre):
    """Return how many '|'-separated genre ids ``genre`` holds.

    The placeholder string 'no_genre_id' counts as zero genres.
    """
    if genre == 'no_genre_id':
        return 0
    return len(genre.split('|'))
# Share of rows without a genre before filling.
print(train['genre_ids'].isnull().value_counts()/train.shape[0])

# Assign the filled column back instead of using a chained
# `fillna(..., inplace=True)`, which does not modify the frame under pandas
# Copy-on-Write and would leave NaNs for genre_count to fail on.
train['genre_ids'] = train['genre_ids'].fillna('no_genre_id')
train['genre_ids_count'] = train['genre_ids'].apply(genre_count).astype(np.int8)
test['genre_ids'] = test['genre_ids'].fillna('no_genre_id')
test['genre_ids_count'] = test['genre_ids'].apply(genre_count).astype(np.int8)
                                                       

False    0.983944
True     0.016056
Name: genre_ids, dtype: float64


In [13]:

def artist_count(art):
    """Return the number of artists named in ``art``.

    Names are considered separated by '|', '/', '//' or ';'. The placeholder
    'no_artist_name' counts as zero artists.

    Parameters
    ----------
    art : str
        Raw artist string.

    Returns
    -------
    int
        Number of separators found plus one, or 0 for the placeholder.
    """
    if art == 'no_artist_name':
        return 0
    # Collapse '//' to '/' first: counting '/' and '//' independently would
    # tally each double slash three times (two '/' hits plus one '//' hit).
    normalized = art.replace('//', '/')
    return sum(normalized.count(sep) for sep in ('|', '/', ';')) + 1
# Fill missing artists with a placeholder and count the artists per row.
# The filled column is assigned back explicitly: a chained
# `fillna(..., inplace=True)` does not modify the frame under pandas
# Copy-on-Write and would leave NaNs for artist_count to fail on.
train['artist_name'] = train['artist_name'].fillna('no_artist_name')
train['artist_count'] = train['artist_name'].apply(artist_count).astype(np.int8)
test['artist_name'] = test['artist_name'].fillna('no_artist_name')
test['artist_count'] = test['artist_name'].apply(artist_count).astype(np.int8)

In [14]:
def count_composer(comp):
    """Return the number of composers named in ``comp``.

    Names are considered separated by '|', '/', '//' or ';'. The placeholder
    'no_composer' counts as zero composers.

    Parameters
    ----------
    comp : str
        Raw composer string.

    Returns
    -------
    int
        Number of separators found plus one, or 0 for the placeholder.
    """
    if comp == 'no_composer':
        return 0
    # Collapse '//' to '/' first: counting '/' and '//' independently would
    # tally each double slash three times (two '/' hits plus one '//' hit).
    normalized = comp.replace('//', '/')
    return sum(normalized.count(sep) for sep in ('|', '/', ';')) + 1
def count_lyricist(lyr):
    """Return the number of lyricists named in ``lyr``.

    Names are considered separated by '|', '/', '//' or ';'. The placeholder
    'no_lyricist' counts as zero lyricists.

    Parameters
    ----------
    lyr : str
        Raw lyricist string.

    Returns
    -------
    int
        Number of separators found plus one, or 0 for the placeholder.
    """
    if lyr == 'no_lyricist':
        return 0
    # Collapse '//' to '/' first: counting '/' and '//' independently would
    # tally each double slash three times (two '/' hits plus one '//' hit).
    normalized = lyr.replace('//', '/')
    return sum(normalized.count(sep) for sep in ('|', '/', ';')) + 1

In [15]:
# Fill missing composers / lyricists with placeholders and count the names
# per row. Filled columns are assigned back explicitly: a chained
# `fillna(..., inplace=True)` does not modify the frame under pandas
# Copy-on-Write and would leave NaNs for the counting helpers to fail on.
train['composer'] = train['composer'].fillna('no_composer')
train['composer_count'] = train['composer'].apply(count_composer).astype(np.int8)
train['lyricist'] = train['lyricist'].fillna('no_lyricist')
train['lyricist_count'] = train['lyricist'].apply(count_lyricist).astype(np.int8)

test['composer'] = test['composer'].fillna('no_composer')
test['composer_count'] = test['composer'].apply(count_composer).astype(np.int8)
test['lyricist'] = test['lyricist'].fillna('no_lyricist')
test['lyricist_count'] = test['lyricist'].apply(count_lyricist).astype(np.int8)

In [16]:
# song_id -> number of rows it appears in, computed separately per split.
# `to_dict()` replaces a comprehension over `Series.iteritems()`, which was
# removed in pandas 2.0.
dict_count_song_played_train = train['song_id'].value_counts().to_dict()
dict_count_song_played_test = test['song_id'].value_counts().to_dict()


def return_number_played(x):
    """Return how often song ``x`` occurs, preferring the train count.

    The train count is used whenever the song occurs in train; otherwise the
    test count is used, and 0 is returned for a song seen in neither split.
    """
    if x in dict_count_song_played_train:
        return dict_count_song_played_train[x]
    return dict_count_song_played_test.get(x, 0)


train['number_of_time_played'] = train['song_id'].apply(return_number_played)
test['number_of_time_played'] = test['song_id'].apply(return_number_played)

In [17]:
# msno -> number of rows for that user across train and test combined.
# `to_dict()` replaces a comprehension over `Series.iteritems()`, which was
# removed in pandas 2.0.
dict_user_activity = pd.concat([train['msno'], test['msno']], axis=0).value_counts().to_dict()


def return_user_activity(x):
    """Return the combined train+test row count for user ``x`` (0 if unseen)."""
    return dict_user_activity.get(x, 0)


train['user_activity_msno'] = train['msno'].apply(return_user_activity)
test['user_activity_msno'] = test['msno'].apply(return_user_activity)

In [18]:
# f,ax = plt.subplots(figsize=(15, 15))
# sns.countplot(x='artist_count' ,hue= 'target'  , data = train)
# plt.xticks(rotation=90)


In [19]:
train_col = list(train.columns)
test_col = list(test.columns)

# Report every test column that has no counterpart in train.
for f in test_col:
    if f in train_col:
        continue
    print('ERROR !!!  Column from Test not found in train is ->' , f)

# Column groups consumed by the preprocessing steps.
label_encoding = [
    'source_system_tab',
    'source_screen_name',
    'source_type',
    'gender',
]
drop = [
    'msno',
    'song_id',
    'isrc',
    'artist_name',
    'composer',
    'lyricist',
    'name',
    'genre_ids',
]
min_max_scaling = [
    'number_of_time_played',
    'user_activity_msno',
    'membership_days',
    'song_length',
]

ERROR !!!  Column from Test not found in train is -> id


In [20]:
# Label-encode each categorical column with one encoder fitted on the values
# of both splits, so train and test share the same integer codes. The values
# are handed over as plain Python lists, exactly as collected from `.values`.
for f in label_encoding:
    train_values = list(train[f].values)
    test_values = list(test[f].values)
    lb = LabelEncoder()
    lb.fit(train_values + test_values)
    train[f] = lb.transform(train_values)
    test[f] = lb.transform(test_values)

# Min-max scale the numeric columns; the scaler is fitted on train only and
# then applied unchanged to test.
for f in min_max_scaling:
    ms = MinMaxScaler()
    ms.fit(train[[f]])
    train[f] = ms.transform(train[[f]])
    test[f] = ms.transform(test[[f]])
# train.drop(columns = drop , inplace = True)
# test.drop(columns=drop , inplace = True)

In [21]:
# Every column that is still object-typed in train is stored as a pandas
# categorical in both train and test.
for col in train.columns:
    if train[col].dtype != object:
        continue
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

In [22]:
# Spot-check ten random rows of the merged and encoded training frame.
train.sample(10)

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,...,expiration_year,name,isrc,year,genre_ids_count,artist_count,composer_count,lyricist_count,number_of_time_played,user_activity_msno
1963730,n4bLu8Q/XUUWDnIFZNAYLnbZk4Pq4qpmHIHJljMO8jc=,CIe8eiKAnJqpbvB6fBgY0WNUsttrMYkSYGOq5k8nDh4=,0,4,10,1,0.021203,1259,Pitbull,no_composer,...,2017,Rain Over Me (Featuring Marc Anthony),USJAY1100071,2011.0,1,1,0,0,0.059476,0.12074
3997486,RFJf7tPngP4UFBmHgv6lI7hGu2iwim/KTyf/lcK9RWk=,+T2j6Ss6VfrCnwSrpy+cWs7G8UrzLEzk6I8LmGNDtxY=,3,8,4,0,0.018624,465,Katy Perry,Katy Perry| Max Martin| Lukasz Gottwald| Bonni...,...,2017,Ghost,USUM71311299,2013.0,1,1,5,0,0.003221,0.083238
1829923,eRrjRT+CBYy8ZLcPTmpQN1yneH4tjDuHyBGqeJhyuww=,PeoNUlUkZ0RTu4zbYdO6HbRqG/m5qMBqavKRutEfEtw=,3,8,3,1,0.02066,921,沒關係 是愛情啊 電視原聲帶 Volume 1,Cho Young-su,...,2017,最佳的幸運 (Sung by CHEN(EXO)),KRA491401696,2014.0,1,1,1,2,0.116662,0.031674
6522138,uU+E4hbQhwKeJ8Cp3LrC482QpogxSZIs19BGyxNB1RU=,2pidGLi+aKtlIFZb0z7luUNgNv3O4cvXM0uDM27ntIw=,0,5,7,0,0.035165,958,Berliner Philharmoniker|Ferenc Fricsay,Ludwig van Beethoven,...,2017,3. Allegro,DEF056101993,1961.0,1,2,1,0,0.000215,0.084125
5448364,fDGKxxbD6qximlX23UV/eGNy+laemVGA+U3mUZcnQgk=,tYG3uApZ+sDT+TqBH4gT/BUPhmAhaOtQWmylZsKRxbY=,2,12,2,0,0.023133,465,郭采潔 (Amber Kuo),Chen Pin Ying,...,2017,隱形超人,TWA530799305,2007.0,1,1,1,1,0.013599,0.068415
307792,Y41GcVxq72EkP7oLXL3VnpHPdWys9e9eBtfxBK3DZ0g=,IKMFuL0f5Y8c63Hg9BXkeNJjE0z8yf3gMt/tOxF4QNE=,3,8,3,1,0.022449,1609,The Chainsmokers,Andrew Taggart| Shaun Frank| Frederic Kennett|...,...,2017,Closer,USQX91601347,2016.0,1,1,6,6,0.70448,0.097175
7249624,ci41bqkhY6MyUE72/j8Ts1XWQ9cFH0Wl0gzq/Fi9Pic=,ze0+7p13wqB1hZ/KWd6OT42LBehKmYVQyGGzEv8Q4fA=,0,11,7,0,0.02342,921,Sam Kim,Nam Hyeseung| Park Jinho,...,2017,Who are you,TWA531604022,2016.0,1,1,2,2,0.309619,0.074243
2263210,BGo4Oilb0i8FCeZOLUPIFGkW3RWDwNuMoixfq+Mmekk=,MZzZ/DoA1AH9/gh1ivsFX9Zodmn+4teGYo2zJsodjZs=,0,22,11,1,0.017321,786,宮崎駿映畫名曲集,Hayao Miyazaki| Joe Hisaishi,...,2017,魔法公主/選自《魔法公主》,,,1,1,2,0,0.003078,0.152667
3232110,j++LtgsZHuZIYdUVrQm2h4aOm9qlGzxJeVPAtxbfQHg=,GLR69SzPJ0WDH6XTfcJykmURiU52ixcSbBTjRxdujIw=,6,16,8,0,0.038161,1609,Tim Le Funk,no_composer,...,2017,Acid Rocker (Original Mix),QMFMG1347085,2013.0,1,1,0,0,0.000143,0.230584
629340,UjuPJCQo/YoGVR/yJ+CocmWUk2czJE0BBKqaPpVYV64=,tx4bp4QtinXe00DH2ExB//8YA5e/IG08PjzsyNf9Y1Y=,7,11,7,1,0.020432,465,孫燕姿 (Yanzi Sun),Guo Zi,...,2017,原來你什麼都不要 (Nothing You Want),TWA530112026,2001.0,1,1,1,2,0.017821,0.310275


In [23]:
# Split the label off the training features, and the row ids off the test features.
Y_train = train['target'].values
X_train = train.drop(columns=['target'])
ids = test['id'].values
X_test = test.drop(columns=['id'])

# The merged frames are no longer needed once the model inputs exist.
del train, test
gc.collect()

train_set = lgb.Dataset(X_train, label=Y_train)

In [24]:
params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'learning_rate': 0.3 ,
        'verbose': 0,
        'num_leaves': 108,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        'max_depth': 10,
        'num_rounds': 200,
        'metric' : 'auc'
    }

%time model_f1 = lgb.train(params, train_set=train_set,  valid_sets=train_set, verbose_eval=5)

[5]	training's auc: 0.737126
[10]	training's auc: 0.749407
[15]	training's auc: 0.755688
[20]	training's auc: 0.759566
[25]	training's auc: 0.763375
[30]	training's auc: 0.765888
[35]	training's auc: 0.76884
[40]	training's auc: 0.772178
[45]	training's auc: 0.774444
[50]	training's auc: 0.776479
[55]	training's auc: 0.778661
[60]	training's auc: 0.780458
[65]	training's auc: 0.781851
[70]	training's auc: 0.783171
[75]	training's auc: 0.784952
[80]	training's auc: 0.786049
[85]	training's auc: 0.787828
[90]	training's auc: 0.789104
[95]	training's auc: 0.790276
[100]	training's auc: 0.791317
[105]	training's auc: 0.792277
[110]	training's auc: 0.793312
[115]	training's auc: 0.794213
[120]	training's auc: 0.795004
[125]	training's auc: 0.795812
[130]	training's auc: 0.797736
[135]	training's auc: 0.798894
[140]	training's auc: 0.799944
[145]	training's auc: 0.80079
[150]	training's auc: 0.801536
[155]	training's auc: 0.802164
[160]	training's auc: 0.802698
[165]	training's auc: 0.803445

In [25]:
# Score the test rows and write the submission file (id, target).
pred_test = model_f1.predict(X_test)
print('Saving Predictions')
sub = pd.DataFrame({'id': ids, 'target': pred_test}, columns=['id', 'target'])
sub.to_csv('1st_submission.csv', index=False, float_format='%.5f')

Saving Predictions


In [26]:
# Preview the first rows of the submission frame.
sub.head()

Unnamed: 0,id,target
0,0,0.516963
1,1,0.426365
2,2,0.191099
3,3,0.100885
4,4,0.09144
