In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import datetime
import math
import gc

In [3]:
# Read the raw tables from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
songs = pd.read_csv('songs1.csv')
members = pd.read_csv('members.csv')
songs_extra = pd.read_csv('song_extra_info.csv')

# Song metadata columns attached to every train / test row.
song_cols = ['song_id', 'song_length', 'genre_ids', 'artist_name',
             'composer', 'lyricist', 'language']
_song_info = songs[song_cols]
train = train.merge(_song_info, on='song_id', how='left')
test = test.merge(_song_info, on='song_id', how='left')

# The member dates are split by slicing their string form: characters 0-3 are
# the year, 4-5 the month and 6-7 the day.
_registration_text = members['registration_init_time'].astype(str)
members['registration_year'] = _registration_text.str[0:4].astype(np.int64)
members['registration_month'] = _registration_text.str[4:6].astype(np.int64)
members['registration_date'] = _registration_text.str[6:8].astype(np.int64)

_expiration_text = members['expiration_date'].astype(str)
members['expiration_year'] = _expiration_text.str[0:4].astype(np.int64)
members['expiration_month'] = _expiration_text.str[4:6].astype(np.int64)
# From here on 'expiration_date' holds only the day component; the full value
# is overwritten.
members['expiration_date'] = _expiration_text.str[6:8].astype(np.int64)

# Attach every member column, then the extra song information.
members_cols = members.columns
_member_info = members[members_cols]
train = train.merge(_member_info, on='msno', how='left')
test = test.merge(_member_info, on='msno', how='left')
train = train.merge(songs_extra, on='song_id', how='left')
test = test.merge(songs_extra, on='song_id', how='left')

# Cast identifier-like and low-cardinality columns to pandas categoricals.
for _col in ['msno', 'source_system_tab', 'source_screen_name', 'source_type',
             'song_id', 'genre_ids', 'language', 'artist_name', 'composer',
             'lyricist', 'city', 'gender', 'registered_via']:
    train[_col] = train[_col].astype('category')
train['target'] = train['target'].astype(np.uint8)
train['song_length'] = train['song_length'].astype(np.float64)
train['bd'] = train['bd'].astype(np.uint8)

# registration_init_time still carries the digit-encoded date (year, month, day
# in that order). An explicit format is required: without one, pd.to_datetime
# reads integers as nanoseconds since the epoch and every row lands on
# 1970-01-01.
_train_registration = pd.to_datetime(
    train['registration_init_time'].astype(str), format='%Y%m%d')

# After the members merge, 'expiration_date' holds only the day of month, so the
# full expiration timestamp is rebuilt from the year / month / day columns.
_train_expiration = pd.to_datetime(pd.DataFrame({
    'year': train['expiration_year'],
    'month': train['expiration_month'],
    'day': train['expiration_date'],
}))

# Whole days between registration and expiration.
train['membership_days'] = (_train_expiration - _train_registration).dt.days.astype(int)

train['registration_year'] = _train_registration.dt.year
train['registration_month'] = _train_registration.dt.month
train['registration_date'] = _train_registration.dt.day

train['expiration_year'] = _train_expiration.dt.year
train['expiration_month'] = _train_expiration.dt.month
train['expiration_date'] = _train_expiration.dt.day

# The raw registration value is no longer needed once its parts are extracted.
train = train.drop(['registration_init_time'], axis=1)

In [4]:
# Cast identifier-like and low-cardinality columns to pandas categoricals.
for _col in ['msno', 'source_system_tab', 'source_screen_name', 'source_type',
             'song_id', 'genre_ids', 'language', 'artist_name', 'composer',
             'lyricist', 'city', 'gender', 'registered_via']:
    test[_col] = test[_col].astype('category')
test['song_length'] = test['song_length'].astype(np.float64)
test['bd'] = test['bd'].astype(np.uint8)

# registration_init_time still carries the digit-encoded date (year, month, day
# in that order). An explicit format is required: without one, pd.to_datetime
# reads integers as nanoseconds since the epoch and every row lands on
# 1970-01-01.
_test_registration = pd.to_datetime(
    test['registration_init_time'].astype(str), format='%Y%m%d')

# After the members merge, 'expiration_date' holds only the day of month, so the
# full expiration timestamp is rebuilt from the year / month / day columns.
_test_expiration = pd.to_datetime(pd.DataFrame({
    'year': test['expiration_year'],
    'month': test['expiration_month'],
    'day': test['expiration_date'],
}))

# Whole days between registration and expiration.
test['membership_days'] = (_test_expiration - _test_registration).dt.days.astype(int)

test['registration_year'] = _test_registration.dt.year
test['registration_month'] = _test_registration.dt.month
test['registration_date'] = _test_registration.dt.day

test['expiration_year'] = _test_expiration.dt.year
test['expiration_month'] = _test_expiration.dt.month
test['expiration_date'] = _test_expiration.dt.day

# The raw registration value is no longer needed once its parts are extracted.
test = test.drop(['registration_init_time'], axis=1)

In [None]:
# 'registration_init_time' is dropped from train during preprocessing, so this
# check shows the derived registration columns instead of the removed one.
print(train[['registration_year', 'registration_month', 'registration_date']].head(2))

In [5]:
# Progress message for the cell that derives 'song_year' from the 'isrc' column.
print("Data merging...")

def isrc_to_year(isrc, pivot=17):
    """Return the four-digit year encoded in an ISRC-style code.

    The two characters at index 5 and 6 are read as a two-digit year. Values
    greater than ``pivot`` are placed in the 1900s, all others in the 2000s.

    Parameters
    ----------
    isrc : object
        The code to decode. Anything that is not a ``str`` (for example a
        missing value) yields ``np.nan``.
    pivot : int, optional
        Largest two-digit year still mapped to the 2000s. Defaults to 17.

    Returns
    -------
    int or float
        The decoded year, or ``np.nan`` when ``isrc`` is not a string or its
        year field cannot be parsed as an integer.
    """
    if not isinstance(isrc, str):
        return np.nan
    try:
        two_digit_year = int(isrc[5:7])
    except ValueError:
        # A short or malformed code must not abort the whole column ``apply``.
        return np.nan
    century = 1900 if two_digit_year > pivot else 2000
    return century + two_digit_year
        
# Derive 'song_year' from 'isrc' on both frames, then discard the raw 'isrc'
# and 'song_name' columns in place.
for _frame in (train, test):
    _frame['song_year'] = _frame['isrc'].apply(isrc_to_year)
    _frame.drop(['isrc', 'song_name'], axis=1, inplace=True)

Data merging...


In [6]:
print('Data merging...')

# Placeholder written into rows that have no song length after the merge.
_DEFAULT_SONG_LENGTH = 200000

# The filled column is assigned back instead of calling fillna(inplace=True) on
# an attribute-selected column: the in-place form only reaches the frame when
# the selection happens to be a view of it.
train['song_length'] = train['song_length'].fillna(_DEFAULT_SONG_LENGTH).astype(np.uint32)
train['song_id'] = train['song_id'].astype('category')

test['song_length'] = test['song_length'].fillna(_DEFAULT_SONG_LENGTH).astype(np.uint32)
test['song_id'] = test['song_id'].astype('category')

# import gc
# del members, songs; gc.collect();

print('Done merging...')

Data merging...
Done merging...


In [8]:
# Show the column dtypes after preprocessing. The call form of print behaves
# the same for a single argument under both Python 2 and Python 3.
print(train.dtypes)

msno                  category
song_id               category
source_system_tab     category
source_screen_name    category
source_type           category
target                   uint8
song_length             uint32
genre_ids             category
artist_name           category
composer              category
lyricist              category
language              category
city                  category
bd                       uint8
gender                category
registered_via        category
expiration_date          int64
registration_year        int64
registration_month       int64
registration_date        int64
expiration_year          int64
expiration_month         int64
membership_days          int64
song_year              float64
dtype: object


In [10]:
# Progress message marking the start of the feature-engineering cells.
print('Adding new features')

def genre_id_count(x):
    """Count the '|'-separated entries in a genre string.

    Returns 0 for the placeholder 'no_genre_id'; otherwise one more than the
    number of '|' characters in ``x``.
    """
    return 0 if x == 'no_genre_id' else 1 + x.count('|')
# Register the 'no_genre_id' placeholder category, fill missing genres with it
# and count the genres per row. The filled column is assigned back rather than
# using fillna(inplace=True) on a selected column, which only reaches the frame
# when that selection is a view of it.
for _frame in (train, test):
    _frame['genre_ids'] = (
        _frame['genre_ids'].cat.add_categories(['no_genre_id']).fillna('no_genre_id'))
    _frame['genre_ids_count'] = _frame['genre_ids'].apply(genre_id_count).astype(np.int8)

Adding new features


In [11]:

def lyricist_count(x):
    """Count the lyricists named in a lyricist string.

    Returns 0 for the placeholder 'no_lyricist'; otherwise one more than the
    total number of '|', '/', backslash and ';' characters in ``x``.
    """
    if x == 'no_lyricist':
        return 0
    # Each separator character adds one more name to the first one.
    return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

# Register the 'no_lyricist' placeholder category, fill missing lyricists with
# it and count the lyricists per row. The filled column is assigned back rather
# than using fillna(inplace=True) on a selected column, which only reaches the
# frame when that selection is a view of it.
for _frame in (train, test):
    _frame['lyricist'] = (
        _frame['lyricist'].cat.add_categories(['no_lyricist']).fillna('no_lyricist'))
    _frame['lyricists_count'] = _frame['lyricist'].apply(lyricist_count).astype(np.int8)

In [12]:
def composer_count(x):
    """Count the composers named in a composer string.

    Returns 0 for the placeholder 'no_composer'; otherwise one more than the
    total number of '|', '/', backslash and ';' characters in ``x``.
    """
    if x == 'no_composer':
        return 0
    separators = ('|', '/', '\\', ';')
    return 1 + sum(x.count(separator) for separator in separators)
# Register the 'no_composer' placeholder category, fill missing composers with
# it and count the composers per row. The filled column is assigned back rather
# than using fillna(inplace=True) on a selected column, which only reaches the
# frame when that selection is a view of it.
for _frame in (train, test):
    _frame['composer'] = (
        _frame['composer'].cat.add_categories(['no_composer']).fillna('no_composer'))
    _frame['composer_count'] = _frame['composer'].apply(composer_count).astype(np.int8)

In [13]:
def is_featured(x):
    """Return 1 when the string form of ``x`` contains 'feat', otherwise 0."""
    return 1 if 'feat' in str(x) else 0

# Register the 'no_artist' placeholder category, fill missing artist names with
# it and flag names containing 'feat'. The filled column is assigned back
# rather than using fillna(inplace=True) on a selected column, which only
# reaches the frame when that selection is a view of it.
for _frame in (train, test):
    _frame['artist_name'] = (
        _frame['artist_name'].cat.add_categories(['no_artist']).fillna('no_artist'))
    _frame['is_featured'] = _frame['artist_name'].apply(is_featured).astype(np.int8)

In [14]:
def artist_count(x):
    """Estimate how many artists are credited in an artist-name string.

    Returns 0 for the placeholder 'no_artist'. Any other string counts as one
    artist plus one more for every ' and ', ',', 'feat' and '&' it contains, so
    a single credited artist yields 1 rather than 0. This matches how the
    composer and lyricist counters treat a single name.

    'and' is only counted as a separate word (surrounded by spaces) so that
    names merely containing those letters are not counted as two artists.
    """
    if x == 'no_artist':
        return 0
    separators = (' and ', ',', 'feat', '&')
    return 1 + sum(x.count(separator) for separator in separators)

# Store the per-row artist count on both frames as an 8-bit integer.
for _frame in (train, test):
    _artist_totals = _frame['artist_name'].apply(artist_count)
    _frame['artist_count'] = _artist_totals.astype(np.int8)

In [17]:
# Flag (1/0) the rows whose artist name equals the composer string.
for _frame in (train, test):
    _artist_is_composer = _frame['artist_name'] == _frame['composer'].astype(str)
    _frame['artist_composer'] = _artist_is_composer.astype(np.int8)

In [18]:
# Flag (1/0) the rows where artist, composer and lyricist are all the same
# string. The three pairwise comparisons are combined with a logical AND.
for _frame in (train, test):
    _composer_text = _frame['composer'].astype(str)
    _lyricist_text = _frame['lyricist'].astype(str)
    _all_same = (
        (_frame['artist_name'] == _composer_text)
        & (_frame['artist_name'] == _lyricist_text)
        & (_frame['composer'] == _lyricist_text)
    )
    _frame['artist_composer_lyricist'] = _all_same.astype(np.int8)

In [19]:
def song_lang_boolean(x):
    """Return 1 when the string form of ``x`` contains '17.0' or '45.0', else 0."""
    text = str(x)
    return int(any(code in text for code in ('17.0', '45.0')))

# Store the language flag on both frames as an 8-bit integer.
for _frame in (train, test):
    _language_flags = _frame['language'].apply(song_lang_boolean)
    _frame['song_lang_boolean'] = _language_flags.astype(np.int8)

In [20]:
# Mean song length over the training rows; used as the threshold for both frames.
_mean_song_length = np.mean(train['song_length'])


def smaller_song(x):
    """Return 1 when ``x`` is strictly below the mean training song length, else 0."""
    return 1 if x < _mean_song_length else 0


for _frame in (train, test):
    _below_mean = _frame['song_length'].apply(smaller_song)
    _frame['smaller_song'] = _below_mean.astype(np.int8)

In [21]:
# Number of rows in which each song appears, computed separately for the train
# and test frames. Series.to_dict() replaces a comprehension over
# Series.iteritems(), which newer pandas releases no longer provide.
_dict_count_song_played_train = train['song_id'].value_counts().to_dict()
_dict_count_song_played_test = test['song_id'].value_counts().to_dict()


def count_song_played(x):
    """Return how many rows contain song ``x``.

    The train count is used whenever ``x`` occurs in the train counts; only
    otherwise is the test count used. Songs found in neither yield 0.
    """
    if x in _dict_count_song_played_train:
        return _dict_count_song_played_train[x]
    return _dict_count_song_played_test.get(x, 0)


for _frame in (train, test):
    _frame['count_song_played'] = _frame['song_id'].apply(count_song_played).astype(np.int64)

In [22]:
# Number of rows in which each artist name appears, computed separately for the
# train and test frames. Series.to_dict() replaces a comprehension over
# Series.iteritems(), which newer pandas releases no longer provide.
_dict_count_artist_played_train = train['artist_name'].value_counts().to_dict()
_dict_count_artist_played_test = test['artist_name'].value_counts().to_dict()


def count_artist_played(x):
    """Return how many rows contain artist name ``x``.

    The train count is used whenever ``x`` occurs in the train counts; only
    otherwise is the test count used. Names found in neither yield 0.
    """
    if x in _dict_count_artist_played_train:
        return _dict_count_artist_played_train[x]
    return _dict_count_artist_played_test.get(x, 0)


for _frame in (train, test):
    _frame['count_artist_played'] = (
        _frame['artist_name'].apply(count_artist_played).astype(np.int64))


# The call form of print behaves the same for a single argument under both
# Python 2 and Python 3.
print("Done adding features")

Done adding features


In [27]:
import pickle

# NOTE(review): 'data_path' is not assigned in any of the cells shown here; it
# must already be defined in the kernel before this cell runs -- confirm.
# The 'with' block closes the output file once the dump has finished.
with open(data_path + 'train_feature_adding', 'wb') as _train_out:
    pickle.dump(train, _train_out)

In [28]:
# The call form of print behaves the same for a single argument under both
# Python 2 and Python 3.
print('done')

done


In [29]:
# NOTE(review): 'data_path' is not assigned in any of the cells shown here; it
# must already be defined in the kernel before this cell runs -- confirm.
# The 'with' block closes the output file once the dump has finished.
with open(data_path + 'test_feature_adding', 'wb') as _test_out:
    pickle.dump(test, _test_out)
print('done')

done
