In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

%matplotlib inline

## Importing

In [None]:
print("Loading...")

# Forward slashes are understood by Windows as well as POSIX systems, whereas the
# previous back-slash form only resolved on Windows. This also matches the
# '../data/' and '../output/' paths used by the rest of the notebook.
data_path = '../input/'
train = pd.read_csv(data_path + 'train.csv', encoding='utf-8', dtype={'target': np.int32})
test = pd.read_csv(data_path + 'test.csv', encoding='utf-8')
songs = pd.read_csv(data_path + 'songs.csv', encoding='utf-8')
# The two membership dates are parsed up front because they are converted to ordinals later.
members = pd.read_csv(data_path + 'members.csv', encoding='utf-8',
                      parse_dates=['expiration_date', 'registration_init_time'])
songs_extra = pd.read_csv(data_path + 'song_extra_info.csv', encoding='utf-8')

# generate features from isrc, see https://www.dittomusic.com/blog/what-is-an-isrc-code
def isrc_to_country(isrc):
    """Return the first two characters of an ISRC string (its country prefix).

    Anything that is not exactly a ``str`` (missing values included) yields NaN.
    """
    return isrc[:2] if type(isrc) is str else np.nan
    
def isrc_to_label(isrc):
    """Return characters 3-5 of an ISRC string (used here as the record-label code).

    Anything that is not exactly a ``str`` (missing values included) yields NaN.
    """
    if type(isrc) is not str:
        return np.nan
    return isrc[2:5]

def isrc_to_year(isrc, pivot=17):
    """Expand the two-digit year stored at positions 5-6 of an ISRC into a full year.

    Parameters
    ----------
    isrc : object
        ISRC code. Non-string values (e.g. NaN for a missing code) give NaN.
    pivot : int, default 17
        Largest two-digit value still read as 20xx; anything greater is read as 19xx.

    Returns
    -------
    int or float
        The four-digit year, or NaN when the code is missing, too short, or does
        not carry two ASCII digits in the year slot. Previously such malformed
        strings raised ``ValueError`` and aborted the whole ``apply``.
    """
    if not isinstance(isrc, str):
        return np.nan

    year_digits = isrc[5:7]
    # Strict check: int() alone would also accept things like ' 7' or '-1'.
    if len(year_digits) != 2 or any(ch not in '0123456789' for ch in year_digits):
        return np.nan

    two_digit_year = int(year_digits)
    century = 1900 if two_digit_year > pivot else 2000
    return century + two_digit_year

# Derive one column per ISRC component, then discard the raw code and the song name.
isrc_extractors = [
    ('song_year', isrc_to_year),
    ('song_country', isrc_to_country),
    ('record_label', isrc_to_label),
]
for feature_name, extractor in isrc_extractors:
    songs_extra[feature_name] = songs_extra['isrc'].apply(extractor)

songs_extra.drop(['isrc', 'name'], axis=1, inplace=True)


In [None]:
print("Appending...")
# Train rows first, test rows after them; the test-only 'id' column is discarded.
df = pd.concat([train, test]).drop('id', axis=1)

print("Merging...")
# Left joins keep every interaction row even when the lookup table has no match.
for lookup_table, join_key in [(songs, 'song_id'), (members, "msno"), (songs_extra, "song_id")]:
    df = pd.merge(df, lookup_table, on=join_key, how='left')


## Preprocessing

In [None]:
# handling extreme values: ages outside (0, 70] are treated as missing
df.loc[(df['bd']<=0)|(df['bd']>70), 'bd'] = np.nan

# Every character treated as a separator between several entities in one field.
# It is written as a regex character class and applied with an explicit
# regex=True, so the outcome no longer depends on the pandas version's default
# for `regex` ('|' and '+' are regex metacharacters when taken as patterns).
entity_delimiters = r"[|\\>/+&、]"

for col in ['composer','lyricist','artist_name','genre_ids']:
    # change all to upper case and remove blanks, so that the only spaces left
    # afterwards are the ones that stand for a delimiter
    compact = df[col].str.upper().str.replace(" ", "", regex=False)
    df[col] = compact.str.replace(entity_delimiters, " ", regex=True)
    # number of entities = number of delimiters + 1; a missing field counts as 0
    df[col + "_nb"] = (df[col].str.count(" ") + 1).fillna(0)

print('finished counting')

# generate new features before label get encoded
# (a comparison involving a missing value is False, so those rows get 0)
same_artist_composer = df['artist_name'] == df['composer']
same_composer_lyricist = df['lyricist'] == df['composer']
df['artist_composer'] = same_artist_composer.astype('int8')
df['composer_lyricist'] = same_composer_lyricist.astype('int8')
df['three_in_one'] = (same_artist_composer & same_composer_lyricist).astype('int8')

add_features=['three_in_one','artist_composer','composer_lyricist',
              'composer_nb','lyricist_nb','genre_ids_nb','artist_name_nb']

# Memory reduction: 
for col in add_features:
    df[col]=df[col].astype('int8')

In [None]:
print('Handle missing... Category to number')

# One encoder object is reused; fit_transform refits it for every column.
enc = LabelEncoder()

# for categorical vars saved as string: missing values become their own 'nan' class
for col in ['msno', 'song_id', 'source_screen_name', 
            'source_system_tab', 'source_type', 'genre_ids',
            'artist_name', 'composer', 'lyricist',  'gender',
            'record_label', 'song_country']:
    df[col] = enc.fit_transform(df[col].fillna('nan'))
    
# for categorical vars saved as int: missing values become the sentinel -5
for col in ['city', 'language', 'registered_via']:
    df[col] = enc.fit_transform(df[col].fillna(-5))

# for actual numeric value:
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection; the in-place form acts on a temporary object and leaves
# `df` untouched once pandas Copy-on-Write is active.
df['song_length'] = df['song_length'].fillna(df['song_length'].median())
df['bd'] = df['bd'].fillna(df['bd'].median())
# fill with nearby year; the trailing bfill covers leading rows that have no
# earlier value to copy, which would otherwise stay NaN and break the later
# integer cast of this column
df['song_year'] = df['song_year'].ffill().bfill()

# for dates: replace each timestamp by its proleptic Gregorian ordinal
for col in ['expiration_date', 'registration_init_time']:
    df[col] = df[col].apply(lambda x: x.toordinal())

In [None]:
# Row position scaled by the number of rows; read later as a time stamp
# (see time_to_next_heard).
df['time'] = df.index / len(df)


def _narrow_int(series, dtype):
    """Cast `series` to `dtype` when its values fit, else to the next wider int.

    A plain ``astype('int8')`` wraps around silently once a value exceeds the
    target range, which would merge distinct category codes. When everything
    fits, the result is identical to the plain cast.
    """
    lowest, highest = series.min(), series.max()
    for candidate in (dtype, 'int16', 'int32'):
        limits = np.iinfo(candidate)
        if limits.min <= lowest and highest <= limits.max:
            return series.astype(candidate)
    return series.astype('int64')


# Memory reduction: 
long_col = ['source_screen_name','source_system_tab','source_type',
            'bd','language','city','gender','registered_via','song_country' ]
for col in long_col:
    df[col] = _narrow_int(df[col], 'int8')

longer_col = ['record_label','genre_ids','song_year']
for col in longer_col:
    df[col] = _narrow_int(df[col], 'int16')
    
# The training rows were concatenated first, so they occupy the leading rows.
train_rows = len(train)

train_data = df[:train_rows]
test_data = df[train_rows:]

print ('save data to local')
train_data.to_hdf('../data/train_data.hdf', key='wsdm')
test_data.to_hdf('../data/test_data.hdf', key='wsdm')
df.to_hdf('../data/df_all.hdf', key='wsdm')

## Benchmarking

In [None]:
print('import from local files')

# Read the two preprocessed frames back from disk.
train_data, test_data = (
    pd.read_hdf(hdf_path)
    for hdf_path in ('../data/train_data.hdf', '../data/test_data.hdf')
)
#df_all =pd.read_hdf('../data/df_all.hdf')

In [None]:
# Baseline: separate the label from the predictors, fit a default logistic
# regression on all training rows and write its test-set probabilities.
y = train_data['target']
X = train_data.drop('target', axis=1)
X_sub = test_data.drop('target', axis=1)

clf = LogisticRegression().fit(X, y)
pred_y_sub = clf.predict_proba(X_sub)

# Column 1 of predict_proba is the probability of the positive class.
subm = pd.DataFrame({'target': pred_y_sub[:, 1]})
subm['id'] = subm.index
subm.to_csv('../output/benchmark.csv.gz', index=False, compression='gzip', float_format='%.5f')

print('benchmarking done!')

## Feature Engineering

In [None]:
# The last `recent` training rows become the modelling set; everything before
# them serves only as history for the aggregate features.
recent = int(0.05*len(train_data)) + len(test_data)

df_history_trains = train_data.iloc[:-recent]
df_trains = train_data.iloc[-recent:]
df_trains['target'].to_hdf('../data/ytrain.hdf', key='base')

df_all = pd.concat([train_data, test_data])

In [None]:
add_features = [
    'three_in_one', 'artist_composer', 'composer_lyricist',
    'composer_nb', 'lyricist_nb', 'genre_ids_nb', 'artist_name_nb',
    'song_year', 'song_country', 'record_label', 'id',
]

not_categorical_columns = [
    'target', 'song_length', 'registration_init_time', 'expiration_date', 'time', 'bd',
] + add_features

categorical_columns = [
    'artist_name', 'city', 'composer', 'gender', 'genre_ids', 'language',
    'lyricist', 'msno', 'registered_via', 'song_id',
    'source_screen_name', 'source_system_tab', 'source_type',
]

# For each categorical column: a power of ten above the column's largest code,
# used by get_group as the multiplier that packs several codes into one key.
orders = {
    col: 10 ** (int(np.log(df_all[col].max() + 1) / np.log(10)) + 1)
    for col in categorical_columns
}

In [None]:
def get_group(df, cols, orders_map=None):
    """Pack the integer codes of `cols` into a single composite key per row.

    The key is built positionally: ``key = key * orders_map[col] + df[col]`` for
    every column after the first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the integer-coded columns named in `cols`.
    cols : list of str
        Columns to combine, in order. Must not be empty.
    orders_map : mapping, optional
        Per-column multiplier. Defaults to the module-level ``orders``.

    Returns
    -------
    pandas.Series
        int64 keys indexed like `df`.
    """
    if orders_map is None:
        orders_map = orders

    # Work in int64 throughout: several columns are stored as int8/int16, and
    # multiplying in that width wraps around (distinct groups then share a key)
    # or raises when the multiplier itself does not fit the narrow dtype.
    group = df[cols[0]].astype(np.int64)
    for col in cols[1:]:
        group = group * orders_map[col] + df[col].astype(np.int64)

    return group

def mean(df_history, df, cols):
    """Average historical `target` of each row's group.

    Groups are the composite keys of `cols`. The averages are computed on
    `df_history` and looked up for every row of `df`; rows whose group does not
    occur in the history receive -1.
    """
    history_keys = get_group(df_history, cols)
    target_by_key = df_history.groupby(history_keys).target.mean()

    current_keys = get_group(df, cols)
    return current_keys.map(target_by_key).fillna(-1)


def count(df_history, df, cols):
    """Number of historical rows that share each row's group.

    Groups are the composite keys of `cols`. Occurrences are tallied on
    `df_history` and looked up for every row of `df`; groups absent from the
    history receive 0.
    """
    occurrences = get_group(df_history, cols).value_counts()
    current_keys = get_group(df, cols)

    return current_keys.map(occurrences).fillna(0)


def time_to_next_heard(df_history, df, cols):
    """For each row, `time` minus the `time` of the next later row of the same group.

    Rows are walked from the highest index to the lowest, so the most recently
    visited row of a group is the next occurrence in index order. The last
    occurrence of a group gets -1. `df_history` is accepted only so that the
    signature matches the other feature functions; it is not used.

    Returns a plain list ordered by ascending index of `df`.
    """
    newest_first = df.sort_index(ascending=False)
    keys = get_group(newest_first, cols)

    later_time = {}   # group key -> time of the occurrence visited just before
    gaps = []
    for key, when in zip(keys, newest_first.time):
        gaps.append(when - later_time[key] if key in later_time else -1)
        later_time[key] = when

    # Flip back so that the values line up with ascending index order.
    return gaps[::-1]

def col_name(cols, func):
    """Feature column name: the column names and the function name joined by '_'."""
    joined_cols = '_'.join(cols)
    return f"{joined_cols}_{func.__name__}"



In [None]:
def create_features(df_history, df):
    """Build the model feature matrix for `df` from statistics of `df_history`.

    For every single categorical column and every pair of categorical columns,
    three features are produced: the historical target mean, the time to the
    next occurrence within `df`, and the historical count. A fixed list of
    columns is then copied over from `df` unchanged.

    Parameters
    ----------
    df_history : pandas.DataFrame
        Rows used to compute the mean and count statistics.
    df : pandas.DataFrame
        Rows for which features are produced.

    Returns
    -------
    pandas.DataFrame
        One row per row of `df`, sharing its index.
    """
    # Counts are stored as int32: a group can cover far more than 32767
    # history rows, which wrapped around under the previous int16 cast.
    feature_funcs = [
        (mean, np.float32),
        (time_to_next_heard, np.float32),
        (count, np.int32),
    ]

    # Columns are gathered in a dict and assembled once at the end; inserting
    # hundreds of columns one at a time fragments the frame.
    features = {}

    for num_col in [1, 2]:
        for cols in combinations(categorical_columns, num_col):
            cols = list(cols)
            for func, dtype in feature_funcs:
                # time_to_next_heard returns a list, the others a Series; both
                # are in the row order of `df`, so they are re-indexed uniformly.
                values = np.asarray(func(df_history, df, cols))
                features[col_name(cols, func)] = pd.Series(values, index=df.index).astype(dtype)

    keep_list= ['song_length', 'bd', 'expiration_date', 'registration_init_time', 
                'three_in_one','artist_composer','composer_lyricist',
                'composer_nb','lyricist_nb','genre_ids_nb','artist_name_nb',
                'song_year','song_country','record_label']
    
    for col in keep_list:
        features[col] = df[col]

    return pd.DataFrame(features, index=df.index)

In [None]:
# Training features: statistics come from the older rows only, so the rows
# being modelled never contribute to their own aggregates.
Xtrain = create_features(df_history=df_history_trains, df=df_trains)
Xtrain.to_hdf('../data/Xtrain.hdf', key='base')

In [None]:
# Test features: the complete training set acts as history.
Xtest = create_features(df_history=train_data, df=test_data)
Xtest.to_hdf('../data/Xtest.hdf', key='base')

## Fit Models

In [None]:
# Train with only a sample to find which algorithm and parameters to use.
# Each HDF file is read once and sliced twice, instead of being loaded from
# disk separately for the training and the validation slice.
_X_all = pd.read_hdf('../data/Xtrain.hdf', key='base')
_y_all = pd.read_hdf('../data/ytrain.hdf', key='base')

# The final 500,000 rows are held out for validation; the 1,000,000 rows
# immediately before them are used for fitting.
Xtrain, Xval = _X_all[-1500000:-500000], _X_all[-500000:]
ytrain, yval = _y_all[-1500000:-500000], _y_all[-500000:]

In [None]:
# LightGBM baseline: only objective, booster type and metric are specified,
# every other setting stays at the library default.
params = dict(objective='binary', boosting='gbdt', metric='auc')

d_train = lgb.Dataset(Xtrain, ytrain)
val_set = [lgb.Dataset(Xval, yval)]

print('Start training using default paramters...')
default_lgb = lgb.train(
    params,
    train_set=d_train,
    valid_sets=val_set,
    verbose_eval=20,
)
#[100]	valid_0's auc: 0.715786
#[100]	valid_0's auc: 0.694743

In [None]:
# LightGBM with hand-adjusted settings, fitted on the same d_train / val_set
# datasets that the default run used.
params = dict(
    objective='binary',
    boosting='gbdt',
    metric='auc',
    learning_rate=0.03,
    num_leaves=64,
    bagging_fraction=0.9,
    bagging_freq=1,
    bagging_seed=1,
    feature_fraction=0.8,
    feature_fraction_seed=1,
    max_depth=10,
)

print('Start training using adjusted paramters...')
tuned = lgb.train(
    params,
    train_set=d_train,
    valid_sets=val_set,
    num_boost_round=500,
    verbose_eval=20,
) #0.722774


In [None]:
# Predictions of the tuned LightGBM booster on the validation sample; compared
# with the random forest's predictions further down.
yval_preds = tuned.predict(Xval)

In [None]:
print('Plot feature importances...')
# Top 15 features of the tuned booster, ranked by gain.
ax = lgb.plot_importance(
    tuned,
    importance_type='gain',
    max_num_features=15,
    figsize=(10, 8),
)
plt.show()

#### Compared with other models

In [None]:
# Two-split time-series cross-validator, passed as `cv` to the grid search below.
tscv = TimeSeriesSplit(n_splits=2)

In [None]:
# Random Forest
# Small grid over the number of features tried per split and the minimum leaf
# size, scored by ROC AUC on the time-ordered folds.
tree_para = {"max_features":[10, 50],
             "min_samples_leaf": [10, 100]
              }
# A fixed seed makes the reported best parameters and score, and the later
# comparison with LightGBM, reproducible from run to run. The value 1 matches
# the bagging/feature seeds used for LightGBM in this notebook.
clf = RandomForestClassifier(random_state=1)
tree_cv = GridSearchCV(clf, tree_para, cv = tscv, scoring='roc_auc')
tree_cv.fit(Xtrain, ytrain)

print("*********Random Forest Results*********")
print("best params are :", tree_cv.best_params_)
print("best score is :", tree_cv.best_score_)


In [None]:
# Positive-class probabilities of the best random forest on the validation
# sample, and their correlation matrix with the tuned LightGBM predictions.
y_tree = tree_cv.predict_proba(Xval)[:,1]
np.corrcoef(y_tree, yval_preds)

In [None]:
## AdaBoost (very slow, so left commented out)
# ada_para = {"n_estimators":[200, 300], 
#             #"learning_rate": [0.1, 0.5] 
#               }

# clf = AdaBoostClassifier() #, cv=cv
# ada_cv = GridSearchCV(clf, ada_para, cv = tscv, scoring='roc_auc')
# ada_cv.fit(Xtrain[-10000:], ytrain[-10000:]) # due to the long training time, set the input data to only last 10,000

# print("*********AdaBoostClassifier Results*********")
# print("best params are :", ada_cv.best_params_)
# print("best score is :", ada_cv.best_score_)


### Fit using the final train dataset

In [None]:
# Train with whole dataset
# NOTE(review): '../data/Xtrain_original.hdf' is not written anywhere in this
# notebook (the feature cell saves '../data/Xtrain.hdf' under key 'base').
# Confirm which file is intended before relying on a clean re-run.
Xtrain = pd.read_hdf('../data/Xtrain_original.hdf')
ytrain = pd.read_hdf('../data/ytrain.hdf', key='base')

In [None]:
# Final fit on the full training features. The only "validation" set is the
# training data itself, so the logged AUC is a training-set figure.
params = dict(
    objective='binary',
    boosting='gbdt',
    learning_rate=0.03,
    num_leaves=64,
    bagging_fraction=0.95,
    bagging_freq=1,
    bagging_seed=1,
    feature_fraction=0.9,
    feature_fraction_seed=1,
    max_depth=10,
    metric='auc',
)
d_train = lgb.Dataset(Xtrain, ytrain)
val_set = [d_train]

print('Start training...')   
model = lgb.train(
    params,
    train_set=d_train,
    num_boost_round=50,
    valid_sets=val_set,
    verbose_eval=10,
)
model.save_model('../model_final.csv')

In [None]:
# Reload the saved booster from disk and show its 15 most important features by gain.
print('Load model')
bst = lgb.Booster(model_file='../model_final.csv')

print('Plot feature importances...')
ax = lgb.plot_importance(
    bst,
    max_num_features=15,
    importance_type='gain',
    figsize=(10, 8),
)
plt.show()

In [None]:
print('To predict') 
# NOTE(review): '../data/Xtest_original.hdf' is not written anywhere in this
# notebook (the feature cell saves '../data/Xtest.hdf'). Confirm the intended file.
Xtest = pd.read_hdf('../data/Xtest_original.hdf')
y_pred = bst.predict(Xtest)

# Submission layout: the prediction plus the row position as 'id'.
result_df = pd.DataFrame({'target': y_pred})
result_df['id'] = result_df.index

print('Save prediction')                                                 
result_df.to_csv('../output/submission.csv.gz', index=False, compression='gzip', float_format='%.5f')