In [1]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import gc

# Make sure the cyclic garbage collector is switched on.
gc.enable()

# LightGBM configuration: gradient-boosted trees, binary objective, AUC metric.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'auc'},
    num_leaves=40,
    min_data_in_leaf=90,
    feature_fraction=0.6,
    bagging_fraction=0.96,
    bagging_freq=1,
    lambda_l1=0,
    verbose=0,
)

#params = {
#    'task': 'train',
#    'boosting_type': 'rf',
#    'objective': 'binary',
#    'metric': {'auc'},
#    'num_leaves': 550,
#    'min_data_in_leaf': 6,
#    'feature_fraction': 0.55,
#    'bagging_fraction': 0.8,
#    'bagging_freq': 1,
#    'lambda_l1': 0,
#    'min_gain_to_split': 0,
#    'verbose': 0
#}

In [2]:
import pandas as pd
import numpy as np

import os

# Directory that holds the per-fold CSV files (train_<k>.csv / valid_<k>.csv).
# Defaults to the original location; set MANGAKI_LEVEL1_DIR to run the
# notebook on another machine without editing this cell.
# os.path.join(..., "") guarantees a trailing separator, which the
# `data + 'train_...'` string concatenations in later cells rely on.
data = os.path.join(
    os.environ.get("MANGAKI_LEVEL1_DIR", "/mnt/d/Data/mangaki-data-challenge/baseline/level1/"),
    "",
)

In [18]:
# Level-1 LightGBM parameters.
# NOTE: this rebinds the module-level `params` name, which `training` reads
# and `cv` mutates, so the most recently executed `params` cell wins.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'auc'},
    num_leaves=2,
    min_data_in_leaf=400,
    feature_fraction=1,
    bagging_fraction=0.91,
    bagging_freq=1,
    verbose=0,
)

In [19]:
def training(train, valid):
    """Fit LightGBM on one fold and return ``(train_auc, valid_auc)``.

    Parameters
    ----------
    train, valid : pandas.DataFrame
        Fold data. Both frames must contain the columns ``user_id``,
        ``work_id`` and ``rating``. ``rating`` is used as the label and every
        remaining column is fed to the model as a feature.

    Returns
    -------
    tuple
        ROC AUC on ``train`` and ROC AUC on ``valid``, both predicted at the
        booster's ``best_iteration``.

    Notes
    -----
    Reads the module-level ``params`` dict at call time, so the result depends
    on whichever ``params`` cell (or ``cv`` call) ran last.
    """
    non_feature_cols = ['user_id', 'work_id', 'rating']
    # Build each feature frame once instead of re-dropping the same columns
    # for every Dataset construction and predict call.
    train_features = train.drop(non_feature_cols, axis=1)
    valid_features = valid.drop(non_feature_cols, axis=1)

    X = lgb.Dataset(train_features, train['rating'])
    V = lgb.Dataset(valid_features, valid['rating'], reference=X)
    # NOTE(review): `early_stopping_rounds` and `verbose_eval` were removed
    # from `lgb.train` in LightGBM 4.0; on such versions pass
    # `callbacks=[lgb.early_stopping(20), lgb.log_evaluation()]` instead.
    gbdt = lgb.train(params, X, valid_sets=[X, V], num_boost_round=200,
                     early_stopping_rounds=20, verbose_eval=True)
    Yvp = gbdt.predict(valid_features, num_iteration=gbdt.best_iteration)
    Ytp = gbdt.predict(train_features, num_iteration=gbdt.best_iteration)
    return (roc_auc_score(train['rating'].values, Ytp), roc_auc_score(valid['rating'].values, Yvp))

In [5]:
def cv(param, paramlst, folds=(1, 2, 3)):
    """Cross-validate one LightGBM parameter over a list of candidate values.

    For every value in ``paramlst`` the global ``params[param]`` is set to that
    value, ``training`` is run on each fold, and the per-fold AUCs are
    averaged. The resulting table (one row per candidate) is printed.

    Parameters
    ----------
    param : str
        Key of the module-level ``params`` dict to vary.
    paramlst : list
        Candidate values for ``param``; also used as the index of the table.
    folds : iterable, optional
        Fold identifiers; fold ``k`` is read from ``train_k.csv`` and
        ``valid_k.csv`` under ``data``. Defaults to the original ``(1, 2, 3)``.

    Raises
    ------
    ValueError
        If ``folds`` is empty.

    Notes
    -----
    ``params[param]`` is left at the last value of ``paramlst`` when the
    function returns.
    """
    folds = list(folds)
    if not folds:
        raise ValueError("folds must contain at least one fold identifier")

    trainauc = []
    validauc = []
    for p in paramlst:
        params[param] = p
        tv = []
        vv = []
        for fold in folds:
            t = pd.read_csv(data + 'train_{0}.csv'.format(str(fold)))
            v = pd.read_csv(data + 'valid_{0}.csv'.format(str(fold)))
            #t['item_category'] = t.item_category.astype('category')
            #v['item_category'] = v.item_category.astype('category')
            train_score, valid_score = training(t, v)
            tv.append(train_score)
            vv.append(valid_score)
        trainauc.append(np.mean(tv))
        validauc.append(np.mean(vv))
    paramtable = pd.DataFrame({
        'TrainingSet': trainauc,
        'ValidationSet': validauc
    }, columns=['TrainingSet', 'ValidationSet'], index=pd.Index(paramlst, name=param))
    print(paramtable)

In [20]:
# Run the cross-validation helper for a single candidate, bagging_fraction=0.91.
cv('bagging_fraction', [0.91])

[1]	training's auc: 0.755659	valid_1's auc: 0.760444
Training until validation scores don't improve for 20 rounds.
[2]	training's auc: 0.77528	valid_1's auc: 0.780192
[3]	training's auc: 0.801813	valid_1's auc: 0.80251
[4]	training's auc: 0.811368	valid_1's auc: 0.808805
[5]	training's auc: 0.816753	valid_1's auc: 0.815829
[6]	training's auc: 0.817218	valid_1's auc: 0.816259
[7]	training's auc: 0.817199	valid_1's auc: 0.816203
[8]	training's auc: 0.817401	valid_1's auc: 0.816342
[9]	training's auc: 0.824505	valid_1's auc: 0.821173
[10]	training's auc: 0.827289	valid_1's auc: 0.82535
[11]	training's auc: 0.833293	valid_1's auc: 0.83003
[12]	training's auc: 0.833537	valid_1's auc: 0.830444
[13]	training's auc: 0.833961	valid_1's auc: 0.830414
[14]	training's auc: 0.834486	valid_1's auc: 0.830533
[15]	training's auc: 0.836468	valid_1's auc: 0.831551
[16]	training's auc: 0.836487	valid_1's auc: 0.831425
[17]	training's auc: 0.836536	valid_1's auc: 0.83142
[18]	training's auc: 0.836544	vali

[1]	training's auc: 0.754786	valid_1's auc: 0.747752
Training until validation scores don't improve for 20 rounds.
[2]	training's auc: 0.782702	valid_1's auc: 0.778012
[3]	training's auc: 0.798866	valid_1's auc: 0.797507
[4]	training's auc: 0.799586	valid_1's auc: 0.797876
[5]	training's auc: 0.804732	valid_1's auc: 0.803122
[6]	training's auc: 0.811711	valid_1's auc: 0.812291
[7]	training's auc: 0.81553	valid_1's auc: 0.816608
[8]	training's auc: 0.820614	valid_1's auc: 0.822901
[9]	training's auc: 0.822497	valid_1's auc: 0.824785
[10]	training's auc: 0.822316	valid_1's auc: 0.824974
[11]	training's auc: 0.824297	valid_1's auc: 0.826624
[12]	training's auc: 0.825228	valid_1's auc: 0.828196
[13]	training's auc: 0.826868	valid_1's auc: 0.829477
[14]	training's auc: 0.828172	valid_1's auc: 0.829365
[15]	training's auc: 0.835134	valid_1's auc: 0.834464
[16]	training's auc: 0.835872	valid_1's auc: 0.834918
[17]	training's auc: 0.837313	valid_1's auc: 0.837275
[18]	training's auc: 0.838204	

LibFM

In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_svmlight_file

import os

# Directory that holds the per-fold svmlight files read by `cv_fm`
# (train_<k>.csv / valid_<k>.csv).
# Defaults to the original location; set MANGAKI_LIBFM_DIR to run the
# notebook on another machine without editing this cell.
# os.path.join(..., "") guarantees a trailing separator, which the
# `path + "train_..."` string concatenations rely on.
path = os.path.join(
    os.environ.get("MANGAKI_LIBFM_DIR", "/mnt/d/Data/mangaki-data-challenge/libfm/"),
    "",
)

In [25]:
# Keyword arguments that `train_fm` unpacks into `sgd.FMClassification`.
# NOTE: this rebinds the module-level `params`, replacing the LightGBM
# dictionary defined in the earlier cells.
params = dict(
    n_iter=340000,
    init_stdev=0.01,
    l2_reg_w=0.00025,
    l2_reg_V=0.5,
    rank=1,
    step_size=0.02,
)

In [13]:
from scipy.sparse import hstack
from fastFM import sgd

def train_fm(train, valid, trainy, validy):
    """Fit a fastFM SGD classifier on one fold and return ``(train_auc, valid_auc)``.

    Parameters
    ----------
    train, valid
        Feature matrices for the training and validation split, passed
        unchanged to ``fit`` / ``predict_proba``.
    trainy, validy
        Label arrays for the two splits. ``trainy`` is mapped through
        ``2 * y - 1`` before fitting (presumably turning {0, 1} labels into
        {-1, +1} -- confirm against the data files).

    Returns
    -------
    tuple
        ROC AUC on the training split and ROC AUC on the validation split,
        both scored against the original (unmapped) labels.

    Notes
    -----
    The classifier is configured from the module-level ``params`` dict.
    """
    fm = sgd.FMClassification(**params)
    # `np.int` was only an alias for the builtin `int` and was removed in
    # NumPy 1.24; the builtin produces the same dtype on every NumPy version.
    signed_labels = np.require(trainy * 2 - 1, dtype=int)
    fm.fit(train, signed_labels)
    train_auc = roc_auc_score(trainy, fm.predict_proba(train))
    valid_auc = roc_auc_score(validy, fm.predict_proba(valid))
    return (train_auc, valid_auc)

def cv_fm(param, paramlst, folds=(1, 2, 3)):
    """Cross-validate one factorization-machine parameter over candidate values.

    For every value in ``paramlst`` the global ``params[param]`` is set to that
    value, ``train_fm`` is run on each fold, and the per-fold AUCs are
    averaged. The resulting table (one row per candidate) is printed.

    Parameters
    ----------
    param : str
        Key of the module-level ``params`` dict to vary.
    paramlst : list
        Candidate values for ``param``; also used as the index of the table.
    folds : iterable, optional
        Fold identifiers; fold ``k`` is loaded with ``load_svmlight_file``
        from ``train_k.csv`` and ``valid_k.csv`` under ``path``. Defaults to
        the original ``(1, 2, 3)``.

    Raises
    ------
    ValueError
        If ``folds`` is empty.

    Notes
    -----
    ``params[param]`` is left at the last value of ``paramlst`` when the
    function returns. The earlier version also read the level-1 CSVs into
    variables that were never used; those reads have been dropped.
    """
    folds = list(folds)
    if not folds:
        raise ValueError("folds must contain at least one fold identifier")

    trainauc = []
    validauc = []
    for p in paramlst:
        params[param] = p
        tv = []
        vv = []
        for fold in folds:
            train, ty = load_svmlight_file(path + "train_{0}.csv".format(fold))
            valid, vy = load_svmlight_file(path + "valid_{0}.csv".format(fold))
            train_score, valid_score = train_fm(train, valid, ty, vy)
            tv.append(train_score)
            vv.append(valid_score)
        trainauc.append(np.mean(tv))
        validauc.append(np.mean(vv))
    paramtable = pd.DataFrame({
        'TrainingSet': trainauc,
        'ValidationSet': validauc
    }, columns=['TrainingSet', 'ValidationSet'], index=pd.Index(paramlst, name=param))
    print(paramtable)

In [26]:
# Sweep l2_reg_V over three candidate values with the FM cross-validation helper.
cv_fm('l2_reg_V', [0.4, 0.5, 0.6])

          TrainingSet  ValidationSet
l2_reg_V                            
0.4          0.920071       0.806509
0.5          0.920067       0.806510
0.6          0.920065       0.806509
