In [40]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import gc

# Make sure the cyclic garbage collector is switched on before training.
gc.enable()

# LightGBM settings; training() reads this module-level dict on every call
# and cv() overwrites one entry at a time while sweeping a hyper-parameter.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'auc'},          # a set, exactly as LightGBM receives it
    num_leaves=40,
    min_data_in_leaf=90,
    feature_fraction=0.6,
    bagging_fraction=0.96,
    bagging_freq=1,
    lambda_l1=0,
    verbose=0,
)

# Alternative random-forest ('rf') configuration, kept for reference only (not active):
#params = {
#    'task': 'train',
#    'boosting_type': 'rf',
#    'objective': 'binary',
#    'metric': {'auc'},
#    'num_leaves': 450,
#    'min_data_in_leaf': 8,
#    'feature_fraction': 0.65,
#    'bagging_fraction': 0.8,
#    'bagging_freq': 1,
#    'lambda_l1': 0,
#    'min_gain_to_split': 0,
#    'verbose': 0
#}

In [2]:
import pandas as pd
import numpy as np

import os

# Directory holding the per-fold train_<k>.csv / valid_<k>.csv files.
# The default is the original local location; set MANGAKI_DATA_DIR to point
# the notebook at another copy without editing this cell.
# os.path.join(..., "") guarantees the trailing separator that the
# `data + 'train_...'` string concatenations elsewhere depend on.
data = os.path.join(
    os.environ.get("MANGAKI_DATA_DIR") or "/mnt/d/Data/mangaki-data-challenge/latest/",
    "",
)

In [41]:
def training(train, valid):
    """Fit a LightGBM model on one fold and score it on both splits.

    Parameters
    ----------
    train, valid : pandas.DataFrame
        Fold data. Each frame must contain the columns ``user_id``,
        ``work_id`` and ``rating``. ``rating`` is used as the label and
        every remaining column as a feature.

    Returns
    -------
    tuple
        ``(train_auc, valid_auc)``: ROC AUC of the model's predictions on
        the training frame and on the validation frame.

    Notes
    -----
    Hyper-parameters are taken from the module-level ``params`` dict as it
    is at call time, so whatever cell last bound ``params`` decides the
    configuration.
    """
    non_feature_cols = ['user_id', 'work_id', 'rating']

    # Build each feature matrix once and reuse it for both the Dataset and
    # the prediction call instead of re-dropping the same columns each time.
    train_features = train.drop(non_feature_cols, axis=1)
    valid_features = valid.drop(non_feature_cols, axis=1)

    X = lgb.Dataset(train_features, train['rating'])
    V = lgb.Dataset(valid_features, valid['rating'], reference=X)

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
    # arguments of older LightGBM releases; LightGBM >= 4.0 expects the
    # equivalent callbacks instead. Confirm the installed version before
    # upgrading.
    gbdt = lgb.train(params, X, valid_sets=[X, V], num_boost_round=200,
                     early_stopping_rounds=20, verbose_eval=True)

    # Score with the iteration picked by early stopping.
    Yvp = gbdt.predict(valid_features, num_iteration=gbdt.best_iteration)
    Ytp = gbdt.predict(train_features, num_iteration=gbdt.best_iteration)
    return (roc_auc_score(train['rating'].values, Ytp),
            roc_auc_score(valid['rating'].values, Yvp))

In [4]:
def cv(param, paramlst):
    """Sweep one LightGBM hyper-parameter over the three pre-split folds.

    For every candidate in ``paramlst`` the module-level ``params[param]`` is
    overwritten with that candidate (the last candidate stays set afterwards),
    ``train_<fold>.csv`` / ``valid_<fold>.csv`` are read from ``data`` for
    folds 1-3, and ``training`` is run on each pair. The per-fold AUCs are
    averaged and the resulting table (one row per candidate, columns
    ``TrainingSet`` and ``ValidationSet``) is printed.

    Parameters
    ----------
    param : str
        Key in ``params`` to vary; also used as the name of the table index.
    paramlst : list
        Candidate values, evaluated in the given order.

    Returns
    -------
    None
        The summary table is printed, not returned.
    """
    mean_train_auc = []
    mean_valid_auc = []
    for candidate in paramlst:
        # training() picks its configuration up from the shared dict.
        params[param] = candidate
        fold_train_auc = []
        fold_valid_auc = []
        for fold in (1, 2, 3):
            fold_tag = str(fold)
            t = pd.read_csv(data + 'train_{0}.csv'.format(fold_tag))
            v = pd.read_csv(data + 'valid_{0}.csv'.format(fold_tag))
            # item_category is kept and cast to pandas' 'category' dtype
            # (the alternative of dropping the column is not used).
            t['item_category'] = t['item_category'].astype('category')
            v['item_category'] = v['item_category'].astype('category')
            train_auc, valid_auc = training(t, v)
            fold_train_auc.append(train_auc)
            fold_valid_auc.append(valid_auc)
        mean_train_auc.append(np.mean(fold_train_auc))
        mean_valid_auc.append(np.mean(fold_valid_auc))

    summary_index = pd.Index(paramlst, name=param)
    paramtable = pd.DataFrame(
        {'TrainingSet': mean_train_auc, 'ValidationSet': mean_valid_auc},
        columns=['TrainingSet', 'ValidationSet'],
        index=summary_index,
    )
    print(paramtable)

In [42]:
# Evaluate a single candidate (bagging_fraction = 0.96) across the three folds;
# prints LightGBM's per-iteration log followed by the averaged AUC table.
cv('bagging_fraction', [0.96])

[1]	training's auc: 0.777467	valid_1's auc: 0.732952
Training until validation scores don't improve for 20 rounds.
[2]	training's auc: 0.803539	valid_1's auc: 0.760022
[3]	training's auc: 0.816687	valid_1's auc: 0.764131
[4]	training's auc: 0.827921	valid_1's auc: 0.768678
[5]	training's auc: 0.835091	valid_1's auc: 0.775039
[6]	training's auc: 0.842301	valid_1's auc: 0.778743
[7]	training's auc: 0.845964	valid_1's auc: 0.782579
[8]	training's auc: 0.849573	valid_1's auc: 0.784146
[9]	training's auc: 0.854824	valid_1's auc: 0.787278
[10]	training's auc: 0.858942	valid_1's auc: 0.790071
[11]	training's auc: 0.861627	valid_1's auc: 0.791476
[12]	training's auc: 0.866929	valid_1's auc: 0.795238
[13]	training's auc: 0.869726	valid_1's auc: 0.797846
[14]	training's auc: 0.873382	valid_1's auc: 0.797635
[15]	training's auc: 0.876717	valid_1's auc: 0.798233
[16]	training's auc: 0.879926	valid_1's auc: 0.798472
[17]	training's auc: 0.883144	valid_1's auc: 0.799084
[18]	training's auc: 0.886556

[28]	training's auc: 0.912258	valid_1's auc: 0.815867
[29]	training's auc: 0.914527	valid_1's auc: 0.816239
[30]	training's auc: 0.916307	valid_1's auc: 0.816684
[31]	training's auc: 0.918285	valid_1's auc: 0.818916
[32]	training's auc: 0.919975	valid_1's auc: 0.819939
[33]	training's auc: 0.922011	valid_1's auc: 0.82143
[34]	training's auc: 0.923676	valid_1's auc: 0.822019
[35]	training's auc: 0.925377	valid_1's auc: 0.822603
[36]	training's auc: 0.926926	valid_1's auc: 0.823445
[37]	training's auc: 0.929107	valid_1's auc: 0.823939
[38]	training's auc: 0.930971	valid_1's auc: 0.824825
[39]	training's auc: 0.932764	valid_1's auc: 0.825621
[40]	training's auc: 0.934387	valid_1's auc: 0.825067
[41]	training's auc: 0.936272	valid_1's auc: 0.826173
[42]	training's auc: 0.93794	valid_1's auc: 0.825915
[43]	training's auc: 0.939454	valid_1's auc: 0.826597
[44]	training's auc: 0.941408	valid_1's auc: 0.827152
[45]	training's auc: 0.943238	valid_1's auc: 0.827387
[46]	training's auc: 0.944924	

[28]	training's auc: 0.913981	valid_1's auc: 0.822738
[29]	training's auc: 0.916194	valid_1's auc: 0.8239
[30]	training's auc: 0.918072	valid_1's auc: 0.824585
[31]	training's auc: 0.920819	valid_1's auc: 0.825605
[32]	training's auc: 0.922847	valid_1's auc: 0.826776
[33]	training's auc: 0.924905	valid_1's auc: 0.826911
[34]	training's auc: 0.926465	valid_1's auc: 0.828425
[35]	training's auc: 0.928885	valid_1's auc: 0.828792
[36]	training's auc: 0.930658	valid_1's auc: 0.828779
[37]	training's auc: 0.932842	valid_1's auc: 0.829464
[38]	training's auc: 0.934601	valid_1's auc: 0.830127
[39]	training's auc: 0.93595	valid_1's auc: 0.830527
[40]	training's auc: 0.937928	valid_1's auc: 0.830748
[41]	training's auc: 0.9396	valid_1's auc: 0.831177
[42]	training's auc: 0.941305	valid_1's auc: 0.831954
[43]	training's auc: 0.943038	valid_1's auc: 0.832215
[44]	training's auc: 0.944584	valid_1's auc: 0.833336
[45]	training's auc: 0.94611	valid_1's auc: 0.833896
[46]	training's auc: 0.947333	vali

LibFM

In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_svmlight_file

import os

# Directory holding the per-fold svmlight-format files that cv_fm() loads.
# The default is the original local location; set MANGAKI_LIBFM_DIR to use
# another copy without editing this cell. os.path.join(..., "") guarantees
# the trailing separator that the `path + "train_..."` concatenations need.
path = os.path.join(
    os.environ.get("MANGAKI_LIBFM_DIR") or "/mnt/d/Data/mangaki-data-challenge/libfm/",
    "",
)

In [25]:
# Keyword arguments that train_fm() forwards to sgd.FMClassification.
# NOTE: this rebinds the module-level name `params`, which the LightGBM
# helper training() also reads -- re-run the LightGBM params cell before
# going back to that section.
params = dict(
    n_iter=340000,
    init_stdev=0.01,
    l2_reg_w=0.00025,
    l2_reg_V=0.5,
    rank=1,
    step_size=0.02,
)

In [13]:
from scipy.sparse import hstack
from fastFM import sgd

def train_fm(train, valid, trainy, validy):
    """Fit a fastFM SGD classifier on one fold and score it on both splits.

    Parameters
    ----------
    train, valid
        Feature matrices handed to ``fit`` / ``predict_proba`` (``cv_fm``
        passes the matrices returned by ``load_svmlight_file``).
    trainy, validy : numpy.ndarray
        Labels for the two splits. ``trainy`` is remapped with ``y * 2 - 1``
        before fitting, which turns 0/1 labels into -1/+1 (presumably the
        encoding ``FMClassification`` expects -- confirm against fastFM).

    Returns
    -------
    tuple
        ``(train_auc, valid_auc)`` computed with ``roc_auc_score`` against
        the original (un-remapped) labels.

    Notes
    -----
    The model is built from the module-level ``params`` dict as it is at
    call time.
    """
    fm = sgd.FMClassification(**params)
    # `np.int` was only an alias of the builtin `int` and was removed in
    # NumPy 1.24; using `int` directly keeps the same dtype and works on
    # every NumPy version.
    signed_labels = np.require(trainy * 2 - 1, dtype=int)
    fm.fit(train, signed_labels)
    train_auc = roc_auc_score(trainy, fm.predict_proba(train))
    valid_auc = roc_auc_score(validy, fm.predict_proba(valid))
    return (train_auc, valid_auc)

def cv_fm(param, paramlst):
    """Sweep one fastFM hyper-parameter over the three pre-split folds.

    For every candidate in ``paramlst`` the module-level ``params[param]`` is
    overwritten with that candidate (the last candidate stays set afterwards),
    the svmlight files ``train_<fold>.csv`` / ``valid_<fold>.csv`` are loaded
    from ``path`` for folds 1-3, and ``train_fm`` is run on each pair. The
    per-fold AUCs are averaged and the resulting table (one row per
    candidate, columns ``TrainingSet`` and ``ValidationSet``) is printed.

    Parameters
    ----------
    param : str
        Key in ``params`` to vary; also used as the name of the table index.
    paramlst : list
        Candidate values, evaluated in the given order.

    Returns
    -------
    None
        The summary table is printed, not returned.
    """
    trainauc = []
    validauc = []
    for p in paramlst:
        # train_fm() builds its model from the shared dict.
        params[param] = p
        tv = []
        vv = []
        for fold in [1, 2, 3]:
            # Only the svmlight files are needed. The dense CSVs that used
            # to be read here as well were never used and tied this
            # function to the LightGBM section's `data` path.
            train, ty = load_svmlight_file(path + "train_{0}.csv".format(fold))
            valid, vy = load_svmlight_file(path + "valid_{0}.csv".format(fold))
            fold_train_auc, fold_valid_auc = train_fm(train, valid, ty, vy)
            tv.append(fold_train_auc)
            vv.append(fold_valid_auc)
        trainauc.append(np.mean(tv))
        validauc.append(np.mean(vv))
    paramtable = pd.DataFrame({
        'TrainingSet': trainauc,
        'ValidationSet': validauc
    }, columns=['TrainingSet', 'ValidationSet'], index=pd.Index(paramlst, name=param))
    print(paramtable)

In [26]:
# Compare three l2_reg_V candidates across the three folds; prints the
# averaged training / validation AUC for each candidate.
cv_fm('l2_reg_V', [0.4, 0.5, 0.6])

          TrainingSet  ValidationSet
l2_reg_V                            
0.4          0.920071       0.806509
0.5          0.920067       0.806510
0.6          0.920065       0.806509
