In [21]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import gc

# Ensure the cyclic garbage collector is switched on for this session.
gc.enable()

# Earlier gradient-boosted (GBDT) configuration, kept commented out for reference:
# params = dict(
#     task='train', boosting_type='gbdt', objective='binary', metric={'auc'},
#     num_leaves=60, min_data_in_leaf=60, feature_fraction=0.6,
#     bagging_fraction=0.96, bagging_freq=1, lambda_l1=0, verbose=0,
# )

# Active LightGBM configuration: random-forest boosting on a binary objective,
# scored with AUC. This dict is shared module-level state: `training` passes it
# to `lgb.train`, and `cv` overwrites one entry per swept value.
params = dict(
    task='train',
    boosting_type='rf',
    objective='binary',
    metric={'auc'},
    num_leaves=450,
    min_data_in_leaf=8,
    feature_fraction=0.65,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l1=0,
    min_gain_to_split=0,
    verbose=0,
)

In [1]:
import pandas as pd
import numpy as np

import os

# Directory holding the per-fold CSV files (train_<k>.csv / valid_<k>.csv).
# The original machine-specific location stays as the default; set the
# MANGAKI_DATA_DIR environment variable to run the notebook elsewhere.
data = os.environ.get("MANGAKI_DATA_DIR") or "/mnt/d/Data/mangaki-data-challenge/latest/"
# File names are appended by plain string concatenation, so the directory
# must end with a slash.
if not data.endswith("/"):
    data += "/"

In [5]:
def training(train, valid):
    """Train LightGBM on one fold and report ROC AUC on both splits.

    The model is configured from the module-level ``params`` dict. The
    columns ``user_id``, ``work_id`` and ``rating`` are excluded from the
    features; ``rating`` is the label.

    Parameters
    ----------
    train, valid : pandas.DataFrame
        Training and validation frames; both must contain the three
        columns named above.

    Returns
    -------
    tuple of float
        ``(train_auc, valid_auc)``.
    """
    non_feature_cols = ['user_id', 'work_id', 'rating']
    # Build each feature frame once and reuse it for both the Dataset and
    # the prediction call, instead of re-dropping the columns every time.
    train_features = train.drop(non_feature_cols, axis=1)
    valid_features = valid.drop(non_feature_cols, axis=1)

    X = lgb.Dataset(train_features, train['rating'])
    V = lgb.Dataset(valid_features, valid['rating'], reference=X)
    # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword
    # arguments are no longer accepted by `lgb.train` in LightGBM >= 4.0
    # (callbacks replace them) -- confirm the pinned LightGBM version.
    gbdt = lgb.train(params, X, valid_sets=[X, V], num_boost_round=200,
                     early_stopping_rounds=20, verbose_eval=False)

    Yvp = gbdt.predict(valid_features, num_iteration=gbdt.best_iteration)
    Ytp = gbdt.predict(train_features, num_iteration=gbdt.best_iteration)
    return (roc_auc_score(train['rating'].values, Ytp),
            roc_auc_score(valid['rating'].values, Yvp))

In [6]:
def cv(param, paramlst):
    """Sweep one LightGBM parameter over three pre-split folds and print mean AUCs.

    For every value in ``paramlst`` the entry ``params[param]`` of the
    module-level ``params`` dict is overwritten, ``training`` is run on
    folds 1-3, and the train/validation AUCs are averaged over the folds.
    The resulting table (one row per value) is printed.

    Parameters
    ----------
    param : str
        Key of ``params`` to vary.
    paramlst : list
        Values to try for that key, in order.

    Notes
    -----
    ``params[param]`` is left set to the last value of ``paramlst`` when
    the function returns.
    """
    n_values = len(paramlst)
    fold_train_auc = [[] for _ in range(n_values)]
    fold_valid_auc = [[] for _ in range(n_values)]

    # Folds form the outer loop so that each CSV is read once per sweep
    # rather than once per swept value; only one fold is held in memory.
    for fold in [1, 2, 3]:
        t = pd.read_csv(data + 'train_{0}.csv'.format(fold))
        v = pd.read_csv(data + 'valid_{0}.csv'.format(fold))
        for i, p in enumerate(paramlst):
            params[param] = p
            train_auc, valid_auc = training(t, v)
            fold_train_auc[i].append(train_auc)
            fold_valid_auc[i].append(valid_auc)

    trainauc = [np.mean(scores) for scores in fold_train_auc]
    validauc = [np.mean(scores) for scores in fold_valid_auc]
    paramtable = pd.DataFrame({
        'TrainingSet': trainauc,
        'ValidationSet': validauc
    }, columns=['TrainingSet', 'ValidationSet'], index=pd.Index(paramlst, name=param))
    print(paramtable)

In [22]:
# Sweep the LightGBM bagging fraction over five candidate values.
cv(param='bagging_fraction', paramlst=[0.7, 0.75, 0.8, 0.85, 0.9])

                  TrainingSet  ValidationSet
bagging_fraction                            
0.70                 0.992383       0.814797
0.75                 0.994498       0.813006
0.80                 0.996019       0.816506
0.85                 0.996705       0.814585
0.90                 0.997960       0.815494


LibFM

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_svmlight_file

import os

# Directory holding the per-fold svmlight-format files read by `cv_fm`.
# The original machine-specific location stays as the default; set the
# MANGAKI_LIBFM_DIR environment variable to run the notebook elsewhere.
path = os.environ.get("MANGAKI_LIBFM_DIR") or "/mnt/d/Data/mangaki-data-challenge/libfm/"
# File names are appended by plain string concatenation, so the directory
# must end with a slash.
if not path.endswith("/"):
    path += "/"

In [35]:
# Keyword arguments forwarded to `sgd.FMClassification` by `train_fm`;
# `cv_fm` overwrites one entry per swept value.
# NOTE(review): this rebinds the same global name `params` that the LightGBM
# section uses for its own configuration -- re-run that section's config cell
# before calling `training` / `cv` again.
params = dict(
    n_iter=300000,
    init_stdev=0.01,
    l2_reg_w=0.0001,
    l2_reg_V=0.5,
    rank=1,
    step_size=0.02,
)

In [4]:
from scipy.sparse import hstack
from fastFM import sgd

def train_fm(train, valid, trainy, validy):
    """Fit a fastFM SGD classifier on one fold and report ROC AUC on both splits.

    The classifier is built from the module-level ``params`` dict.

    Parameters
    ----------
    train, valid
        Feature matrices for the training and validation split.
    trainy, validy
        Label arrays for the two splits. ``trainy`` is remapped with
        ``trainy * 2 - 1`` before fitting -- presumably the labels are 0/1
        and the classifier expects -1/+1; verify against the data files.

    Returns
    -------
    tuple of float
        ``(train_auc, valid_auc)``, each scored against the original labels.
    """
    fm = sgd.FMClassification(**params)
    # `np.int` was only an alias of the builtin `int`; it was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    signed_labels = np.require(trainy * 2 - 1, dtype=int)
    fm.fit(train, signed_labels)
    train_auc = roc_auc_score(trainy, fm.predict_proba(train))
    valid_auc = roc_auc_score(validy, fm.predict_proba(valid))
    return (train_auc, valid_auc)

def cv_fm(param, paramlst):
    """Sweep one factorization-machine parameter over three folds and print mean AUCs.

    For every value in ``paramlst`` the entry ``params[param]`` of the
    module-level ``params`` dict is overwritten, ``train_fm`` is run on
    folds 1-3 (svmlight files under ``path``), and the train/validation
    AUCs are averaged over the folds. The resulting table (one row per
    value) is printed.

    Parameters
    ----------
    param : str
        Key of ``params`` to vary.
    paramlst : list
        Values to try for that key, in order.

    Notes
    -----
    ``params[param]`` is left set to the last value of ``paramlst`` when
    the function returns.
    """
    n_values = len(paramlst)
    fold_train_auc = [[] for _ in range(n_values)]
    fold_valid_auc = [[] for _ in range(n_values)]

    # Folds form the outer loop so that each svmlight file is parsed once per
    # sweep rather than once per swept value. Only the svmlight files are
    # needed: the per-fold CSV frames are not used by `train_fm`, so they are
    # not loaded here.
    for fold in [1, 2, 3]:
        train, ty = load_svmlight_file(path+"train_{0}.csv".format(fold))
        valid, vy = load_svmlight_file(path+"valid_{0}.csv".format(fold))
        for i, p in enumerate(paramlst):
            params[param] = p
            train_auc, valid_auc = train_fm(train, valid, ty, vy)
            fold_train_auc[i].append(train_auc)
            fold_valid_auc[i].append(valid_auc)

    trainauc = [np.mean(scores) for scores in fold_train_auc]
    validauc = [np.mean(scores) for scores in fold_valid_auc]
    paramtable = pd.DataFrame({
        'TrainingSet': trainauc,
        'ValidationSet': validauc
    }, columns=['TrainingSet', 'ValidationSet'], index=pd.Index(paramlst, name=param))
    print(paramtable)

In [36]:
# Compare factorization ranks 1 and 2.
# NOTE(review): the recorded output below shows identical AUCs for both ranks;
# verify that the swept `rank` value actually changes the fitted model.
cv_fm(param='rank', paramlst=[1, 2])

      TrainingSet  ValidationSet
rank                            
1        0.914027       0.805387
2        0.914027       0.805387
