In [2]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import gc

gc.enable()

params = {
    'task': 'train',
    'boosting_type': 'rf',
    'objective': 'binary',
    'metric': {'auc'},
    'num_leaves': 240,
    'min_data_in_leaf': 10,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0,
    'verbose': 0
}

In [3]:
import pandas as pd
import numpy as np

data = "/mnt/d/Data/mangaki-data-challenge/latest/"

In [15]:
def training(train, valid):
    X = lgb.Dataset(train.drop(['user_id', 'work_id', 'rating'], axis=1), train['rating'])
    V = lgb.Dataset(valid.drop(['user_id', 'work_id', 'rating'], axis=1), valid['rating'], reference=X)
    gbdt = lgb.train(params, X, valid_sets=[X,V], num_boost_round=200, early_stopping_rounds=20, verbose_eval=True)
    Yvp = gbdt.predict(valid.drop(['user_id', 'work_id', 'rating'], axis=1), num_iteration=gbdt.best_iteration)
    Ytp = gbdt.predict(train.drop(['user_id', 'work_id', 'rating'], axis=1), num_iteration=gbdt.best_iteration)
    return (roc_auc_score(train['rating'].values, Ytp), roc_auc_score(valid['rating'].values, Yvp))

In [4]:
def cv(param, paramlst):
    trainauc = [0.0]*len(paramlst)
    validauc = [0.0]*len(paramlst)
    for i, p in enumerate(paramlst):
        params[param]=p
        tv = [0,0,0]
        vv = [0,0,0]
        for fold in [1,2,3]:
            t = pd.read_csv(data+'train_{0}.csv'.format(str(fold)))
            v = pd.read_csv(data+'valid_{0}.csv'.format(str(fold)))
            #t['item_category'] = t.item_category.astype('category')
            #v['item_category'] = v.item_category.astype('category')
            tv[fold-1], vv[fold-1] = training(t, v)
        trainauc[i]=np.mean(tv)
        validauc[i]=np.mean(vv)
    paramtable = pd.DataFrame({
        'TrainingSet': trainauc,
        'ValidationSet': validauc
    }, columns=['TrainingSet', 'ValidationSet'], index=pd.Index(paramlst, name=param))
    print(paramtable)

In [16]:
cv('bagging_fraction', [0.8])

[1]	training's auc: 0.870701	valid_1's auc: 0.719176
Training until validation scores don't improve for 20 rounds.
[2]	training's auc: 0.916011	valid_1's auc: 0.753267
[3]	training's auc: 0.938291	valid_1's auc: 0.76771
[4]	training's auc: 0.951054	valid_1's auc: 0.779561
[5]	training's auc: 0.957918	valid_1's auc: 0.786861
[6]	training's auc: 0.960697	valid_1's auc: 0.793694
[7]	training's auc: 0.963753	valid_1's auc: 0.802363
[8]	training's auc: 0.964765	valid_1's auc: 0.80446
[9]	training's auc: 0.965855	valid_1's auc: 0.805597
[10]	training's auc: 0.967222	valid_1's auc: 0.807962
[11]	training's auc: 0.968076	valid_1's auc: 0.808422
[12]	training's auc: 0.968701	valid_1's auc: 0.81024
[13]	training's auc: 0.96957	valid_1's auc: 0.810445
[14]	training's auc: 0.969613	valid_1's auc: 0.810603
[15]	training's auc: 0.970426	valid_1's auc: 0.811045
[16]	training's auc: 0.97077	valid_1's auc: 0.812166
[17]	training's auc: 0.970839	valid_1's auc: 0.811681
[18]	training's auc: 0.971465	vali

[37]	training's auc: 0.976066	valid_1's auc: 0.820255
[38]	training's auc: 0.976116	valid_1's auc: 0.820566
[39]	training's auc: 0.976047	valid_1's auc: 0.820753
[40]	training's auc: 0.976176	valid_1's auc: 0.820974
[41]	training's auc: 0.976165	valid_1's auc: 0.821105
[42]	training's auc: 0.976252	valid_1's auc: 0.821381
[43]	training's auc: 0.976324	valid_1's auc: 0.821623
[44]	training's auc: 0.97642	valid_1's auc: 0.821969
[45]	training's auc: 0.976504	valid_1's auc: 0.822305
[46]	training's auc: 0.976558	valid_1's auc: 0.822378
[47]	training's auc: 0.976637	valid_1's auc: 0.822656
[48]	training's auc: 0.976804	valid_1's auc: 0.82287
[49]	training's auc: 0.976753	valid_1's auc: 0.823019
[50]	training's auc: 0.976783	valid_1's auc: 0.822896
[51]	training's auc: 0.976805	valid_1's auc: 0.823107
[52]	training's auc: 0.976737	valid_1's auc: 0.823504
[53]	training's auc: 0.976786	valid_1's auc: 0.823809
[54]	training's auc: 0.976928	valid_1's auc: 0.823807
[55]	training's auc: 0.976892	

[110]	training's auc: 0.977871	valid_1's auc: 0.826343
[111]	training's auc: 0.977871	valid_1's auc: 0.826409
[112]	training's auc: 0.977866	valid_1's auc: 0.826462
[113]	training's auc: 0.977876	valid_1's auc: 0.826534
[114]	training's auc: 0.977863	valid_1's auc: 0.82663
[115]	training's auc: 0.977878	valid_1's auc: 0.826645
[116]	training's auc: 0.977923	valid_1's auc: 0.826574
[117]	training's auc: 0.977921	valid_1's auc: 0.826548
[118]	training's auc: 0.977945	valid_1's auc: 0.826514
[119]	training's auc: 0.977902	valid_1's auc: 0.82667
[120]	training's auc: 0.977917	valid_1's auc: 0.826711
[121]	training's auc: 0.977903	valid_1's auc: 0.826594
[122]	training's auc: 0.977902	valid_1's auc: 0.826538
[123]	training's auc: 0.977924	valid_1's auc: 0.826604
[124]	training's auc: 0.977939	valid_1's auc: 0.826637
[125]	training's auc: 0.97792	valid_1's auc: 0.826761
[126]	training's auc: 0.977894	valid_1's auc: 0.826651
[127]	training's auc: 0.977885	valid_1's auc: 0.826793
[128]	trainin

Generate cv fold results for stacking

In [17]:
filename = 'rf_w2v_lda_lsi_'

In [18]:
def training_forstacking(train, valid):
    X = lgb.Dataset(train.drop(['user_id', 'work_id', 'rating'], axis=1), train['rating'])
    gbdt = lgb.train(params, X, num_boost_round=93)
    Yvp = gbdt.predict(valid.drop(['user_id', 'work_id', 'rating'], axis=1))
    return Yvp

def cv_forstacking():
    rst = []
    for fold in [1,2,3]:
        t = pd.read_csv(data+'train_{0}.csv'.format(str(fold)))
        v = pd.read_csv(data+'valid_{0}.csv'.format(str(fold)))
        #t['item_category'] = t.item_category.astype('category')
        #v['item_category'] = v.item_category.astype('category')
        pv = training_forstacking(t, v)
        f = v[['user_id', 'work_id', 'rating']].copy()
        f['rf_w2v_lda_lsi_prob'] = pv
        rst.append(f)
    pd.concat(rst).to_csv('/mnt/d/Data/mangaki-data-challenge/features/level1/'+filename+"train.csv", index=False)

In [19]:
cv_forstacking()

In [20]:
def testprob():
    t = pd.read_csv(data+'train_0.csv')
    v = pd.read_csv(data+'test_0.csv')
    #t['item_category'] = t.item_category.astype('category')
    #v['item_category'] = v.item_category.astype('category')
    X = lgb.Dataset(t.drop(['user_id', 'work_id', 'rating'], axis=1), t['rating'])
    gbdt = lgb.train(params, X, num_boost_round=93)
    Yvp = gbdt.predict(v.drop(['user_id', 'work_id'], axis=1))
    f = v[['user_id', 'work_id']].copy()
    f['rf_w2v_lda_lsi_prob'] = Yvp
    f.to_csv('/mnt/d/Data/mangaki-data-challenge/features/level1/'+filename+"test.csv", index=False)

In [21]:
testprob()

LibFM

In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_svmlight_file

path = "/mnt/d/Data/mangaki-data-challenge/libfm/"

In [25]:
params = {
    'n_iter':340000, 
    'init_stdev':0.01,
    'l2_reg_w':0.00025,
    'l2_reg_V':0.5,
    'rank':1,
    'step_size':0.02
}

In [13]:
from scipy.sparse import hstack
from fastFM import sgd

def train_fm(train, valid, trainy, validy):
    fm = sgd.FMClassification(**params)
    fm.fit(train, np.require(trainy*2-1, dtype=np.int))
    return (roc_auc_score(trainy, fm.predict_proba(train)), roc_auc_score(validy, fm.predict_proba(valid)))

def cv_fm(param, paramlst):
    trainauc = [0.0]*len(paramlst)
    validauc = [0.0]*len(paramlst)
    for i, p in enumerate(paramlst):
        params[param]=p
        tv = [0,0,0]
        vv = [0,0,0]
        for fold in [1,2,3]:
            train, ty = load_svmlight_file(path+"train_{0}.csv".format(fold))
            valid, vy = load_svmlight_file(path+"valid_{0}.csv".format(fold))
            ta = pd.read_csv(data+'train_{0}.csv'.format(str(fold))).fillna(0)
            va = pd.read_csv(data+'valid_{0}.csv'.format(str(fold))).fillna(0)
            tv[fold-1], vv[fold-1] = train_fm(train, valid, ty, vy)
        trainauc[i]=np.mean(tv)
        validauc[i]=np.mean(vv)
    paramtable = pd.DataFrame({
        'TrainingSet': trainauc,
        'ValidationSet': validauc
    }, columns=['TrainingSet', 'ValidationSet'], index=pd.Index(paramlst, name=param))
    print(paramtable)

In [26]:
cv_fm('l2_reg_V', [0.4, 0.5, 0.6])

          TrainingSet  ValidationSet
l2_reg_V                            
0.4          0.920071       0.806509
0.5          0.920067       0.806510
0.6          0.920065       0.806509


level1 stacking cv

In [102]:
X = []
Xv = []
Y = []
Yv = []
for cv in [1,2,3]:
    t = pd.read_csv(data+'train_{0}.csv'.format(str(cv)))
    v = pd.read_csv(data+'valid_{0}.csv'.format(str(cv)))
    X.append(t.ix[:, 3:].values)
    Y.append(t.ix[:, 2].values)
    Xv.append(v.ix[:, 3:].values)
    Yv.append(v.ix[:, 2].values)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.05, 0.1, 0.2, 0.4, 0.5, 0.6, 0.7]}

t = pd.read_csv(data+'train_0.csv')
X = t.ix[:, 3:].values
Y = t.ix[:, 2].values

lr = LogisticRegression(penalty='l1')
lrcv = GridSearchCV(lr, param_grid, scoring='roc_auc', cv=5)
lrcv.fit(X, Y)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.05, 0.1, 0.2, 0.4, 0.5, 0.6, 0.7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [113]:
%pdb

Automatic pdb calling has been turned ON


In [10]:
lrcv.grid_scores_



[mean: 0.84403, std: 0.00680, params: {'C': 0.05},
 mean: 0.84436, std: 0.00675, params: {'C': 0.1},
 mean: 0.84436, std: 0.00684, params: {'C': 0.2},
 mean: 0.84438, std: 0.00692, params: {'C': 0.4},
 mean: 0.84439, std: 0.00693, params: {'C': 0.5},
 mean: 0.84439, std: 0.00694, params: {'C': 0.6},
 mean: 0.84439, std: 0.00694, params: {'C': 0.7}]

In [125]:
v = pd.read_csv(data+'test_0.csv')



[mean: 0.84171, std: 0.00621, params: {'penalty': 'l1', 'C': 0.05},
 mean: 0.84035, std: 0.00622, params: {'penalty': 'l2', 'C': 0.05},
 mean: 0.84218, std: 0.00623, params: {'penalty': 'l1', 'C': 0.1},
 mean: 0.84149, std: 0.00624, params: {'penalty': 'l2', 'C': 0.1},
 mean: 0.84217, std: 0.00628, params: {'penalty': 'l1', 'C': 0.2},
 mean: 0.84195, std: 0.00632, params: {'penalty': 'l2', 'C': 0.2},
 mean: 0.84207, std: 0.00636, params: {'penalty': 'l1', 'C': 0.4},
 mean: 0.84214, std: 0.00640, params: {'penalty': 'l2', 'C': 0.4}]