In [1]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np

import gc

# Root directory of the challenge data; every other path in this notebook is
# built by string concatenation from this value, so it must end with "/".
data = "/mnt/d/Data/mangaki-data-challenge/latest/"

# Make sure automatic garbage collection is switched on.
gc.enable()

In [2]:
# Directory that receives the level-1 prediction files written by the
# cv() / testprob() functions below (resolved relative to `data`).
featurepath = data+"../features/level1/"

In [24]:
# LightGBM parameters for the random-forest model (boosting_type='rf').
# NOTE: `params` is a module-level name that the functions defined in this
# cell read at call time; later cells rebind it for other models.
params = dict(
    task='train',
    boosting_type='rf',
    objective='binary',
    num_leaves=550,
    min_data_in_leaf=6,
    feature_fraction=0.55,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l1=0,
    min_gain_to_split=0,
    verbose=0,
)

def training(train, valid):
    """Fit a LightGBM model on `train` and predict for `valid`.

    Both frames must contain the columns 'user_id', 'work_id' and 'rating';
    these are dropped to form the feature matrix and 'rating' of `train` is
    used as the label. The model is trained with the module-level `params`
    for 116 boosting rounds.

    Returns whatever `Booster.predict` yields for the `valid` features.
    """
    non_features = ['user_id', 'work_id', 'rating']
    train_set = lgb.Dataset(train.drop(non_features, axis=1), train['rating'])
    booster = lgb.train(params, train_set, num_boost_round=116)
    return booster.predict(valid.drop(non_features, axis=1))

def cv():
    """Write random-forest predictions for the validation part of folds 1-3.

    For each fold, reads train_{fold}.csv and valid_{fold}.csv from `data`,
    casts 'item_category' to pandas category dtype, trains via `training()`
    and keeps user_id / work_id / rating plus the prediction as 'rf_prob'.
    The per-fold frames are concatenated and saved to
    featurepath + 'rf_train.csv' (no index column).
    """
    fold_frames = []
    for fold in (1, 2, 3):
        fold_train = pd.read_csv(data + 'train_{0}.csv'.format(str(fold)))
        fold_valid = pd.read_csv(data + 'valid_{0}.csv'.format(str(fold)))
        for frame in (fold_train, fold_valid):
            frame['item_category'] = frame.item_category.astype('category')

        scored = fold_valid[['user_id', 'work_id', 'rating']].copy()
        scored['rf_prob'] = training(fold_train, fold_valid)
        fold_frames.append(scored)

    stacked = pd.concat(fold_frames)
    stacked.to_csv(featurepath + 'rf_train.csv', index=False)
    
def testprob():
    """Train the random forest on train_0.csv and score test_0.csv.

    Both files are read from `data`; 'item_category' is cast to category
    dtype in each. The model uses the module-level `params` and 116 boosting
    rounds. The test file is expected to have no 'rating' column: only
    'user_id' and 'work_id' are dropped before predicting.

    Writes user_id / work_id / rf_prob to featurepath + 'rf_test.csv'.
    """
    full_train = pd.read_csv(data + 'train_0.csv')
    test = pd.read_csv(data + 'test_0.csv')
    for frame in (full_train, test):
        frame['item_category'] = frame.item_category.astype('category')

    train_features = full_train.drop(['user_id', 'work_id', 'rating'], axis=1)
    train_set = lgb.Dataset(train_features, full_train['rating'])
    booster = lgb.train(params, train_set, num_boost_round=116)

    test_pred = booster.predict(test.drop(['user_id', 'work_id'], axis=1))
    out = test[['user_id', 'work_id']].copy()
    out['rf_prob'] = test_pred
    out.to_csv(featurepath + 'rf_test.csv', index=False)

In [26]:
# LightGBM parameters for the gradient-boosted-trees model
# (boosting_type='gbdt'). This rebinds the module-level `params`, which the
# functions defined in this cell read when they are called.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    num_leaves=40,
    min_data_in_leaf=90,
    feature_fraction=0.6,
    bagging_fraction=0.96,
    bagging_freq=1,
    lambda_l1=0,
    verbose=0,
)

def training(train, valid):
    """Train a LightGBM booster on `train` and return predictions for `valid`.

    The identifier columns 'user_id' and 'work_id' and the target 'rating'
    are removed from both frames to build the feature matrices; `train`'s
    'rating' column is the label. Uses the module-level `params` and
    119 boosting rounds.
    """
    drop_cols = ['user_id', 'work_id', 'rating']
    features = train.drop(drop_cols, axis=1)
    labels = train['rating']
    model = lgb.train(params, lgb.Dataset(features, labels), num_boost_round=119)
    valid_pred = model.predict(valid.drop(drop_cols, axis=1))
    return valid_pred

def cv():
    """Write gbdt predictions for the validation part of folds 1-3.

    Each fold's train_{fold}.csv / valid_{fold}.csv pair is read from `data`,
    'item_category' is converted to category dtype, and `training()` supplies
    the validation predictions. The result keeps user_id, work_id and rating
    and stores the prediction in 'gbdt_prob'. All folds are concatenated
    and written to featurepath + 'gbdt_train.csv' without the index.
    """
    def _load(name, fold):
        # Read one fold file and mark item_category as categorical.
        frame = pd.read_csv(data + name.format(str(fold)))
        frame['item_category'] = frame.item_category.astype('category')
        return frame

    per_fold = []
    for fold in (1, 2, 3):
        tr = _load('train_{0}.csv', fold)
        va = _load('valid_{0}.csv', fold)
        result = va[['user_id', 'work_id', 'rating']].copy()
        result['gbdt_prob'] = training(tr, va)
        per_fold.append(result)
    pd.concat(per_fold).to_csv(featurepath + 'gbdt_train.csv', index=False)
    
def testprob():
    """Train the gbdt model on train_0.csv and score test_0.csv.

    Both files are read from `data` and 'item_category' is cast to category
    dtype. The model is trained with the module-level `params`. The test
    file is expected to have no 'rating' column: only 'user_id' and
    'work_id' are dropped before predicting.

    Writes user_id / work_id / gbdt_prob to featurepath + 'gbdt_test.csv'.
    """
    t = pd.read_csv(data+'train_0.csv')
    v = pd.read_csv(data+'test_0.csv')
    t['item_category'] = t.item_category.astype('category')
    v['item_category'] = v.item_category.astype('category')
    X = lgb.Dataset(t.drop(['user_id', 'work_id', 'rating'], axis=1), t['rating'])
    # Use the same number of rounds as the gbdt `training()` above (119), so
    # the test-set feature comes from the same model configuration as the
    # cross-validated feature it is stacked with. The previous value (116)
    # was the round count of the random-forest cell.
    gbdt = lgb.train(params, X, num_boost_round=119)
    Yvp = gbdt.predict(v.drop(['user_id', 'work_id'], axis=1))
    f = v[['user_id', 'work_id']].copy()
    f['gbdt_prob'] = Yvp
    f.to_csv(featurepath+'gbdt_test.csv', index=False)

In [34]:
# for fm
from scipy.sparse import hstack
from fastFM import sgd
from sklearn.datasets import load_svmlight_file

# Directory holding the svmlight-format files consumed by the FM model.
libfmpath = "/mnt/d/Data/mangaki-data-challenge/libfm/"

# Keyword arguments forwarded to sgd.FMClassification(**params).
# This rebinds the module-level `params` used by the functions in this cell.
params = dict(
    n_iter=340000,
    init_stdev=0.01,
    l2_reg_w=0.00025,
    l2_reg_V=0.5,
    rank=1,
    step_size=0.02,
)

def train_fm(train, valid, trainy):
    """Fit a factorization-machine classifier and score `valid`.

    Parameters
    ----------
    train, valid : feature matrices passed to `fit` / `predict_proba`.
    trainy : numeric label array for `train`. It is rescaled with
        ``trainy*2-1`` (so 0 -> -1 and 1 -> +1) before fitting; this
        presumably matches the +/-1 label convention of fastFM -- confirm
        that the input labels are 0/1.

    Returns
    -------
    The output of ``fm.predict_proba(valid)``.
    """
    fm = sgd.FMClassification(**params)
    # `np.int` was only an alias of the builtin `int` and has been removed
    # from NumPy (1.24+); the builtin is equivalent and works everywhere.
    fm.fit(train, np.require(trainy*2-1, dtype=int))
    return fm.predict_proba(valid)

def cv():
    """Write FM predictions for the validation part of folds 1-3.

    For each fold the svmlight files train_{fold}.csv and valid_{fold}.csv
    under `libfmpath` provide the model inputs, while valid_{fold}.csv under
    `data` provides user_id / work_id / rating for the output rows. The
    prediction is stored as 'fm_prob'; rows of the two validation files are
    assumed to be in the same order (nothing here checks it beyond length).

    The concatenated result is written to featurepath + 'fm_train.csv'.
    """
    # Local import so this function does not depend on edits to other cells.
    from sklearn.datasets import load_svmlight_files

    rst = []
    for fold in [1,2,3]:
        # Load both files in ONE call: this guarantees that train and valid
        # get the same number of columns and that the zero-/one-based index
        # heuristic is decided jointly, so feature columns cannot end up
        # misaligned between the two matrices.
        train, ty, valid, _ = load_svmlight_files([
            libfmpath+"train_{0}.csv".format(fold),
            libfmpath+"valid_{0}.csv".format(fold),
        ])
        pv = train_fm(train, valid, ty)
        v = pd.read_csv(data+'valid_{0}.csv'.format(str(fold)))
        f = v[['user_id', 'work_id', 'rating']].copy()
        f['fm_prob'] = pv
        rst.append(f)
    pd.concat(rst).to_csv(featurepath+'fm_train.csv', index=False)
    
def testprob():
    """Train the FM model on train_0 and score test_0.

    Model inputs come from the svmlight files train_0.csv / test_0.csv under
    `libfmpath`; user_id / work_id for the output rows come from test_0.csv
    under `data` (rows are assumed to be in the same order in both test
    files). Labels are rescaled with ``ty*2-1`` before fitting.

    Writes user_id / work_id / fm_prob to featurepath + 'fm_test.csv'.
    """
    # Local import so this function does not depend on edits to other cells.
    from sklearn.datasets import load_svmlight_files

    v = pd.read_csv(data+'test_0.csv')
    # Load train and test together with the SAME index convention. Previously
    # train used zero_based=True while test used the "auto" heuristic, which
    # could shift the test columns by one relative to train; the feature
    # count was also inferred per file and could differ.
    train, ty, test, _ = load_svmlight_files(
        [libfmpath+"train_0.csv", libfmpath+"test_0.csv"],
        zero_based=True,
    )
    fm = sgd.FMClassification(**params)
    # `np.int` has been removed from NumPy (1.24+); builtin `int` is the
    # type it aliased.
    fm.fit(train, np.require(ty*2-1, dtype=int))
    Y = fm.predict_proba(test)
    f = v[['user_id', 'work_id']].copy()
    f['fm_prob'] = Y
    f.to_csv(featurepath+'fm_test.csv', index=False)

In [35]:
# Runs whichever `cv` / `testprob` definitions were executed most recently.
# These names are defined three times in this notebook (rf, gbdt, fm), so this
# cell has to be re-run after each defining cell to produce all three outputs.
cv()
testprob()

In [38]:
# Level-1 predictions written by the cv() functions for the validation folds:
# one probability column per model, keyed by (user_id, work_id).
s1 = pd.read_csv(data+"../features/level1/gbdt_train.csv", usecols = ['user_id','work_id','gbdt_prob'])
s2 = pd.read_csv(data+"../features/level1/rf_train.csv", usecols = ['user_id','work_id','rf_prob'])
s3 = pd.read_csv(data+"../features/level1/fm_train.csv", usecols = ['user_id','work_id','fm_prob'])

# Attach the three probability columns to the baseline train/valid file of
# every fold. The fold list never contains 0, so only valid_{i}.csv files are
# read here (the former `else` branch for test_{i}.csv could never run).
for i in [1, 2, 3]:
    train = pd.read_csv(data+"../baseline/train_{0}.csv".format(i), usecols = ['user_id','work_id','rating'])
    valid = pd.read_csv(data+"../baseline/valid_{0}.csv".format(i), usecols = ['user_id','work_id','rating'])

    # Left joins keep every baseline row; a row without a matching
    # prediction gets NaN in the corresponding *_prob column.
    train = (
        train
        .merge(s1, on=['user_id', 'work_id'], how='left')
        .merge(s2, on=['user_id', 'work_id'], how='left')
        .merge(s3, on=['user_id', 'work_id'], how='left')
    )
    valid = (
        valid
        .merge(s1, on=['user_id', 'work_id'], how='left')
        .merge(s2, on=['user_id', 'work_id'], how='left')
        .merge(s3, on=['user_id', 'work_id'], how='left')
    )

    train.to_csv(data+"../baseline/level1/train_{0}.csv".format(i), index=False)
    valid.to_csv(data+"../baseline/level1/valid_{0}.csv".format(i), index=False)
    


In [39]:
# Fold 0, training side: join the level-1 predictions currently held in
# s1/s2/s3 (loaded by the previous cell from the *_train.csv files) onto the
# baseline train_0 rows and save the result.
train = (
    pd.read_csv(data+"../baseline/train_0.csv", usecols = ['user_id','work_id','rating'])
    .merge(s1, on=['user_id', 'work_id'], how='left')
    .merge(s2, on=['user_id', 'work_id'], how='left')
    .merge(s3, on=['user_id', 'work_id'], how='left')
)
train.to_csv(data+"../baseline/level1/train_0.csv", index=False)

# Fold 0, test side: rebind s1/s2/s3 to the test-set predictions written by
# the testprob() functions, join them onto the baseline test_0 rows (which
# have no 'rating' column) and save the result as valid_0.csv.
s1 = pd.read_csv(data+"../features/level1/gbdt_test.csv", usecols = ['user_id','work_id','gbdt_prob'])
s2 = pd.read_csv(data+"../features/level1/rf_test.csv", usecols = ['user_id','work_id','rf_prob'])
s3 = pd.read_csv(data+"../features/level1/fm_test.csv", usecols = ['user_id','work_id','fm_prob'])
valid = (
    pd.read_csv(data+"../baseline/test_0.csv", usecols = ['user_id','work_id'])
    .merge(s1, on=['user_id', 'work_id'], how='left')
    .merge(s2, on=['user_id', 'work_id'], how='left')
    .merge(s3, on=['user_id', 'work_id'], how='left')
)
valid.to_csv(data+"../baseline/level1/valid_0.csv", index=False)