In [2]:
import os

import numpy as np
import pandas as pd

# Directory that holds the input CSV files.  It can be overridden with the
# MANGAKI_DATA_DIR environment variable so the notebook is not tied to one
# machine; without the variable the original location is used.
# os.path.join(..., "") guarantees a trailing path separator, which the
# `data + "<name>.csv"` concatenations throughout this notebook rely on.
data = os.path.join(
    os.environ.get("MANGAKI_DATA_DIR", "/mnt/d/Data/mangaki-data-challenge/"),
    "",
)

# watched.csv: (user_id, work_id, rating) rows; rating is kept as a categorical
# label and turned into a numeric score in a later cell.
record = pd.read_csv(data+'watched.csv', dtype={
    'user_id': np.int16,
    'work_id': np.int16,
    'rating': 'category'
})

# train_withcv.csv: (user_id, work_id, rating, cv) rows; the integer `cv`
# column is what later cells filter on to pick training / validation rows.
train_full = pd.read_csv(data+'train_withcv.csv', dtype={
    'user_id': np.int16,
    'work_id': np.int16,
    'rating': np.int8,
    'cv': np.int8
})

# test.csv: (user_id, work_id) pairs without a rating column.
test = pd.read_csv(data+'test.csv', dtype={
    'user_id': np.int16,
    'work_id': np.int16
})

In [3]:
# Per-user rating profile from `record`: count every user's ratings by rating
# label, prefix the columns with "user_", then scale each row to sum to 1.
u1 = pd.crosstab(record.user_id, record.rating).add_prefix('user_')
u1 = u1.apply(lambda counts: counts / counts.sum(), axis=1)

In [4]:
# Numeric score assigned to each rating label found in record.rating.
ratelut = {
    'dislike': 1.0,
    'neutral': 2.0,
    'like': 3.0,
    'love': 4.0,
}

# Adds a float `score` column to `record` (a label missing from ratelut raises
# KeyError inside the lookup).
record['score'] = record.rating.map(lambda label: ratelut[label]).astype(float)

# Mean and standard deviation of the scores per user, exposed as the columns
# user_mean / user_std and indexed by user_id.
u2 = (
    record.groupby('user_id')['score']
    .agg(['mean', 'std'])
    .add_prefix('user_')
)

In [22]:
# Keep only the rows of train_full whose cv value is 1.
# NOTE(review): when the notebook is run top to bottom, `train` is reassigned
# to the cv == 3 rows in a later cell, so this selection only feeds the cell
# that immediately follows.
train = train_full.loc[train_full['cv'] == 1]

In [10]:
# Per-user distribution of the `rating` values in the current `train` frame:
# counts per (user_id, rating), "user_"-prefixed columns, rows scaled to sum to 1.
u3 = pd.crosstab(train.user_id, train.rating).add_prefix('user_')
u3 = u3.apply(lambda counts: counts / counts.sum(), axis=1)

In [5]:
# Per-work rating profile from `record`: counts per (work_id, rating label),
# "work_"-prefixed columns, each row scaled to sum to 1.
i1 = pd.crosstab(record.work_id, record.rating).add_prefix('work_')
i1 = i1.apply(lambda counts: counts / counts.sum(), axis=1)

# Mean and standard deviation of record['score'] per work, exposed as
# item_mean / item_std.  Needs the `score` column that an earlier cell adds
# to `record`.
i2 = (
    record.groupby('work_id')['score']
    .agg(['mean', 'std'])
    .add_prefix('item_')
)

In [16]:
# Split train_full on its cv column: rows with cv == 3 become `train`, rows
# with cv equal to 1 or 2 become `valid`.
train = train_full.loc[train_full['cv'] == 3]
valid = train_full.loc[train_full['cv'].isin([1, 2])]

# Rating distributions computed from the `train` rows only, per user (u3,
# "user_" prefix) and per work (i3, "item_" prefix); every row is scaled to
# sum to 1.
# NOTE(review): these tables are built from train.rating, and later cells join
# them back onto the same `train` rows while `rating` is also the label given
# to LightGBM, so each training row's features include its own label.  That
# looks like target leakage; consider out-of-fold / leave-one-out statistics
# for the training rows.
u3 = pd.crosstab(train.user_id, train.rating).add_prefix('user_')
u3 = u3.apply(lambda counts: counts / counts.sum(), axis=1)
i3 = pd.crosstab(train.work_id, train.rating).add_prefix('item_')
i3 = i3.apply(lambda counts: counts / counts.sum(), axis=1)

In [17]:
# Left-join every feature table onto `train`: the user-indexed tables
# (u1, u2, u3) are matched on user_id first, then the work-indexed tables
# (i1, i2, i3) on work_id.  A left join keeps every row of `train`; keys
# missing from a feature table produce NaN in that table's columns.
for feature_key, feature_tables in (('user_id', (u1, u2, u3)),
                                    ('work_id', (i1, i2, i3))):
    for feature_table in feature_tables:
        train = train.merge(feature_table, left_on=feature_key,
                            right_index=True, how='left')

In [18]:
# Remove the join keys and the fold marker, then write what is left to
# train_3.csv inside the data directory (row index not written).
train = train.drop(labels=['user_id', 'work_id', 'cv'], axis='columns')
train_csv_path = data + "train_3.csv"
train.to_csv(train_csv_path, index=False)

In [19]:
# Left-join the same feature tables onto `valid`, in the same order as for
# `train`: user-indexed tables (u1, u2, u3) on user_id, then work-indexed
# tables (i1, i2, i3) on work_id.  Keys absent from a table yield NaN.
for feature_key, feature_tables in (('user_id', (u1, u2, u3)),
                                    ('work_id', (i1, i2, i3))):
    for feature_table in feature_tables:
        valid = valid.merge(feature_table, left_on=feature_key,
                            right_index=True, how='left')

# Remove the join keys and the fold marker once the joins are done.
valid = valid.drop(labels=['user_id', 'work_id', 'cv'], axis='columns')

In [20]:
# Write the validation frame to valid_3.csv in the data directory, without
# the row index.
valid_csv_path = data + "valid_3.csv"
valid.to_csv(valid_csv_path, index=False)

In [34]:
import lightgbm as lgb
from sklearn.metrics import f1_score, roc_auc_score

In [77]:
# LightGBM datasets: every column except `rating` is a feature and `rating`
# is the label.  V is created with reference=X so it is tied to the training
# dataset.
X = lgb.Dataset(train.drop(labels='rating', axis='columns'), label=train['rating'])
V = lgb.Dataset(valid.drop(labels='rating', axis='columns'), label=valid['rating'],
                reference=X)

# Training configuration: gradient-boosted trees with a binary objective,
# evaluated with AUC.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'auc'},
    num_leaves=20,
    min_data_in_leaf=800,
    feature_fraction=0.5,
    bagging_fraction=1,
    verbose=1,
)

In [78]:
# Train the booster while evaluating on both the training data (X) and the
# validation data (V).  Early stopping (patience: 20 rounds) is requested
# through the lgb.early_stopping callback rather than the
# `early_stopping_rounds=` keyword: that keyword was removed from lgb.train
# in LightGBM 4.0, whereas the callback is accepted by old and new releases.
gbdt = lgb.train(
    params,
    X,
    valid_sets=[X, V],
    callbacks=[lgb.early_stopping(20)],
)

[1]	training's auc: 0.861324	valid_1's auc: 0.685332
Training until validation scores don't improve for 20 rounds.
[2]	training's auc: 0.900989	valid_1's auc: 0.694957
[3]	training's auc: 0.909735	valid_1's auc: 0.727588
[4]	training's auc: 0.908671	valid_1's auc: 0.746335
[5]	training's auc: 0.919939	valid_1's auc: 0.757441
[6]	training's auc: 0.928101	valid_1's auc: 0.761275
[7]	training's auc: 0.928261	valid_1's auc: 0.765559
[8]	training's auc: 0.930713	valid_1's auc: 0.768853
[9]	training's auc: 0.938168	valid_1's auc: 0.769829
[10]	training's auc: 0.941206	valid_1's auc: 0.767945
[11]	training's auc: 0.94848	valid_1's auc: 0.764205
[12]	training's auc: 0.949029	valid_1's auc: 0.765623
[13]	training's auc: 0.949092	valid_1's auc: 0.767825
[14]	training's auc: 0.951325	valid_1's auc: 0.763372
[15]	training's auc: 0.954505	valid_1's auc: 0.757294
[16]	training's auc: 0.955919	valid_1's auc: 0.754244
[17]	training's auc: 0.955879	valid_1's auc: 0.755022
[18]	training's auc: 0.957224	