In [31]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np

# Directory prefix for the train/validation CSV splits read by the modelling cells.
data = "/mnt/d/Data/mangaki-data-challenge/latest/"
import gc
# Explicitly switch on automatic garbage collection for this kernel session.
gc.enable()

In [39]:
# LightGBM settings for the random-forest model ('rf' boosting, binary objective).
# `training` reads this module-level name when it calls lgb.train.
params = dict(
    task='train',
    boosting_type='rf',
    objective='binary',
    num_leaves=550,
    min_data_in_leaf=6,
    feature_fraction=0.55,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l1=0,
    min_gain_to_split=0,
    verbose=0,
)

def training(train, valid):
    """Fit a LightGBM model on `train` and return its predictions for `valid`.

    Both frames must carry the columns 'user_id', 'work_id' and 'rating';
    'rating' is the label and the three columns are removed from the feature
    matrix. The model is configured by the module-level `params` and trained
    for 116 boosting rounds.
    """
    non_features = ['user_id', 'work_id', 'rating']
    train_features = train.drop(non_features, axis=1)
    train_set = lgb.Dataset(train_features, train['rating'])
    booster = lgb.train(params, train_set, num_boost_round=116)
    valid_features = valid.drop(non_features, axis=1)
    return booster.predict(valid_features)

# Fit the random forest on split 1 and score every row of its validation fold.
t = pd.read_csv(data+'train_1.csv')
v = pd.read_csv(data+'valid_1.csv')
# Cast to pandas 'category' dtype so the column reaches LightGBM as a categorical.
t['item_category'] = t.item_category.astype('category')
v['item_category'] = v.item_category.astype('category')
pv = training(t, v)

# Keep the row keys and the true label next to the model output.
f = v[['user_id', 'work_id', 'rating']].copy()
f['rf_prob'] = pv
# Per-row binary log loss. The probabilities are clipped for the loss only:
# an exact 0 or 1 would otherwise produce 0 * log(0) = 0 * -inf = NaN (or an
# infinite loss), which would poison the exported analysis table.
pv_clipped = np.clip(pv, 1e-15, 1 - 1e-15)
f['rf_loss'] = -f.rating*np.log(pv_clipped)-(1.0-f.rating)*np.log(1-pv_clipped)
rf = f
    

In [40]:
# LightGBM settings for the gradient-boosted trees model ('gbdt' boosting,
# binary objective). Rebinds the module-level `params` that `training` reads.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    num_leaves=40,
    min_data_in_leaf=90,
    feature_fraction=0.6,
    bagging_fraction=0.96,
    bagging_freq=1,
    lambda_l1=0,
    verbose=0,
)

def training(train, valid):
    """Fit a LightGBM model on `train` and return its predictions for `valid`.

    The identifier columns 'user_id' and 'work_id' and the label column
    'rating' are excluded from the features; 'rating' of `train` is the target.
    Uses the module-level `params` and runs 119 boosting rounds.
    """
    excluded = ['user_id', 'work_id', 'rating']
    dataset = lgb.Dataset(train.drop(excluded, axis=1), train['rating'])
    model = lgb.train(params, dataset, num_boost_round=119)
    valid_features = valid.drop(excluded, axis=1)
    valid_pred = model.predict(valid_features)
    return valid_pred

# Fit the gradient-boosted model on split 1 and score its validation fold.
t = pd.read_csv(data+'train_1.csv')
v = pd.read_csv(data+'valid_1.csv')
# Cast to pandas 'category' dtype so the column reaches LightGBM as a categorical.
t['item_category'] = t.item_category.astype('category')
v['item_category'] = v.item_category.astype('category')
pv = training(t, v)

# Keep the row keys and the true label next to the model output.
f = v[['user_id', 'work_id', 'rating']].copy()
f['gbdt_prob'] = pv
# Per-row binary log loss. The probabilities are clipped for the loss only:
# an exact 0 or 1 would otherwise produce 0 * log(0) = 0 * -inf = NaN (or an
# infinite loss), which would poison the exported analysis table.
pv_clipped = np.clip(pv, 1e-15, 1 - 1e-15)
f['gbdt_loss'] = -f.rating*np.log(pv_clipped)-(1.0-f.rating)*np.log(1-pv_clipped)
gbdt = f

In [41]:
from scipy.sparse import hstack
from fastFM import sgd
from sklearn.datasets import load_svmlight_file

# Directory prefix for the svmlight-formatted splits loaded for the FM model.
libfmpath = "/mnt/d/Data/mangaki-data-challenge/libfm/"

# Keyword arguments that `train_fm` unpacks into sgd.FMClassification.
params = dict(
    n_iter=340000,
    init_stdev=0.01,
    l2_reg_w=0.00025,
    l2_reg_V=0.5,
    rank=1,
    step_size=0.02,
)

def train_fm(train, valid, trainy):
    """Fit a fastFM SGD factorization-machine classifier and score `valid`.

    Parameters
    ----------
    train : feature matrix the classifier is fitted on.
    valid : feature matrix passed to ``predict_proba``.
    trainy : numeric array of training labels. It is remapped with
        ``trainy*2-1`` before fitting, which turns 0/1 labels into -1/+1
        (assumes the incoming labels are 0/1 -- TODO confirm against the
        svmlight files).

    Returns
    -------
    The result of ``predict_proba(valid)`` on the fitted model, configured by
    the module-level `params`.
    """
    fm = sgd.FMClassification(**params)
    # `np.int` was only an alias of the builtin `int`; it was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError. The
    # builtin gives exactly the same dtype on every NumPy version.
    labels = np.require(trainy*2-1, dtype=int)
    fm.fit(train, labels)
    return fm.predict_proba(valid)

train, ty = load_svmlight_file(libfmpath+"train_1.csv")
# Each svmlight file infers its own column count from the largest feature index
# it contains, so two files loaded independently can end up with different
# widths. Pin the validation matrix to the training width so the fitted model
# can always score it (a wider validation file now fails loudly here).
valid, _ = load_svmlight_file(libfmpath+"valid_1.csv", n_features=train.shape[1])
pv = train_fm(train, valid, ty)

# Row keys and true labels come from the CSV version of the same split;
# assumes its rows are in the same order as the svmlight file -- TODO confirm.
v = pd.read_csv(data+'valid_1.csv')
f = v[['user_id', 'work_id', 'rating']].copy()
f['fm_prob'] = pv
# Per-row binary log loss. The probabilities are clipped for the loss only:
# an exact 0 or 1 would otherwise produce 0 * log(0) = 0 * -inf = NaN (or an
# infinite loss), which would poison the exported analysis table.
pv_clipped = np.clip(pv, 1e-15, 1 - 1e-15)
f['fm_loss'] = -f.rating*np.log(pv_clipped)-(1.0-f.rating)*np.log(1-pv_clipped)
fm = f

In [46]:
# Line up the three per-model score tables on (user_id, work_id), keeping a
# single copy of the true rating (the one from `gbdt`), and export the result.
(
    gbdt
    .merge(rf.drop('rating', axis=1), on=['user_id', 'work_id'])
    .merge(fm.drop('rating', axis=1), on=['user_id', 'work_id'])
    .to_csv('/mnt/d/Data/mangaki-data-challenge/analysis/analysis_full.csv', index=False)
)

In [22]:
# prepare metadata

# NOTE: this rebinds `data`; the modelling cells earlier in the file set it to
# the 'latest/' subfolder, here it is the challenge root directory.
data = "/mnt/d/Data/mangaki-data-challenge/"

# One row per (user, work) rating; ids are stored compactly as int16.
record = pd.read_csv(
    data + 'watched.csv',
    dtype=dict(user_id=np.int16, work_id=np.int16, rating='category'),
)

# Title and category of each work.
title = pd.read_csv(
    data + 'titles.csv',
    dtype=dict(work_id=np.int16, title=str, category=str),
)

# Attach title/category to every rating (inner join on work_id).
record = pd.merge(record, title, on='work_id')

# Per-user counts of each rating value, tabulated separately per work category.
anime_rows = record[record.category == 'anime']
manga_rows = record[record.category == 'manga']
u1 = pd.crosstab(anime_rows.user_id, anime_rows.rating).add_prefix('user_anime_')
u2 = pd.crosstab(manga_rows.user_id, manga_rows.rating).add_prefix('user_manga_')
# Drop the temporaries so the notebook namespace only keeps u1 and u2.
del anime_rows, manga_rows

In [23]:
# Number of rated works per user, as a one-column frame indexed by user_id.
# (Dict-renaming via `.agg({'name': 'count'})` on a grouped Series was
# deprecated in pandas 0.20 and raises SpecificationError since pandas 1.0;
# `.count().to_frame(name)` builds the same frame on every version.)
u3 = record.groupby(by='user_id')['work_id'].count().to_frame('work_count')
# Per-work counts of each rating value.
i1 = pd.crosstab(record.work_id, record.rating).add_prefix('work_')
# Number of users who rated each work, as a one-column frame indexed by work_id.
i2 = record.groupby(by='work_id')['user_id'].count().to_frame('user_count')

In [27]:
# Left-join the per-user tables (u1, u2, u3) on user_id and the per-work tables
# (i1, i2) on work_id; each of them is keyed by its index.
record = (
    record
    .merge(u1, left_on='user_id', right_index=True, how='left')
    .merge(u2, left_on='user_id', right_index=True, how='left')
    .merge(u3, left_on='user_id', right_index=True, how='left')
    .merge(i1, left_on='work_id', right_index=True, how='left')
    .merge(i2, left_on='work_id', right_index=True, how='left')
)

In [29]:
# Save the enriched rating table (without the row index) under <data>/analysis/.
record.to_csv(data+"analysis/record.csv", index=False)

In [38]:
# Show a bounded preview instead of rendering the whole frame.
record.head(10)

Unnamed: 0,user_id,work_id,rating,title,category,user_anime_dislike,user_anime_like,user_anime_love,user_anime_neutral,user_manga_dislike,user_manga_like,user_manga_love,user_manga_neutral,work_count,work_dislike,work_like,work_love,work_neutral,user_count
0,717,8025,dislike,Fairy Tail,anime,2.0,10.0,17.0,16.0,,,,,45,226,429,43,242,940
1,1222,8025,dislike,Fairy Tail,anime,9.0,44.0,0.0,12.0,,,,,65,226,429,43,242,940
2,40,8025,neutral,Fairy Tail,anime,3.0,40.0,64.0,14.0,0.0,1.0,8.0,0.0,130,226,429,43,242,940
3,1563,8025,dislike,Fairy Tail,anime,26.0,319.0,0.0,71.0,13.0,83.0,0.0,20.0,532,226,429,43,242,940
4,25,8025,like,Fairy Tail,anime,0.0,19.0,0.0,0.0,0.0,9.0,0.0,0.0,28,226,429,43,242,940
5,248,8025,like,Fairy Tail,anime,28.0,36.0,0.0,10.0,0.0,3.0,0.0,0.0,77,226,429,43,242,940
6,1967,8025,like,Fairy Tail,anime,2.0,23.0,0.0,1.0,2.0,12.0,0.0,0.0,40,226,429,43,242,940
7,992,8025,like,Fairy Tail,anime,2.0,40.0,0.0,4.0,0.0,3.0,0.0,0.0,49,226,429,43,242,940
8,1545,8025,neutral,Fairy Tail,anime,0.0,78.0,0.0,4.0,0.0,4.0,0.0,0.0,86,226,429,43,242,940
9,1048,8025,neutral,Fairy Tail,anime,21.0,101.0,15.0,61.0,31.0,122.0,4.0,31.0,386,226,429,43,242,940
