In [1]:
import warnings
warnings.simplefilter('ignore')

import gc
from tqdm import tqdm
tqdm.pandas()

import numpy as np
import pandas as pd
# Show every column when a frame is displayed. The fully qualified key is
# required: the short pattern 'max_columns' is ambiguous on pandas versions
# that also define 'styler.render.max_columns', and set_option then raises.
pd.set_option('display.max_columns', None)

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, label_binarize
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold, train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score

from deepctr.models import xDeepFM
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

In [2]:
# Load the raw tables and merge them into a single frame

# Book / user metadata files are read as ISO-8859-1.
df_book = pd.read_csv('raw_data/book.csv', encoding="ISO-8859-1")
df_user = pd.read_csv('raw_data/user.csv', encoding="ISO-8859-1")

# Interaction tables.
df_train = pd.read_csv('raw_data/train.csv')
df_test = pd.read_csv('raw_data/test.csv')

# Stack train on top of the test keys (only User-ID / ISBN are taken from
# test), then left-join the book and user attributes onto every row.
df = (
    pd.concat([df_train, df_test[['User-ID', 'ISBN']]])
    .merge(df_book, on='ISBN', how='left')
    .merge(df_user, on='User-ID', how='left')
)

# Per-user rating target-encoding features. Users that appear only in the
# test set have no ratings, so their values are filled with the column mean.
# An author-based rating target encoding also exists, but it causes leakage;
# use it with caution.
train_user = set(df_train['User-ID'].unique().tolist())
test_user = set(df_test['User-ID'].unique().tolist())

cols = [f'user_book_rating_{method}' for method in ['mean', 'max', 'min', 'std']]

# Aggregate once per user instead of assigning transformed columns onto a
# filtered slice of df (which triggers SettingWithCopy and computed an
# unused 'skew' column). NaN ratings of test rows are skipped by the
# aggregations, so the statistics only reflect known ratings.
user_stats = (
    df.loc[df['User-ID'].isin(train_user)]
    .groupby('User-ID')['Book-Rating']
    .agg(['mean', 'max', 'min', 'std'])
)
user_stats.columns = cols
tmp = user_stats.reset_index()  # one row per user: User-ID + the four stats

df = pd.merge(df, tmp, on='User-ID', how='left')

# Rows whose user has no statistic (unseen user, or std of a single rating)
# fall back to the mean of the column.
for col in cols:
    df[col] = df[col].fillna(df[col].mean())

# Age cleanup: ages above 90 or below 5 are treated as missing
df['Age'] = df['Age'].mask((df['Age'] > 90) | (df['Age'] < 5))

# Split Location into city and country
# Number of comma-separated fields in each Location string.
df['Location_split_len'] = df['Location'].map(lambda loc: len(loc.split(',')))

def clean_location(x):
    """Collapse a comma-separated location string to at most three fields.

    Strings with three or fewer comma-separated fields are returned as-is.
    Longer strings keep their first and last field and have all middle
    fields joined with "_" into a single middle field.

    Args:
        x: location string, fields separated by ",".

    Returns:
        The original string, or "first,middle_joined,last".
    """
    parts = x.split(',')
    if len(parts) <= 3:
        return x
    first, *middle, last = parts
    return ",".join([first, "_".join(middle), last])
    
# Normalise every location to at most three comma-separated fields.
df['Location_clean'] = df['Location'].apply(clean_location)
# City = everything except the last field, joined with "_".
df['City'] = df['Location_clean'].apply(lambda x: "_".join(x.split(',')[0:-1]))
# Country = the last field. Indexing with [-1] instead of [2] returns the
# same value for three-field strings and no longer raises IndexError when a
# location has fewer than three fields.
df['Country'] = df['Location_clean'].apply(lambda x: x.split(',')[-1])

# Drop the intermediate location columns.
del df['Location'], df['Location_clean'], df['Location_split_len']
gc.collect()

# Year-Of-Publication cleanup.
# Coerce anything non-numeric (e.g. the stray 'Amit Chaudhuri' value) or
# missing to 0 rather than special-casing one known bad string, so the
# integer cast cannot fail on other malformed entries.
year = pd.to_numeric(df['Year-Of-Publication'], errors='coerce').fillna(0)
# Years later than 2020 are capped at 2020.
df['Year-Of-Publication'] = year.astype('int').clip(upper=2020)

# Per-user count features
# Number of non-null book titles for each user.
df['book_counts'] = df.groupby(['User-ID'])['Book-Title'].transform('count')

# Number of rows sharing the same (user, attribute value) pair.
pair_counts = [
    ('Book-Author', 'author_counts'),
    ('Publisher', 'publisher_counts'),
    ('Year-Of-Publication', 'year_counts'),
]
for key, out_col in pair_counts:
    df[out_col] = df.groupby(['User-ID', key])[key].transform('count')


# Categorical features: label encoding and frequency encoding
def freq_enc(df, col):
    """Add a relative-frequency column for `col`.

    The new column `{col}_freq` holds, for each row, the share of non-null
    rows that carry the same value in `col`. The frame is modified in place
    and also returned.

    Args:
        df: DataFrame containing `col`.
        col: name of the column to encode.

    Returns:
        The same DataFrame with the extra `{col}_freq` column.
    """
    shares = df[col].value_counts(dropna=True, normalize=True)
    lookup = shares.to_dict()
    df[f'{col}_freq'] = df[col].map(lookup)
    return df

# Columns that are turned into integer ids.
categorical_cols = ['ISBN', 'User-ID', 'Book-Title', 'Book-Author', 'Publisher', 'City', 'Country']

# Label-encode each column; values are cast to str first, so missing values
# become the literal string 'nan' and get their own id.
for f in tqdm(categorical_cols):
    lbl = LabelEncoder()
    encoded = lbl.fit_transform(df[f].astype('str'))
    df[f] = encoded

# Frequency-encode everything except the two key columns (ISBN, User-ID).
for f in tqdm(categorical_cols[2:]):
    df = freq_enc(df, f)

# Feature-based target encoding
def stat(df, df_merge, group_by, agg):
    """Left-join grouped statistics of `df` onto `df_merge`.

    `df` is grouped by `group_by` and aggregated according to `agg`; the
    resulting columns are named `<group keys joined by _>_<column>_<method>`
    and merged onto `df_merge` using the group keys.

    Args:
        df: frame the statistics are computed from.
        df_merge: frame that receives the statistics.
        group_by: list of key column names.
        agg: mapping of column name -> list of aggregation method names.

    Returns:
        A new frame: `df_merge` with the statistic columns appended.
    """
    prefix = '_'.join(group_by)
    flat_names = [
        f'{prefix}_{on}_{method}'
        for on, methods in agg.items()
        for method in methods
    ]

    group = df.groupby(group_by).agg(agg)
    group.columns = flat_names
    group = group.reset_index()

    merged = df_merge.merge(group, on=group_by, how='left')

    # Release the intermediate table before returning.
    del group
    gc.collect()
    return merged
    

def statis_feat(df_know, df_unknow):
    """Attach Book-Rating statistics computed on `df_know` to `df_unknow`.

    For each key column below, the ratings in `df_know` are grouped by that
    key and the listed aggregations are joined onto `df_unknow` via `stat`.

    Args:
        df_know: frame whose Book-Rating values are used for the statistics.
        df_unknow: frame that receives the statistic columns.

    Returns:
        `df_unknow` extended with the statistic columns.
    """
    all_four = ['mean', 'std', 'max', 'min']
    plan = [
        ('book_counts', all_four),
        ('Age', ['mean', 'std', 'max']),
        ('Book-Title', all_four),
        ('Book-Author', all_four),
        ('Publisher', all_four),
    ]
    for key, methods in plan:
        df_unknow = stat(df_know, df_unknow, [key], {'Book-Rating': methods})
    return df_unknow
    
    
# Split back into labelled (train) and unlabelled (test) rows.
df_train = df[~df['Book-Rating'].isnull()]
df_train = df_train.reset_index(drop=True)
df_test = df[df['Book-Rating'].isnull()]

# Out-of-fold statistics for the training rows: each validation fold gets
# rating statistics computed only from the other four folds.
kf = KFold(n_splits=5, random_state=2020, shuffle=True)
fold_frames = []
for train_index, val_index in kf.split(df_train):
    fold_frames.append(
        statis_feat(df_train.iloc[train_index], df_train.iloc[val_index])
    )
# Concatenate once, in fold order, instead of growing a frame from None
# inside the loop.
df_stas_feat = pd.concat(fold_frames, axis=0)

# Test rows use statistics computed from all training rows.
df_test = statis_feat(df_train, df_test)

# ignore_index gives the combined frame unique row labels; every partial
# frame restarts at 0, so the labels would otherwise be duplicated.
df = pd.concat([df_stas_feat, df_test], axis=0, ignore_index=True)

del df_stas_feat, fold_frames, df_train, df_test
gc.collect()


df.head()

100%|██████████| 7/7 [00:03<00:00,  1.79it/s]
100%|██████████| 5/5 [00:00<00:00,  6.61it/s]


Unnamed: 0,Book-Rating,ISBN,User-ID,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age,user_book_rating_mean,user_book_rating_max,user_book_rating_min,user_book_rating_std,City,Country,book_counts,author_counts,publisher_counts,year_counts,Book-Title_freq,Book-Author_freq,Publisher_freq,City_freq,Country_freq,book_counts_Book-Rating_mean,book_counts_Book-Rating_std,book_counts_Book-Rating_max,book_counts_Book-Rating_min,Age_Book-Rating_mean,Age_Book-Rating_std,Age_Book-Rating_max,Book-Title_Book-Rating_mean,Book-Title_Book-Rating_std,Book-Title_Book-Rating_max,Book-Title_Book-Rating_min,Book-Author_Book-Rating_mean,Book-Author_Book-Rating_std,Book-Author_Book-Rating_max,Book-Author_Book-Rating_min,Publisher_Book-Rating_mean,Publisher_Book-Rating_std,Publisher_Book-Rating_max,Publisher_Book-Rating_min
0,5.0,197506,34258,190476,85141,2004,6776,,2.615385,7.0,0.0,2.987152,10348,24,17,1.0,1.0,1,2e-06,2e-06,2.1e-05,3.5e-05,0.089821,3.663803,3.92193,10.0,0.0,,,,0.0,,0.0,0.0,0.0,,0.0,0.0,2.769231,4.342692,10.0,0.0
1,6.0,224166,34258,118062,79650,1999,7675,,2.615385,7.0,0.0,2.987152,10348,24,17,1.0,1.0,4,3e-06,1e-05,0.00014,3.5e-05,0.089821,3.663803,3.92193,10.0,0.0,,,,0.0,0.0,0.0,0.0,2.857143,3.670993,8.0,0.0,1.73494,3.127706,10.0,0.0
2,0.0,209973,5961,226143,77609,2004,14099,,3.333333,5.0,0.0,2.886751,6385,168,4,1.0,1.0,1,0.001906,0.001906,0.001906,6e-06,0.732169,4.318747,3.906846,10.0,0.0,,,,0.798068,1.941304,10.0,0.0,0.798068,1.941304,10.0,0.0,0.798068,1.941304,10.0,0.0
3,0.0,35869,10303,227246,81734,2001,13303,,3.0,7.0,0.0,3.34664,1948,168,7,1.0,1.0,1,0.00018,0.001027,0.006033,0.001053,0.732169,4.046679,3.927762,10.0,0.0,,,,2.084906,3.434053,10.0,0.0,2.290216,3.632223,10.0,0.0,2.616564,3.798252,10.0,0.0
4,0.0,246320,17609,46627,25492,2001,5708,,2.333333,7.0,0.0,4.041452,3190,57,4,1.0,3.0,1,2e-06,1.4e-05,0.002693,1.1e-05,0.023979,4.318747,3.906846,10.0,0.0,,,,0.0,,0.0,0.0,2.333333,3.741657,10.0,0.0,3.22438,3.826484,10.0,0.0


In [3]:
# Label column.
target = ['Book-Rating']

# Integer-encoded categorical inputs.
sparse_features = ['ISBN', 'User-ID', 'Book-Title', 'Book-Author', 'Publisher', 'City', 'Country']

# Numeric inputs; the order of this list is kept exactly as it was.
dense_features = [
    # raw attributes
    'Year-Of-Publication', 'Age',
    # per-user rating statistics
    'user_book_rating_mean', 'user_book_rating_max',
    'user_book_rating_min', 'user_book_rating_std',
    # per-user counts
    'book_counts', 'author_counts', 'publisher_counts', 'year_counts',
    # frequency encodings
    'Book-Title_freq', 'Book-Author_freq', 'Publisher_freq',
    'City_freq', 'Country_freq',
    # rating statistics grouped by book_counts
    'book_counts_Book-Rating_mean', 'book_counts_Book-Rating_std',
    'book_counts_Book-Rating_max', 'book_counts_Book-Rating_min',
    # rating statistics grouped by Age
    'Age_Book-Rating_mean', 'Age_Book-Rating_std', 'Age_Book-Rating_max',
    # rating statistics grouped by Book-Title
    'Book-Title_Book-Rating_mean', 'Book-Title_Book-Rating_std',
    'Book-Title_Book-Rating_max', 'Book-Title_Book-Rating_min',
    # rating statistics grouped by Book-Author
    'Book-Author_Book-Rating_mean', 'Book-Author_Book-Rating_std',
    'Book-Author_Book-Rating_max', 'Book-Author_Book-Rating_min',
    # rating statistics grouped by Publisher
    'Publisher_Book-Rating_mean', 'Publisher_Book-Rating_std',
    'Publisher_Book-Rating_max', 'Publisher_Book-Rating_min',
]

# Missing categorical values become '-1'.
# NOTE(review): the label-encoding step casts to str first, so these columns
# presumably contain no NaN at this point - confirm.
df[sparse_features] = df[sparse_features].fillna('-1')

# Missing numeric values become 0, then every dense column is rescaled to
# [0, 1]. The scaler is fitted on all rows of df (train and test together).
mms = MinMaxScaler(feature_range=(0, 1))
df[dense_features] = mms.fit_transform(df[dense_features].fillna(0))

df.head()

Unnamed: 0,Book-Rating,ISBN,User-ID,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age,user_book_rating_mean,user_book_rating_max,user_book_rating_min,user_book_rating_std,City,Country,book_counts,author_counts,publisher_counts,year_counts,Book-Title_freq,Book-Author_freq,Publisher_freq,City_freq,Country_freq,book_counts_Book-Rating_mean,book_counts_Book-Rating_std,book_counts_Book-Rating_max,book_counts_Book-Rating_min,Age_Book-Rating_mean,Age_Book-Rating_std,Age_Book-Rating_max,Book-Title_Book-Rating_mean,Book-Title_Book-Rating_std,Book-Title_Book-Rating_max,Book-Title_Book-Rating_min,Book-Author_Book-Rating_mean,Book-Author_Book-Rating_std,Book-Author_Book-Rating_max,Book-Author_Book-Rating_min,Publisher_Book-Rating_mean,Publisher_Book-Rating_std,Publisher_Book-Rating_max,Publisher_Book-Rating_min
0,5.0,197506,34258,190476,85141,0.992079,6776,0.0,0.261538,0.7,0.0,0.422447,10348,24,0.001436,0.003636,0.001287,0.0,0.000556,0.00011,0.000601,0.002286,0.122677,0.408259,0.782534,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.276923,0.614149,1.0,0.0
1,6.0,224166,34258,118062,79650,0.989604,7675,0.0,0.261538,0.7,0.0,0.422447,10348,24,0.001436,0.003636,0.001287,0.00159,0.001112,0.000878,0.004142,0.002286,0.122677,0.408259,0.782534,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.519157,0.8,0.0,0.173494,0.442324,1.0,0.0
2,0.0,209973,5961,226143,77609,0.992079,14099,0.0,0.333333,0.5,0.0,0.408248,6385,168,0.000269,0.003636,0.001287,0.0,1.0,0.197366,0.056852,0.000357,1.0,0.48124,0.779525,1.0,0.0,0.0,0.0,0.0,0.079807,0.274542,1.0,0.0,0.079807,0.274542,1.0,0.0,0.079807,0.274542,1.0,0.0
3,0.0,35869,10303,227246,81734,0.990594,13303,0.0,0.3,0.7,0.0,0.473286,1948,168,0.000538,0.003636,0.001287,0.0,0.093993,0.106367,0.180073,0.070924,1.0,0.450924,0.783698,1.0,0.0,0.0,0.0,0.0,0.208491,0.485648,1.0,0.0,0.229022,0.513674,1.0,0.0,0.261656,0.537154,1.0,0.0
4,0.0,246320,17609,46627,25492,0.990594,5708,0.0,0.233333,0.7,0.0,0.571548,3190,57,0.000269,0.003636,0.003861,0.0,0.000556,0.001317,0.080345,0.000643,0.03275,0.48124,0.779525,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233333,0.52915,1.0,0.0,0.322438,0.541147,1.0,0.0


In [4]:
# Training rows are the ones that carry a rating.
has_label = df['Book-Rating'].notna()
df_train = df.loc[has_label]
df_train.head()

Unnamed: 0,Book-Rating,ISBN,User-ID,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age,user_book_rating_mean,user_book_rating_max,user_book_rating_min,user_book_rating_std,City,Country,book_counts,author_counts,publisher_counts,year_counts,Book-Title_freq,Book-Author_freq,Publisher_freq,City_freq,Country_freq,book_counts_Book-Rating_mean,book_counts_Book-Rating_std,book_counts_Book-Rating_max,book_counts_Book-Rating_min,Age_Book-Rating_mean,Age_Book-Rating_std,Age_Book-Rating_max,Book-Title_Book-Rating_mean,Book-Title_Book-Rating_std,Book-Title_Book-Rating_max,Book-Title_Book-Rating_min,Book-Author_Book-Rating_mean,Book-Author_Book-Rating_std,Book-Author_Book-Rating_max,Book-Author_Book-Rating_min,Publisher_Book-Rating_mean,Publisher_Book-Rating_std,Publisher_Book-Rating_max,Publisher_Book-Rating_min
0,5.0,197506,34258,190476,85141,0.992079,6776,0.0,0.261538,0.7,0.0,0.422447,10348,24,0.001436,0.003636,0.001287,0.0,0.000556,0.00011,0.000601,0.002286,0.122677,0.408259,0.782534,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.276923,0.614149,1.0,0.0
1,6.0,224166,34258,118062,79650,0.989604,7675,0.0,0.261538,0.7,0.0,0.422447,10348,24,0.001436,0.003636,0.001287,0.00159,0.001112,0.000878,0.004142,0.002286,0.122677,0.408259,0.782534,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.519157,0.8,0.0,0.173494,0.442324,1.0,0.0
2,0.0,209973,5961,226143,77609,0.992079,14099,0.0,0.333333,0.5,0.0,0.408248,6385,168,0.000269,0.003636,0.001287,0.0,1.0,0.197366,0.056852,0.000357,1.0,0.48124,0.779525,1.0,0.0,0.0,0.0,0.0,0.079807,0.274542,1.0,0.0,0.079807,0.274542,1.0,0.0,0.079807,0.274542,1.0,0.0
3,0.0,35869,10303,227246,81734,0.990594,13303,0.0,0.3,0.7,0.0,0.473286,1948,168,0.000538,0.003636,0.001287,0.0,0.093993,0.106367,0.180073,0.070924,1.0,0.450924,0.783698,1.0,0.0,0.0,0.0,0.0,0.208491,0.485648,1.0,0.0,0.229022,0.513674,1.0,0.0,0.261656,0.537154,1.0,0.0
4,0.0,246320,17609,46627,25492,0.990594,5708,0.0,0.233333,0.7,0.0,0.571548,3190,57,0.000269,0.003636,0.003861,0.0,0.000556,0.001317,0.080345,0.000643,0.03275,0.48124,0.779525,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233333,0.52915,1.0,0.0,0.322438,0.541147,1.0,0.0


In [5]:
# One embedding column per categorical feature. The vocabulary size is
# max id + 1 rather than the number of distinct ids: the two agree when the
# ids are contiguous from 0, but max + 1 stays valid if any id is absent,
# whereas nunique() would then be too small for the largest id.
sparse_columns = [
    SparseFeat(feat, vocabulary_size=int(df[feat].max()) + 1, embedding_dim=4)
    for feat in sparse_features
]
# One scalar input per numeric feature.
dense_columns = [DenseFeat(feat, 1) for feat in dense_features]
fixlen_feature_columns = sparse_columns + dense_columns

# The linear and the deep part of the model share the same columns.
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# Model input: feature name -> numpy array of the training rows.
train_model_input = {name: df_train[name].values for name in feature_names}

In [6]:
# Hyper-parameters handed to the xDeepFM constructor unchanged.
xdeepfm_params = dict(
    task='regression',
    dnn_hidden_units=(256, 256),
    cin_layer_size=(64, 64),
    cin_split_half=True,
    cin_activation='relu',
    l2_reg_linear=1e-5,
    l2_reg_embedding=1e-3,
    l2_reg_dnn=1e-4,
    l2_reg_cin=1e-3,
    seed=1024,
    dnn_dropout=0.5,
    dnn_activation='relu',
    dnn_use_bn=False,
)
model = xDeepFM(linear_feature_columns, dnn_feature_columns, **xdeepfm_params)

# Adam optimiser, mean absolute error as both the loss and the metric.
model.compile(optimizer="adam", loss="mae", metrics=['mae'])

In [7]:
# All three callbacks watch the validation MAE and treat lower as better.
watch_val_mae = dict(monitor='val_mae', mode='min', verbose=1)

# Write the weights to disk only when the validation MAE improves.
checkpoint = ModelCheckpoint('./xdeepfm_v3.h5',
                             save_best_only=True,
                             save_weights_only=True,
                             **watch_val_mae)

# Multiply the learning rate by 0.1 after 3 epochs without improvement.
reduce_lr = ReduceLROnPlateau(factor=0.1,
                              min_delta=1e-5,
                              cooldown=2,
                              min_lr=1e-6,
                              patience=3,
                              **watch_val_mae)

# Stop after 8 epochs without improvement.
early_stop = EarlyStopping(patience=8,
                           min_delta=1e-5,
                           **watch_val_mae)

# 20% of the training rows are held out for validation.
# NOTE(review): Keras presumably takes the trailing rows for
# validation_split - confirm for the installed version.
history = model.fit(
    x=train_model_input,
    y=df_train[target].values,
    batch_size=2048,
    epochs=10,
    verbose=2,
    validation_split=0.2,
    callbacks=[checkpoint, reduce_lr, early_stop],
)

Epoch 1/10

Epoch 00001: val_mae improved from inf to 2.03156, saving model to ./xdeepfm_v3.h5
289/289 - 14s - loss: 2.3570 - mae: 2.2869 - val_loss: 2.0795 - val_mae: 2.0316
Epoch 2/10

Epoch 00002: val_mae improved from 2.03156 to 2.02029, saving model to ./xdeepfm_v3.h5
289/289 - 8s - loss: 2.0882 - mae: 2.0356 - val_loss: 2.0674 - val_mae: 2.0203
Epoch 3/10

Epoch 00003: val_mae improved from 2.02029 to 2.00435, saving model to ./xdeepfm_v3.h5
289/289 - 8s - loss: 2.0554 - mae: 1.9993 - val_loss: 2.0586 - val_mae: 2.0043
Epoch 4/10

Epoch 00004: val_mae improved from 2.00435 to 2.00287, saving model to ./xdeepfm_v3.h5
289/289 - 8s - loss: 2.0360 - mae: 1.9667 - val_loss: 2.0745 - val_mae: 2.0029
Epoch 5/10

Epoch 00005: val_mae improved from 2.00287 to 2.00028, saving model to ./xdeepfm_v3.h5
289/289 - 8s - loss: 2.0135 - mae: 1.9186 - val_loss: 2.1016 - val_mae: 2.0003
Epoch 6/10

Epoch 00006: val_mae did not improve from 2.00028
289/289 - 8s - loss: 1.9606 - mae: 1.8417 - val_los

In [8]:
# Reload the weights file written by the ModelCheckpoint callback (same
# path, saved with save_best_only=True).
model.load_weights('./xdeepfm_v3.h5')

In [9]:
# Test rows are the ones without a rating.
df_test = df[df['Book-Rating'].isna()]
# Pass plain numpy arrays, matching how the training input is built, so the
# model receives the same input type in fit() and predict().
test_model_input = {name: df_test[name].values for name in feature_names}
pred_ans = model.predict(test_model_input, batch_size=2048)

In [10]:
# Round the predictions to whole ratings and clamp them to the 0..10 range
# in one vectorised step (replaces two row-wise .apply calls).
# NOTE(review): assumes the predictions are finite; NaN would not survive
# the integer cast.
ratings = np.clip(np.round(pred_ans[:, 0]), 0, 10).astype('int')
sub = pd.DataFrame({'ID': range(pred_ans.shape[0]), 'rating': ratings})
sub['rating'].value_counts()

0     140349
7      10581
6       9957
8       9674
5       7856
1       6521
9       5245
4       5217
2       5130
3       4734
10       971
Name: rating, dtype: int64

In [11]:
# Write the submission as bare "ID,rating" rows: no index column, no header.
sub.to_csv('deepctr_xdeepfm_v3.csv', index=False, header=False)