In [33]:
from collections import Counter
from itertools import combinations
from math import sqrt
import random
from keras.layers import Concatenate, Dense, Dot, Dropout, Embedding, Input, Reshape, LeakyReLU, Add
from keras.models import Model
from keras.callbacks import Callback, ModelCheckpoint
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import tensorflow


STUDENT_ID = '23846183'

random.seed(2019)
np.random.seed(2019)
tensorflow.set_random_seed(2019)


# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))


def build_deepwide_model(len_continuous, deep_vocab_lens, len_wide, embed_size):
    input_list = []
    continuous_input = Input(shape=(len_continuous,), dtype='float32', name='continuous_input')
    input_list.append(continuous_input)

    emb_list = []
    for vocab_size in deep_vocab_lens:
        _input = Input(shape=(1,), dtype='int32')
        input_list.append(_input)
        _emb = Embedding(output_dim=embed_size, input_dim=vocab_size, input_length=1)(_input)
        _emb = Reshape((embed_size,))(_emb)
        emb_list.append(_emb)
    
    wide_input = Input(shape=(len_wide,), dtype='float32')
    input_list.append(wide_input)
    
    deep_input = Concatenate()(emb_list + [continuous_input])
    
    
    dense_1 = Dense(256, activation='linear')(deep_input)
    dense_1 = LeakyReLU(alpha=0.001)(dense_1)
    #dense_1_dp = Dropout(0.01)(dense_1)
    
    dense_2 = Dense(128, activation='linear')(dense_1)
    dense_2 = Concatenate()([dense_2, dense_1])
    dense_2 = LeakyReLU(alpha=0.001)(dense_2)
    
    #dense_2_dp = Dropout(0.01)(dense_2)
    dense_3 = Dense(64, activation='linear')(dense_2)
    #dense_3_dp = Dropout(0.01)(dense_3)
    dense_2 = Concatenate()([dense_3, dense_2, dense_1])
    dense_3_dp = LeakyReLU(alpha=0.001)(dense_3)

    
    fc_input = Concatenate()([dense_1, dense_2, dense_3_dp, wide_input])
    
    fc_input = LeakyReLU(alpha=0.001)(fc_input)
    model_output = Dense(1)(fc_input)
    model = Model(inputs=input_list,
                  outputs=model_output)
    return model


def get_continuous_features(df, continuous_columns):
    continuous_features = df[continuous_columns].values
    return continuous_features


def get_top_k_p_combinations(df, comb_p, topk, output_freq=False):
    def get_category_combinations(categories_str, comb_p=2):
        categories = categories_str.split(', ')
        return list(combinations(categories, comb_p))
    all_categories_p_combos = df["item_categories"].apply(
        lambda x: get_category_combinations(x, comb_p)).values.tolist()
    all_categories_p_combos = [tuple(t) for item in all_categories_p_combos for t in item]
    tmp = dict(Counter(all_categories_p_combos))
    sorted_categories_combinations = list(sorted(tmp.items(), key=lambda x: x[1], reverse=True))
    if output_freq:
        return sorted_categories_combinations[:topk]
    else:
        return [t[0] for t in sorted_categories_combinations[:topk]]


def get_wide_features(df):
    def categories_to_binary_output(categories):
        binary_output = [0 for _ in range(len(selected_categories_to_idx))]
        for category in categories.split(', '):
            if category in selected_categories_to_idx:
                binary_output[selected_categories_to_idx[category]] = 1
            else:
                binary_output[0] = 1
        return binary_output
    def categories_cross_transformation(categories):
        current_category_set = set(categories.split(', '))
        corss_transform_output = [0 for _ in range(len(top_combinations))]
        for k, comb_k in enumerate(top_combinations):
            if len(current_category_set & comb_k) == len(comb_k):
                corss_transform_output[k] = 1
            else:
                corss_transform_output[k] = 0
        return corss_transform_output

    category_binary_features = np.array(df.item_categories.apply(
        lambda x: categories_to_binary_output(x)).values.tolist())
    category_corss_transform_features = np.array(df.item_categories.apply(
        lambda x: categories_cross_transformation(x)).values.tolist())
    return np.concatenate((category_binary_features, category_corss_transform_features), axis=1)


if __name__ == "__main__":
    tr_df = pd.read_csv("data/train.csv")
    val_df = pd.read_csv("data/valid.csv")
    te_df = pd.read_csv("data/test.csv")

    tr_ratings = tr_df.stars.values
    val_ratings = val_df.stars.values

    user_df = pd.read_json("data/user.json")
    item_df = pd.read_json("data/business.json")
    user_df = user_df.rename(index=str, columns={t: 'user_' + t for t in user_df.columns if t != 'user_id'})
    item_df = item_df.rename(index=str, columns={t: 'item_' + t for t in item_df.columns if t != 'business_id'})

    tr_df["index"] = tr_df.index
    val_df["index"]  = val_df.index
    te_df["index"] = te_df.index
    tr_df = pd.merge(pd.merge(tr_df, user_df, on='user_id'), item_df, on='business_id').sort_values(by=['index']).reset_index(drop=True)
    val_df = pd.merge(pd.merge(val_df, user_df, on='user_id'), item_df, on='business_id').sort_values(by=['index']).reset_index(drop=True)
    te_df = pd.merge(pd.merge(te_df, user_df, on='user_id'), item_df, on='business_id').sort_values(by=['index']).reset_index(drop=True)

    # Continuous features
    print("Prepare continuous features...")
    continuous_columns = ["user_average_stars",
#                           "user_cool", 
#                           "user_fans", 
                          "user_review_count", "user_useful",
#                           "user_funny",
#                           "item_is_open", "item_latitude", "item_longitude", 
                          "item_review_count", "item_stars"]

    tr_continuous_features = get_continuous_features(tr_df, continuous_columns)
    val_continuous_features = get_continuous_features(val_df, continuous_columns)
    te_continuous_features = get_continuous_features(te_df, continuous_columns)
    scaler = StandardScaler().fit(tr_continuous_features)
    tr_continuous_features = scaler.transform(tr_continuous_features)
    val_continuous_features = scaler.transform(val_continuous_features)
    te_continuous_features = scaler.transform(te_continuous_features)

    # Deep features
    print("Prepare deep features...")
    item_deep_columns = ["item_city", "item_postal_code", "item_state"]
    item_deep_vocab_lens = []
    for col_name in item_deep_columns:
        tmp = item_df[col_name].unique()
        vocab = dict(zip(tmp, range(1, len(tmp) + 1)))
        item_deep_vocab_lens.append(len(vocab) + 1)
        item_df[col_name + "_idx"] = item_df[col_name].apply(lambda x: vocab[x] if x in vocab else 0)
    item_deep_idx_columns = [t + "_idx" for t in item_deep_columns]
    item_to_deep_features = dict(zip(item_df.business_id.values, item_df[item_deep_idx_columns].values.tolist()))
    tr_deep_features = np.array(tr_df.business_id.apply(lambda x: item_to_deep_features[x]).values.tolist())
    val_deep_features = np.array(val_df.business_id.apply(lambda x: item_to_deep_features[x]).values.tolist())
    te_deep_features = np.array(te_df.business_id.apply(lambda x: item_to_deep_features[x]).values.tolist())

    # Wide (Category) features
    
    # How to use attribute column for item
    print("Prepare wide features...")
    #   Prepare binary encoding for each selected categories
    all_categories = [category for category_list in item_df.item_categories.values for category in category_list.split(", ")]
    category_sorted = sorted(Counter(all_categories).items(), key=lambda x: x[1], reverse=True)
    selected_categories = [t[0] for t in category_sorted[:900]]
    selected_categories_to_idx = dict(zip(selected_categories, range(1, len(selected_categories) + 1)))
    selected_categories_to_idx['unk'] = 0
    idx_to_selected_categories = {val: key for key, val in selected_categories_to_idx.items()}
    #   Prepare Cross transformation for each categories
    top_combinations = []
    top_combinations += get_top_k_p_combinations(tr_df, 2, 200, output_freq=False)
    top_combinations += get_top_k_p_combinations(tr_df, 3, 30, output_freq=False)
    top_combinations += get_top_k_p_combinations(tr_df, 4, 20, output_freq=False)
    top_combinations += get_top_k_p_combinations(tr_df, 5, 10, output_freq=False)
    
    top_combinations = [set(t) for t in top_combinations]

    tr_wide_features = get_wide_features(tr_df)
    val_wide_features = get_wide_features(val_df)
    te_wide_features = get_wide_features(te_df)

    # Build input
    tr_features = []
    tr_features.append(tr_continuous_features.tolist())
    tr_features += [tr_deep_features[:,i].tolist() for i in range(len(tr_deep_features[0]))]
    tr_features.append(tr_wide_features.tolist())
    val_features = []
    val_features.append(val_continuous_features.tolist())
    val_features += [val_deep_features[:,i].tolist() for i in range(len(val_deep_features[0]))]
    val_features.append(val_wide_features.tolist())
    te_features = []
    te_features.append(te_continuous_features.tolist())
    te_features += [te_deep_features[:,i].tolist() for i in range(len(te_deep_features[0]))]
    te_features.append(te_wide_features.tolist())

    # Model training
    deepwide_model = build_deepwide_model(
        len(tr_continuous_features[0]),
        item_deep_vocab_lens,  
        len(tr_wide_features[0]), 
        embed_size=100)
    deepwide_model.compile(optimizer='adagrad', loss='mse')
    history = deepwide_model.fit(
        tr_features, 
        tr_ratings, 
        epochs=1, verbose=1, callbacks=[ModelCheckpoint('model.h5')])

    # Make Prediction
    y_pred = deepwide_model.predict(tr_features)
    print("TRAIN RMSE: ", rmse(y_pred, tr_ratings))
    y_pred = deepwide_model.predict(val_features)
    print("VALID RMSE: ", rmse(y_pred, val_ratings))
#     y_pred = deepwide_model.predict(te_features)
#     res_df = pd.DataFrame()
#     res_df['pred'] = y_pred[:, 0]
#     res_df.to_csv("{}.csv".format(STUDENT_ID), index=False)
#     print("Writing test predictions to file done.")



Prepare continuous features...
Prepare deep features...
Prepare wide features...
Epoch 1/1
TRAIN RMSE:  1.0241602816149316
VALID RMSE:  1.031410890952519


In [34]:
# total features available 
print(len(user_df.columns))
list(user_df.columns)

# selected features
# ["user_average_stars", "user_cool", "user_fans", 
#                           "user_review_count", "user_useful", "user_funny",
#                           "item_is_open", "item_latitude", "item_longitude", 
#                           "item_review_count", "item_stars"]

21


['user_average_stars',
 'user_compliment_cool',
 'user_compliment_cute',
 'user_compliment_funny',
 'user_compliment_hot',
 'user_compliment_list',
 'user_compliment_more',
 'user_compliment_note',
 'user_compliment_photos',
 'user_compliment_plain',
 'user_compliment_profile',
 'user_compliment_writer',
 'user_cool',
 'user_elite',
 'user_fans',
 'user_funny',
 'user_name',
 'user_review_count',
 'user_useful',
 'user_id',
 'user_yelping_since']

Unnamed: 0,user_average_stars,user_compliment_cool,user_compliment_cute,user_compliment_funny,user_compliment_hot,user_compliment_list,user_compliment_more,user_compliment_note,user_compliment_photos,user_compliment_plain,...,user_compliment_writer,user_cool,user_elite,user_fans,user_funny,user_name,user_review_count,user_useful,user_id,user_yelping_since
1019,4.14,0,0,0,0,0,0,0,0,0,...,0,1,,0,1,Lynne,35,8,110362ea155bc27f2faceb6dc76b4e5c,2015-08-02 14:37:56
1022,3.37,0,0,0,0,0,0,2,0,0,...,0,32,,0,23,Natasha,39,56,b29e7ca89b94b87372bb7f2f619d0370,2015-08-31 14:55:38
1025,4.28,0,0,0,1,0,0,1,0,1,...,0,0,,0,8,Elle,24,7,d1799915f2d5bc12f001dcb9df103042,2016-03-21 03:46:05
1027,4.16,0,0,0,0,0,0,1,0,0,...,0,17,,0,13,Tami,30,34,fa8b8042848b2d876fb28ca6652cf22d,2014-07-17 23:02:28
1055,3.63,0,0,0,1,0,1,0,0,3,...,0,11,,0,11,Rngox,96,81,4b5f6fe3906ac1270cdf81b40b6e3a0f,2011-08-21 18:54:41
1056,3.88,0,0,0,0,0,0,0,0,1,...,0,17,,0,12,Butch,32,44,b58d81735f484d67a77308b74a3722ca,2009-04-08 21:03:08
1058,4.00,0,0,0,0,0,1,0,0,0,...,0,5,,0,3,Mishell,27,32,5e20ce6186496644908ea094798ff04b,2015-04-15 01:01:08
1063,3.93,0,0,0,1,0,0,2,0,0,...,1,12,,0,8,C,43,97,8f01755e4582caf9b2b4d55bb4b0880a,2008-10-09 13:58:09
1066,3.96,0,0,0,0,0,0,0,0,2,...,0,14,,0,8,Winna,23,36,eeff9485f8ca9445268745967b651fe5,2015-02-20 18:57:59
1070,4.00,0,0,0,0,0,0,0,0,0,...,0,7,,0,4,Rilo,40,26,19691083b44502f78787d7d26bfe5701,2014-09-02 16:33:11


In [21]:
user_df

Unnamed: 0,user_average_stars,user_compliment_cool,user_compliment_cute,user_compliment_funny,user_compliment_hot,user_compliment_list,user_compliment_more,user_compliment_note,user_compliment_photos,user_compliment_plain,...,user_compliment_writer,user_cool,user_elite,user_fans,user_funny,user_name,user_review_count,user_useful,user_id,user_yelping_since
0,3.63,1,0,1,1,0,0,0,0,0,...,0,16,,4,22,Jenna,33,48,88422913727e71e88611fdfe3512fa03,2013-02-21 22:29:06
1,3.48,1,0,1,0,0,1,2,0,2,...,0,22,,3,16,Edie,77,71,e9567f1e494c12c4bed031c792a822d0,2009-10-26 01:00:40
10,3.91,67,5,67,50,3,5,28,8,39,...,16,411,,31,507,Andrew,311,632,9db7bc1b71703a41fd213acfcf886ee0,2007-07-14 00:50:13
100,3.89,5,0,5,0,0,1,7,17,7,...,3,215,20172018,21,127,Melody,337,581,356638928087978fbdadf2569dcee490,2012-12-13 03:24:06
1000,4.43,16,0,16,1,0,7,9,6,1,...,10,98,201620172018,6,64,Brady,137,162,a821031889b7e2a9cf491ee9bf13bd06,2011-03-26 14:05:58
1001,3.62,34,0,34,14,0,3,9,3,22,...,17,152,2015,10,106,Amy,111,266,c7773a50b020dd4827fc7930c586f8b2,2011-03-10 14:58:02
1002,3.59,0,0,0,1,0,1,3,1,2,...,2,72,20172018,4,38,Christine,128,135,54eea20a91548f256902a2a144e31cee,2014-05-08 08:41:58
1003,4.38,12,0,12,3,0,1,3,1,5,...,9,145,201320142015201620172018,10,100,Benjamin,316,359,b21ebde7fa59bfc3212b900399bff289,2010-01-21 18:38:34
1004,3.91,0,0,0,0,0,1,0,0,2,...,2,33,,2,604,Yue,55,96,0684846126a11236d848ff5b395298c4,2011-11-30 06:04:06
1005,4.11,14,0,14,2,0,0,6,2,10,...,3,120,,13,49,Holli,205,345,e56c7bf6b9baaaacfbd101c1e2191123,2011-01-04 15:03:06


In [24]:
item_df

Unnamed: 0,item_address,item_attributes,business_id,item_categories,item_city,item_hours,item_is_open,item_latitude,item_longitude,item_name,item_postal_code,item_review_count,item_stars,item_state,item_city_idx,item_postal_code_idx,item_state_idx
0,30 Eglinton Avenue W,"{'RestaurantsReservations': 'True', 'GoodForMe...",23f2cd62b65e6db173f5f40ed9f13a33,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,"{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",1,43.605499,-79.652289,Emerald Chinese Restaurant,L5R 3E7,128,2.5,ON,1,1,1
1,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",f182f44b05b28d84ad291e4ed2fe5b6a,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC,2,2,2
10,322 Adelaide Street W,"{'Alcohol': 'u'beer_and_wine'', 'BikeParking':...",109ec8201ca31675ace1d83244c3413b,"Soup, Restaurants, Sandwiches",Toronto,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,43.647585,-79.392032,Ravi Soups,M5V 1R1,332,4.5,ON,3,3,1
100,"1005 S Arizona Ave, Ste 9","{'RestaurantsPriceRange2': '2', 'ByAppointment...",20879016f0fbd00b580741888438d5b4,"Eyelash Service, Hair Removal, Beauty & Spas, ...",Chandler,"{'Monday': '9:0-19:0', 'Tuesday': '9:0-19:0', ...",1,33.290984,-111.841126,Mirage Nails & Spa,85286,121,3.5,AZ,4,4,3
1000,15233 N Kierland Blvd,"{'RestaurantsAttire': ''dressy'', 'Restaurants...",79ba35a8e9c56de694874c37689a8a6a,"American (Traditional), Steakhouses, Restauran...",Scottsdale,"{'Monday': '17:0-22:0', 'Tuesday': '17:0-22:0'...",1,33.625352,-111.926167,Morton's The Steakhouse,85254,173,3.0,AZ,5,5,3
10000,"210 E Trade St, Ste 104A","{'BusinessAcceptsCreditCards': 'True', 'Restau...",160097fe729a2caa29cda43017312a6f,"Mexican, Restaurants",Charlotte,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,35.225882,-80.841993,Vida Mexican Kitchen y Cantina,28202,393,3.5,NC,2,6,2
10001,7050 W Ray Rd,"{'BikeParking': 'True', 'RestaurantsGoodForGro...",4c12a8b12c9e0c2d161848a1c33d80b0,"Burgers, Restaurants, Fast Food",Chandler,"{'Monday': '10:30-1:0', 'Tuesday': '10:30-1:0'...",1,33.320455,-111.964159,In-N-Out Burger,85226,164,4.0,AZ,4,7,3
10002,6140 Som Center Rd,"{'RestaurantsTableService': 'True', 'WiFi': 'u...",0beb5531e8212a8846d80915e85164dc,"Mexican, Restaurants",Solon,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",0,41.392161,-81.438536,Pacos Tacos,44139,135,2.5,OH,6,8,4
10003,"9711 S Eastern Ave, Ste H-11","{'GoodForKids': 'True', 'BusinessAcceptsCredit...",310ff8a69d91390620bb37e451977e3d,"Hair Salons, Barbers, Beauty & Spas, Hair Styl...",Las Vegas,"{'Monday': '10:0-20:0', 'Tuesday': '10:0-20:0'...",1,36.012499,-115.119687,Cool Cuts 4 Kids,89123,168,4.0,NV,7,9,5
10004,8475 S Emerald Dr,"{'BusinessParking': '{'garage': False, 'street...",8bad83206fd5fafdd8d3d6f921460c05,"Gun/Rifle Ranges, Active Life, Shopping, Guns ...",Tempe,"{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ...",1,33.338890,-111.968500,C2 Tactical,85284,245,4.0,AZ,8,10,3


In [129]:
from collections import Counter
from itertools import combinations
from math import sqrt
import random
from keras.layers import Concatenate, Dense, Dot, Dropout, Embedding, Input, Reshape
from keras.models import Model
from keras.callbacks import Callback, ModelCheckpoint
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import tensorflow


STUDENT_ID = '23846183'

random.seed(2019)
np.random.seed(2019)
tensorflow.set_random_seed(2019)


# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))


def build_deepwide_model(len_continuous, deep_vocab_lens, len_wide, embed_size):
    input_list = []
    continuous_input = Input(shape=(len_continuous,), dtype='float32', name='continuous_input')
    input_list.append(continuous_input)

    emb_list = []
    for vocab_size in deep_vocab_lens:
        _input = Input(shape=(1,), dtype='int32')
        input_list.append(_input)
        _emb = Embedding(output_dim=embed_size, input_dim=vocab_size, input_length=1)(_input)
        _emb = Reshape((embed_size,))(_emb)
        emb_list.append(_emb)

    deep_input = Concatenate()(emb_list + [continuous_input])
    
    dense_1 = Dense(100, activation='relu')(deep_input)
    dense_1_dp = Dropout(0.01)(dense_1)
    dense_2 = Dense(64, activation='relu')(dense_1_dp)
    dense_2_dp = Dropout(0.01)(dense_2)
    dense_3 = Dense(64, activation='relu')(dense_2_dp)
    dense_3_dp = Dropout(0.01)(dense_3)

    wide_input = Input(shape=(len_wide,), dtype='float32')
    input_list.append(wide_input)
    
    fc_input = Concatenate()([dense_3_dp, wide_input])
    # out_fc = Dense(256, activation='relu')(fc_input)
    model_output = Dense(1)(fc_input)
    model = Model(inputs=input_list,
                  outputs=model_output)
    return model


def get_continuous_features(df, continuous_columns):
    continuous_features = df[continuous_columns].values
    return continuous_features


def get_top_k_p_combinations(df, comb_p, topk, output_freq=False):
    def get_category_combinations(categories_str, comb_p=2):
        categories = categories_str.split(', ')
        return list(combinations(categories, comb_p))
    all_categories_p_combos = df["item_categories"].apply(
        lambda x: get_category_combinations(x, comb_p)).values.tolist()
    all_categories_p_combos = [tuple(t) for item in all_categories_p_combos for t in item]
    tmp = dict(Counter(all_categories_p_combos))
    sorted_categories_combinations = list(sorted(tmp.items(), key=lambda x: x[1], reverse=True))
    if output_freq:
        return sorted_categories_combinations[:topk]
    else:
        return [t[0] for t in sorted_categories_combinations[:topk]]


def get_wide_features(df):
    def categories_to_binary_output(categories):
        binary_output = [0 for _ in range(len(selected_categories_to_idx))]
        for category in categories.split(', '):
            if category in selected_categories_to_idx:
                binary_output[selected_categories_to_idx[category]] = 1
            else:
                binary_output[0] = 1
        return binary_output
    def categories_cross_transformation(categories):
        current_category_set = set(categories.split(', '))
        corss_transform_output = [0 for _ in range(len(top_combinations))]
        for k, comb_k in enumerate(top_combinations):
            if len(current_category_set & comb_k) == len(comb_k):
                corss_transform_output[k] = 1
            else:
                corss_transform_output[k] = 0
        return corss_transform_output

    category_binary_features = np.array(df.item_categories.apply(
        lambda x: categories_to_binary_output(x)).values.tolist())
    category_corss_transform_features = np.array(df.item_categories.apply(
        lambda x: categories_cross_transformation(x)).values.tolist())
    return np.concatenate((category_binary_features, category_corss_transform_features), axis=1)

In [122]:

tr_df = pd.read_csv("data/train.csv")
val_df = pd.read_csv("data/valid.csv")
te_df = pd.read_csv("data/test.csv")

tr_ratings = tr_df.stars.values
val_ratings = val_df.stars.values

user_df = pd.read_json("data/user.json")
item_df = pd.read_json("data/business.json")
user_df = user_df.rename(index=str, columns={t: 'user_' + t for t in user_df.columns if t != 'user_id'})
item_df = item_df.rename(index=str, columns={t: 'item_' + t for t in item_df.columns if t != 'business_id'})

tr_df["index"] = tr_df.index
val_df["index"]  = val_df.index
te_df["index"] = te_df.index
tr_df = pd.merge(pd.merge(tr_df, user_df, on='user_id'), item_df, on='business_id').sort_values(by=['index']).reset_index(drop=True)
val_df = pd.merge(pd.merge(val_df, user_df, on='user_id'), item_df, on='business_id').sort_values(by=['index']).reset_index(drop=True)
te_df = pd.merge(pd.merge(te_df, user_df, on='user_id'), item_df, on='business_id').sort_values(by=['index']).reset_index(drop=True)

# Continuous features
print("Prepare continuous features...")
continuous_columns = ["user_average_stars", "user_cool", "user_fans", 
                      "user_review_count", "user_useful", "user_funny",
                      "item_is_open", "item_latitude", "item_longitude", 
                      "item_review_count", "item_stars"]
tr_continuous_features = get_continuous_features(tr_df, continuous_columns)
val_continuous_features = get_continuous_features(val_df, continuous_columns)
te_continuous_features = get_continuous_features(te_df, continuous_columns)
scaler = StandardScaler().fit(tr_continuous_features)
tr_continuous_features = scaler.transform(tr_continuous_features)
val_continuous_features = scaler.transform(val_continuous_features)
te_continuous_features = scaler.transform(te_continuous_features)

# Deep features
print("Prepare deep features...")
# item_deep_columns = ["item_city", "item_postal_code", "item_state"]
item_deep_columns = ["item_city", "item_postal_code", "item_state"]
item_deep_vocab_lens = []
for col_name in item_deep_columns:
    tmp = item_df[col_name].unique()
    vocab = dict(zip(tmp, range(1, len(tmp) + 1)))
    item_deep_vocab_lens.append(len(vocab) + 1)
    item_df[col_name + "_idx"] = item_df[col_name].apply(lambda x: vocab[x] if x in vocab else 0)
item_deep_idx_columns = [t + "_idx" for t in item_deep_columns]
item_to_deep_features = dict(zip(item_df.business_id.values, item_df[item_deep_idx_columns].values.tolist()))
tr_deep_features = np.array(tr_df.business_id.apply(lambda x: item_to_deep_features[x]).values.tolist())
val_deep_features = np.array(val_df.business_id.apply(lambda x: item_to_deep_features[x]).values.tolist())
te_deep_features = np.array(te_df.business_id.apply(lambda x: item_to_deep_features[x]).values.tolist())

# Wide (Category) features
print("Prepare wide features...")
#   Prepare binary encoding for each selected categories
all_categories = [category for category_list in item_df.item_categories.values for category in category_list.split(", ")]
category_sorted = sorted(Counter(all_categories).items(), key=lambda x: x[1], reverse=True)
selected_categories = [t[0] for t in category_sorted[:00]]
selected_categories_to_idx = dict(zip(selected_categories, range(1, len(selected_categories) + 1)))
selected_categories_to_idx['unk'] = 0
idx_to_selected_categories = {val: key for key, val in selected_categories_to_idx.items()}
#   Prepare Cross transformation for each categories
top_combinations = []
top_combinations += get_top_k_p_combinations(tr_df, 2, 200, output_freq=False)
top_combinations += get_top_k_p_combinations(tr_df, 3, 80, output_freq=False)
top_combinations += get_top_k_p_combinations(tr_df, 4, 40, output_freq=False)
top_combinations += get_top_k_p_combinations(tr_df, 5, 20, output_freq=False)

top_combinations = [set(t) for t in top_combinations]

tr_wide_features = get_wide_features(tr_df)
val_wide_features = get_wide_features(val_df)
te_wide_features = get_wide_features(te_df)

# Build input
tr_features = []
tr_features.append(tr_continuous_features.tolist())
tr_features += [tr_deep_features[:,i].tolist() for i in range(len(tr_deep_features[0]))]
tr_features.append(tr_wide_features.tolist())
val_features = []
val_features.append(val_continuous_features.tolist())
val_features += [val_deep_features[:,i].tolist() for i in range(len(val_deep_features[0]))]
val_features.append(val_wide_features.tolist())
te_features = []
te_features.append(te_continuous_features.tolist())
te_features += [te_deep_features[:,i].tolist() for i in range(len(te_deep_features[0]))]
te_features.append(te_wide_features.tolist())




Prepare continuous features...
Prepare deep features...
Prepare wide features...


In [123]:
# Model training
print(len(tr_continuous_features[0]))
print(tr_continuous_features[0])
print(item_deep_vocab_lens)
print(len(tr_wide_features[0]))

11
[-1.45650354 -0.13715845  0.37009944  1.56067061  0.10171985 -0.0509457
  0.33516366 -0.13118059 -0.66911938  2.89952001  0.32439266]
[187, 1436, 14]
841


In [124]:
deepwide_model = build_deepwide_model(
    len(tr_continuous_features[0]),
    item_deep_vocab_lens,  
    len(tr_wide_features[0]), 
    embed_size=110)
deepwide_model.compile(optimizer='adagrad', loss='mse')
history = deepwide_model.fit(
    tr_features, 
    tr_ratings,
    batch_size=1024,
    epochs=5, verbose=1, callbacks=[ModelCheckpoint('model.h5')])
# ,validation_data=(val_features,val_ratings)
# Make Prediction
y_pred = deepwide_model.predict(tr_features)                                        
print("TRAIN RMSE: ", rmse(y_pred, tr_ratings))
y_pred = deepwide_model.predict(val_features)
print("VALID RMSE: ", rmse(y_pred, val_ratings))
#     y_pred = deepwide_model.predict(te_features)
#     res_df = pd.DataFrame()
#     res_df['pred'] = y_pred[:, 0]
#     res_df.to_csv("{}.csv".format(STUDENT_ID), index=False)
#     print("Writing test predictions to file done.")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
TRAIN RMSE:  1.0221280698609931
VALID RMSE:  1.040743941618628
