In [0]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf

import time

### In this notebook, we train the Wide and Deep model with the small dataset.

In [0]:
path="/content/drive/My Drive/yelp_final_data/"

We have experimented with different combination of features, below are our final choices.

In [0]:
CATEGORICAL_COLUMNS = ["user_id", "business_id",'city','state','categories']
CONTINUOUS_COLUMNS = ["average_stars",'compliment_more','compliment_cute',
'num_friends','stars','useful',
'funny','cool','fans','compliment_funny']
#CONTINUOUS_COLUMNS=[]
SURVIVED_COLUMN = "rating_review"

def build_estimator(model_dir,df,model_type=None):
  """Build an estimator."""
  # Categorical columns
  user_id = tf.contrib.layers.sparse_column_with_keys(column_name="user_id",
                                                     keys=[str(i) for i in list(np.unique(df['user_id']))])
  business_id = tf.contrib.layers.sparse_column_with_keys(column_name="business_id",
                                                   keys=[str(i) for i in list(np.unique(df['business_id']))])
  city= tf.contrib.layers.sparse_column_with_keys(column_name="city",
                                                     keys=[str(i) for i in list(np.unique(df['city']))])
  state= tf.contrib.layers.sparse_column_with_keys(column_name="state",
                                                     keys=[str(i) for i in list(np.unique(df['state']))])

  city2 = tf.contrib.layers.sparse_column_with_hash_bucket(
      "city", hash_bucket_size=len(np.unique(df['city'])))
  state2 = tf.contrib.layers.sparse_column_with_hash_bucket(
      "state", hash_bucket_size=700)
  categories = tf.contrib.layers.sparse_column_with_hash_bucket(
      "categories", hash_bucket_size=100)


  # Continuous columns
  average_stars = tf.contrib.layers.real_valued_column("average_stars")
  #useful_review = tf.contrib.layers.real_valued_column("useful_review")
  compliment_more = tf.contrib.layers.real_valued_column("compliment_more")
  compliment_cute = tf.contrib.layers.real_valued_column("compliment_cute")
  num_friends = tf.contrib.layers.real_valued_column("num_friends")
  #funny_review = tf.contrib.layers.real_valued_column("funny_review")
  #cool_review = tf.contrib.layers.real_valued_column("cool_review")
  stars = tf.contrib.layers.real_valued_column("stars")
  #freq_business = tf.contrib.layers.real_valued_column("freq_business")
  #freq_user = tf.contrib.layers.real_valued_column("freq_user")
  useful = tf.contrib.layers.real_valued_column("useful")
  funny = tf.contrib.layers.real_valued_column("funny")
  cool = tf.contrib.layers.real_valued_column("cool")
  fans = tf.contrib.layers.real_valued_column("fans")
  compliment_funny = tf.contrib.layers.real_valued_column("compliment_funny")
  
  # Transformations.

   # Wide columns and deep columns.
  wide_columns=[average_stars,compliment_more,compliment_cute,num_friends,city2,
                stars,state2,categories,
                tf.contrib.layers.crossed_column(
                      [city, state],
                      hash_bucket_size=int(1e4))]


  deep_columns = [
      tf.contrib.layers.embedding_column(user_id, dimension=32),
      tf.contrib.layers.embedding_column(business_id, dimension=32),
      tf.contrib.layers.embedding_column(city, dimension=8),
      #tf.contrib.layers.embedding_column(state, dimension=8),
      average_stars,
      compliment_more,
      stars,
      #freq_business,
      #freq_user,
      useful,
      funny,
      cool,
      fans,
      compliment_funny,
      
  
  ]
  optimazer=tf.train.ProximalAdagradOptimizer(learning_rate=0.1,
                                              l1_regularization_strength=0.001,
                                              l2_regularization_strength=0.001)
    # learning_rate=tf.exponential_decay(
    #     learning_rate=0.1,
    #     global_step=tf.get_global_step(),
    #     decay_steps=1000,
    #     decay_rate=0.96))
    
  if model_type=='DEEP':
    # return tf.contrib.learn.DNNRegressor(feature_columns=deep_columns,
    #                                     hidden_units=[100,50])
    return tf.estimator.DNNRegressor(feature_columns=deep_columns,hidden_units=[100,50,50])

  if model_type=='WD':

    return tf.estimator.DNNLinearCombinedRegressor(
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[100,100,50,25],
        batch_norm=True,
        dnn_dropout=0.5,
        dnn_optimizer=optimazer
      )
    
  # tf.train.ProximalAdagradOptimizer(
  #   learning_rate=0.1,
  #   l1_regularization_strength=0.001,
  #   l2_regularization_strength=0.001)
# To apply learning rate decay, you can set dnn_optimizer to a callable:
    
  
    
  if model_type=='Linear':
   return tf.estimator.LinearRegressor(feature_columns=wide_columns)
    # return tf.contrib.learn.LinearRegressor(
            
    #         feature_columns=wide_columns)


  # return tf.contrib.learn.DNNLinearCombinedClassifier(
  #       linear_feature_columns=wide_columns,
  #       dnn_feature_columns=deep_columns,
  #       dnn_hidden_units=[100, 50])

def input_fn(df, train=False):
  """Input builder function."""
  # Creates a dictionary mapping from each continuous feature column name (k) to
  # the values of that column stored in a constant Tensor.
  if len(CONTINUOUS_COLUMNS)>0:
    continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}
  # Creates a dictionary mapping from each categorical feature column name (k)
  # to the values of that column stored in a tf.SparseTensor.
  else:
    continuous_cols=[]

  if len(CATEGORICAL_COLUMNS)>0:
    categorical_cols = {k: tf.SparseTensor(
      indices=[[i, 0] for i in range(df[k].size)],
      values=df[k].values,
      dense_shape=[df[k].size, 1])
                        for k in CATEGORICAL_COLUMNS}
  else:
    categorical_cols=[]
  # Merges the two dictionaries into one.
  feature_cols = dict(continuous_cols)
  feature_cols.update(categorical_cols)
  # Converts the label column into a constant Tensor.
  if train:
    label = tf.constant(df[SURVIVED_COLUMN].values)
      # Returns the feature columns and the label.
    return feature_cols, label
  else:
    return feature_cols

In [0]:
def train_input_fn6():
    titanic_file=path+'train_small_WD.csv'
    titanic = tf.data.experimental.make_csv_dataset(
        titanic_file, batch_size=128,
        label_name="rating_review")
    titanic_batches = (
        titanic.cache().repeat().shuffle(500)
        .prefetch(tf.data.experimental.AUTOTUNE))
    return titanic_batches

In [0]:
tf.logging.set_verbosity(tf.logging.INFO)

In [0]:
df_train = pd.read_csv(
      tf.gfile.Open(path+"train_small_WD.csv"),
      skipinitialspace=True)

In [0]:
#tf.logging.set_verbosity(tf.logging.INFO)
# df_train = pd.read_csv(
#       tf.gfile.Open(path+"new_train.csv"),
#       skipinitialspace=True)
# # df_test = pd.read_csv(
# #     tf.gfile.Open(path+"test4.csv"),
# #     skipinitialspace=True)

def create_model_dir(model_type):
  return 'models/model_' + model_type + '_' + str(int(time.time()))

model_type='DEEP'
model_dir=create_model_dir(model_type)
print(model_dir)
print("model directory = %s" % model_dir)

m = build_estimator(model_dir=model_dir,df=df_train,model_type=model_type)
m = m.train(input_fn=train_input_fn6,  steps=210000)

models/model_DEEP_1576612244
model directory = models/model_DEEP_1576612244
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpclkgnw6r', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute

In [0]:
def test_input_fn6():
    titanic_file=path+'test_small_WD.csv'
    
    titanic = tf.data.experimental.make_csv_dataset(
        titanic_file, batch_size=100,
        num_epochs=1,
        shuffle=False,
        label_name="rating_review")
    # titanic_batches = (
    #     titanic.cache()
    #     .prefetch(tf.data.experimental.AUTOTUNE))
    return titanic

In [0]:
test4=pd.read_csv(path+'test_small_WD.csv')

### Prediction

In [0]:
start_time=time.time()
answer=[]
for single_prediction in m.predict(test_input_fn6):
  answer.append(single_prediction['predictions'])
  if len(answer)>len(test4):
    print('still wrong')
    break

print("--- %s seconds ---" % (time.time() - start_time))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpclkgnw6r/model.ckpt-210000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
--- 5.1446373462677 seconds ---


In [0]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

### Test the model

In [0]:
targets=test4['rating_review'] 
def rmse(predictions, targets): 
  return np.sqrt((((np.array(predictions)).flatten() - targets) ** 2).mean())

print('RMSE of wide & deep model',rmse(answer, targets))



def last_rating(preds,y_test):
  #rmse=[]
  pred=[]
  true=[]

  for i in range(0,len(preds),3):
    pred.append(preds[i+2])
    true.append(y_test[i+2])
    #print(pred,true)
  rmse=np.sqrt(mean_squared_error(true,pred))
  mae=mean_absolute_error(y_test,preds)

    
  return rmse,mae
rmse,mae=last_rating((np.array(answer)).flatten(),targets)
print('RMSE of last rating of each user(Wide & deep)',rmse)
print('MAE of last rating of each user(Wide & deep)',mae)

def cov(preds,y_test):
  count=0
  for i in range(0,len(preds),3):
    pred_rank=np.argsort([preds[i],preds[i+1],preds[i+2]])
    true_rank=np.argsort([y_test[i],y_test[i+1],y_test[i+2]])
    if (pred_rank==true_rank).all():
      count+=1
  return count/20000
  
print('user_coverage',cov((np.array(answer)).flatten(),targets))

RMSE of wide & deep model 1.2750024392167978
RMSE of last rating of each user(Wide & deep) 1.2710563023795114
MAE of last rating of each user(Wide & deep) 1.004213202381134
user_coverage 0.2955
