In [4]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf

import time

In [0]:
path="/content/drive/My Drive/yelp_final_data/"

### In this notebook, we train and test the Wide and Deep model on the large dataset. 
Outline of the notebook:
1. Train W&D model then test
2. Train Wide part separately then test
3. Train Deep part separately then test
4. Train the W&D model and test on the segmented user and business data

### Build model using tensorflow.

We have tried different features we use in the Wide and Deep part of the network; in this notebook, we only demonstrate our final choices.

In [0]:
CATEGORICAL_COLUMNS = ["user_id", "business_id",'city','state','categories']
CONTINUOUS_COLUMNS = ['index',"average_stars",'useful_review','funny_review','cool_review','compliment_more','compliment_cute',
'num_friends','stars','useful',
'funny','cool','fans','compliment_funny']
#CONTINUOUS_COLUMNS=[]
SURVIVED_COLUMN = "rating_review"

def build_estimator(model_dir,df,model_type=None):
  """Build an estimator."""
  # Categorical columns
  user_id = tf.contrib.layers.sparse_column_with_keys(column_name="user_id",
                                                     keys=[str(i) for i in list(np.unique(df['user_id']))])
  business_id = tf.contrib.layers.sparse_column_with_keys(column_name="business_id",
                                                   keys=[str(i) for i in list(np.unique(df['business_id']))])
  city= tf.contrib.layers.sparse_column_with_keys(column_name="city",
                                                     keys=[str(i) for i in list(np.unique(df['city']))])
  state= tf.contrib.layers.sparse_column_with_keys(column_name="state",
                                                     keys=[str(i) for i in list(np.unique(df['state']))])

  city2 = tf.contrib.layers.sparse_column_with_hash_bucket(
      "city", hash_bucket_size=len(np.unique(df['city'])))
  state2 = tf.contrib.layers.sparse_column_with_hash_bucket(
      "state", hash_bucket_size=700)
  categories = tf.contrib.layers.sparse_column_with_hash_bucket(
      "categories", hash_bucket_size=100)


  # Continuous columns
  average_stars = tf.contrib.layers.real_valued_column("average_stars")
  useful_review = tf.contrib.layers.real_valued_column("useful_review")
  compliment_more = tf.contrib.layers.real_valued_column("compliment_more")
  compliment_cute = tf.contrib.layers.real_valued_column("compliment_cute")
  num_friends = tf.contrib.layers.real_valued_column("num_friends")
  funny_review = tf.contrib.layers.real_valued_column("funny_review")
  cool_review = tf.contrib.layers.real_valued_column("cool_review")
  stars = tf.contrib.layers.real_valued_column("stars")
  #freq_business = tf.contrib.layers.real_valued_column("freq_business")
  #freq_user = tf.contrib.layers.real_valued_column("freq_user")
  useful = tf.contrib.layers.real_valued_column("useful")
  funny = tf.contrib.layers.real_valued_column("funny")
  cool = tf.contrib.layers.real_valued_column("cool")
  fans = tf.contrib.layers.real_valued_column("fans")
  compliment_funny = tf.contrib.layers.real_valued_column("compliment_funny")
  
  # Transformations.
 
   # Wide columns and deep columns.
  wide_columns=[average_stars,useful_review,compliment_more,compliment_cute,num_friends,city2,
                funny_review,cool_review,stars,state2,categories,
                tf.contrib.layers.crossed_column(
                      [city, state],
                      hash_bucket_size=int(1e4))]


  deep_columns = [
      tf.contrib.layers.embedding_column(user_id, dimension=32),
      tf.contrib.layers.embedding_column(business_id, dimension=32),
      tf.contrib.layers.embedding_column(city, dimension=8),
      #tf.contrib.layers.embedding_column(state, dimension=8),
      average_stars,
      compliment_more,
      stars,
      #freq_business,
      #freq_user,
      useful,
      funny,
      cool,
      fans,
      compliment_funny,
      
 
  ]
  optimazer=tf.train.ProximalAdagradOptimizer(learning_rate=0.1,
                                              l1_regularization_strength=0.001,
                                              l2_regularization_strength=0.001)
    # learning_rate=tf.exponential_decay(
    #     learning_rate=0.1,
    #     global_step=tf.get_global_step(),
    #     decay_steps=1000,
    #     decay_rate=0.96))
    
  if model_type=='DEEP':
    # return tf.contrib.learn.DNNRegressor(feature_columns=deep_columns,
    #                                     hidden_units=[100,50])
    return tf.estimator.DNNRegressor(feature_columns=deep_columns,hidden_units=[100,50,50])

  if model_type=='WD':

    return tf.estimator.DNNLinearCombinedRegressor(
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[100,100,50,25],
        batch_norm=True,
        dnn_dropout=0.5,
        dnn_optimizer=optimazer
      )
    
  # tf.train.ProximalAdagradOptimizer(
  #   learning_rate=0.1,
  #   l1_regularization_strength=0.001,
  #   l2_regularization_strength=0.001)
# To apply learning rate decay, you can set dnn_optimizer to a callable:
    
  
    
  if model_type=='Linear':
   return tf.estimator.LinearRegressor(feature_columns=wide_columns)
    # return tf.contrib.learn.LinearRegressor(
            
    #         feature_columns=wide_columns)


  # return tf.contrib.learn.DNNLinearCombinedClassifier(
  #       linear_feature_columns=wide_columns,
  #       dnn_feature_columns=deep_columns,
  #       dnn_hidden_units=[100, 50])

def input_fn(df, train=False):
  """Input builder function."""
  # Creates a dictionary mapping from each continuous feature column name (k) to
  # the values of that column stored in a constant Tensor.
  if len(CONTINUOUS_COLUMNS)>0:
    continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}
  # Creates a dictionary mapping from each categorical feature column name (k)
  # to the values of that column stored in a tf.SparseTensor.
  else:
    continuous_cols=[]

  if len(CATEGORICAL_COLUMNS)>0:
    categorical_cols = {k: tf.SparseTensor(
      indices=[[i, 0] for i in range(df[k].size)],
      values=df[k].values,
      dense_shape=[df[k].size, 1])
                        for k in CATEGORICAL_COLUMNS}
  else:
    categorical_cols=[]
  # Merges the two dictionaries into one.
  feature_cols = dict(continuous_cols)
  feature_cols.update(categorical_cols)
  # Converts the label column into a constant Tensor.
  if train:
    label = tf.constant(df[SURVIVED_COLUMN].values)
      # Returns the feature columns and the label.
    return feature_cols, label
  else:
    return feature_cols

In [0]:
def train_input_fn6():
    titanic_file=path+'new_train.csv'
    titanic = tf.data.experimental.make_csv_dataset(
        titanic_file, batch_size=128,
        label_name="rating_review")
    titanic_batches = (
        titanic.cache().repeat().shuffle(500)
        .prefetch(tf.data.experimental.AUTOTUNE))
    return titanic_batches

In [0]:
tf.logging.set_verbosity(tf.logging.INFO)

In [0]:
df_train = pd.read_csv(
      tf.gfile.Open(path+"new_train.csv"),
      skipinitialspace=True)

In [21]:
#tf.logging.set_verbosity(tf.logging.INFO)
# df_train = pd.read_csv(
#       tf.gfile.Open(path+"new_train.csv"),
#       skipinitialspace=True)
# # df_test = pd.read_csv(
# #     tf.gfile.Open(path+"test4.csv"),
# #     skipinitialspace=True)

def create_model_dir(model_type):
  return 'models/model_' + model_type + '_' + str(int(time.time()))

model_type='WD'
model_dir=create_model_dir(model_type)
print(model_dir)
print("model directory = %s" % model_dir)

m = build_estimator(model_dir=model_dir,df=df_train,model_type=model_type)
m = m.train(input_fn=train_input_fn6,  steps=150000)

models/model_WD_1576700251
model directory = models/model_WD_1576700251
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp95_bbwn2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fdd0e95a550>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worke

In [72]:
m

<tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressor at 0x7fdd0e95beb8>

In [0]:
def test_input_fn6():
    titanic_file=path+'new_test.csv'
    
    titanic = tf.data.experimental.make_csv_dataset(
        titanic_file, batch_size=128,
        num_epochs=1,
        shuffle=False,
        label_name="rating_review")
    # titanic_batches = (
    #     titanic.cache()
    #     .prefetch(tf.data.experimental.AUTOTUNE))
    return titanic

In [0]:
#test4=pd.read_csv(path+'test5_str.csv')
test4=pd.read_csv(path+'new_test.csv')

In [0]:
len(test4)

844095

#### Prediction

In [22]:
start_time=time.time()
answer=[]
for single_prediction in m.predict(test_input_fn6):
  answer.append(single_prediction['predictions'])
  if len(answer)>len(test4):
    print('still wrong')
    break

print("--- %s seconds ---" % (time.time() - start_time))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp95_bbwn2/model.ckpt-150000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
--- 76.66633772850037 seconds ---


In [0]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [16]:
len(np.unique(df_train['user_id']))

281365

In [24]:
len(targets)

844095

#### Get result using a trained W&D model.

In [23]:
targets=test4['rating_review'] #WD 150000 new_test #最后是这个
def rmse(predictions, targets): 
  return np.sqrt((((np.array(predictions)).flatten() - targets) ** 2).mean())

print('RMSE of Wide and Deep',rmse(answer, targets))



def last_rating(preds,y_test):
  #rmse=[]
  pred=[]
  true=[]

  for i in range(0,len(preds),3):
    pred.append(preds[i+2])
    true.append(y_test[i+2])
    #print(pred,true)
  rmse=np.sqrt(mean_squared_error(true,pred))
  mae=mean_absolute_error(y_test,preds)

    
  return rmse,mae
rmse,mae=last_rating((np.array(answer)).flatten(),targets)
print('RMSE of last rating of each user(Wide and Deep)',rmse)
print('MAE of last rating of each user(Wide and Deep)',mae)

def cov(preds,y_test):
  count=0
  for i in range(0,len(preds),3):
    pred_rank=np.argsort([preds[i],preds[i+1],preds[i+2]])
    true_rank=np.argsort([y_test[i],y_test[i+1],y_test[i+2]])
    if (pred_rank==true_rank).all():
      count+=1
  return float(count)/len(np.unique(df_train['user_id']))
  
print('user_coverage',cov((np.array(answer)).flatten(),targets))

RMSE of Wide and Deep 1.1989847493679322
RMSE of last rating of each user(Wide and Deep) 1.19841698167923
MAE of last rating of each user(Wide and Deep) 0.961296367259051
user_coverage 0.3058767081904288


#### Get result using a trained Deep model(no Wide part).

In [0]:
targets=test4['rating_review'] #deep 130000 new_test #use this one
def rmse(predictions, targets): 
  return np.sqrt((((np.array(predictions)).flatten() - targets) ** 2).mean())

print('RMSE of Deep',rmse(answer, targets))



def last_rating(preds,y_test):
  #rmse=[]
  pred=[]
  true=[]

  for i in range(0,len(preds),3):
    pred.append(preds[i+2])
    true.append(y_test[i+2])
    #print(pred,true)
  rmse=np.sqrt(mean_squared_error(true,pred))
  mae=mean_absolute_error(y_test,preds)

    
  return rmse,mae
rmse,mae=last_rating((np.array(answer)).flatten(),targets)
print('RMSE of last rating of each user(Deep)',rmse)
print('MAE of last rating of each user(Deep)',mae)

def cov(preds,y_test):
  count=0
  for i in range(0,len(preds),3):
    pred_rank=np.argsort([preds[i],preds[i+1],preds[i+2]])
    true_rank=np.argsort([y_test[i],y_test[i+1],y_test[i+2]])
    if (pred_rank==true_rank).all():
      count+=1
  return float(count)/len(np.unique(df_train['user_id']))
  
print('user_coverage',cov((np.array(answer)).flatten(),targets))

RMSE of Deep 1.3643451321328977
RMSE of last rating of each user(Deep) 1.3630884798554996
MAE of last rating of each user(Deep) 1.0499056417866814
user_coverage 0.255980665683365


#### Get result using a trained Wide model(no deep part).

In [0]:
targets=test4['rating_review'] #linear 30000 new_test #use this one
def rmse(predictions, targets): 
  return np.sqrt((((np.array(predictions)).flatten() - targets) ** 2).mean())

print('RMSE of linear model',rmse(answer, targets))



def last_rating(preds,y_test):
  #rmse=[]
  pred=[]
  true=[]

  for i in range(0,len(preds),3):
    pred.append(preds[i+2])
    true.append(y_test[i+2])
    #print(pred,true)
  rmse=np.sqrt(mean_squared_error(true,pred))
  mae=mean_absolute_error(y_test,preds)

    
  return rmse,mae
rmse,mae=last_rating((np.array(answer)).flatten(),targets)
print('RMSE of last rating of each user(linear)',rmse)
print('MAE of last rating of each user(linear)',mae)

def cov(preds,y_test):
  count=0
  for i in range(0,len(preds),3):
    pred_rank=np.argsort([preds[i],preds[i+1],preds[i+2]])
    true_rank=np.argsort([y_test[i],y_test[i+1],y_test[i+2]])
    if (pred_rank==true_rank).all():
      count+=1
  return float(count)/len(np.unique(df_train['user_id']))
  
print('user_coverage',cov((np.array(answer)).flatten(),targets))

RMSE of linear model 1.1650703832549998
RMSE of last rating of each user(linear) 1.1644332723255844
MAE of last rating of each user(linear) 0.9162212331133986
user_coverage 0.30446572956835427


### Predict on different user segmentation


In [0]:
def test_input_fn7():
    titanic_file=path+'popular_user.csv'
    
    titanic = tf.data.experimental.make_csv_dataset(
        titanic_file, batch_size=128,
        num_epochs=1,
        shuffle=False,
        label_name="rating_review")
    # titanic_batches = (
    #     titanic.cache()
    #     .prefetch(tf.data.experimental.AUTOTUNE))
    return titanic

In [0]:
test5=pd.read_csv(path+'popular_user.csv')

In [48]:
start_time=time.time()
answer=[]
for single_prediction in m.predict(test_input_fn7):
  answer.append(single_prediction['predictions'])
  

print("--- %s seconds ---" % (time.time() - start_time))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp95_bbwn2/model.ckpt-150000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
--- 20.03504776954651 seconds ---


In [0]:
def rmse(predictions, targets): 
  return np.sqrt((((np.array(predictions)).flatten() - targets) ** 2).mean())

def last_rating(preds,y_test):
  #rmse=[]
  pred=[]
  true=[]

  for i in range(0,len(preds),3):
    pred.append(preds[i+2])
    true.append(y_test[i+2])
    #print(pred,true)
  rmse=np.sqrt(mean_squared_error(true,pred))
  mae=mean_absolute_error(y_test,preds)

    
  return rmse,mae
def cov(preds,y_test,num):
  count=0
  for i in range(0,len(preds),3):
    pred_rank=np.argsort([preds[i],preds[i+1],preds[i+2]])
    true_rank=np.argsort([y_test[i],y_test[i+1],y_test[i+2]])
    if (pred_rank==true_rank).all():
      count+=1
  return float(count)/num
  

In [45]:
print('Unpopular User')
targets=test5['rating_review'] #linear 30000 new_test #use this one

print('RMSE',rmse(answer, targets))



rmse,mae=last_rating((np.array(answer)).flatten(),targets)
print('RMSE of last rating of each user',rmse)
print('MAE of last rating of each user',mae)


print('user_coverage',cov((np.array(answer)).flatten(),targets,143289))

Unpopular User
RMSE 1.2068757124283576
RMSE of last rating of each user 1.2092507013152447
MAE of last rating of each user 0.9683649417227079
user_coverage 0.3148811143911954


In [40]:
print('Midpopular User')
targets=test5['rating_review'] 

print('RMSE',rmse(answer, targets))



rmse,mae=last_rating((np.array(answer)).flatten(),targets)
print('RMSE of last rating of each user',rmse)
print('MAE of last rating of each user',mae)


print('user_coverage',cov((np.array(answer)).flatten(),targets,78897))

Midpopular User
RMSE 1.204787150647708
RMSE of last rating of each user 1.2056538446000455
MAE of last rating of each user 0.9677705804626177
user_coverage 0.31178625296272355


In [50]:
print('Popular User')
targets=test5['rating_review'] 

print('RMSE',rmse(answer, targets))



rmse,mae=last_rating((np.array(answer)).flatten(),targets)
print('RMSE of last rating of each user',rmse)
print('MAE of last rating of each user',mae)


print('user_coverage',cov((np.array(answer)).flatten(),targets,59179))

Popular User
RMSE 1.1717519273131942
RMSE of last rating of each user 1.174809832416539
MAE of last rating of each user 0.9355499882150535
user_coverage 0.3121715473394278


### Predict on different business segmentation

In [0]:
def test_input_fn7():
    titanic_file=path+'midpopular_business.csv'
    
    titanic = tf.data.experimental.make_csv_dataset(
        titanic_file, batch_size=128,
        num_epochs=1,
        shuffle=False,
        label_name="rating_review")
    # titanic_batches = (
    #     titanic.cache()
    #     .prefetch(tf.data.experimental.AUTOTUNE))
    return titanic

In [0]:
test5=pd.read_csv(path+'midpopular_business.csv')

In [64]:
start_time=time.time()
answer=[]
for single_prediction in m.predict(test_input_fn7):
  answer.append(single_prediction['predictions'])
  

print("--- %s seconds ---" % (time.time() - start_time))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp95_bbwn2/model.ckpt-150000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
--- 15.896955966949463 seconds ---


In [0]:
def rmse(predictions, targets): 
  return np.sqrt((((np.array(predictions)).flatten() - targets) ** 2).mean())


In [61]:
print('Unpopular Business')
targets=test5['rating_review'] #linear 30000 new_test #use this one

print('RMSE',rmse(answer, targets))


Unpopular Business
RMSE 1.2397345836405833


In [66]:
print('Midpopular Business')
targets=test5['rating_review'] 

print('RMSE',rmse(answer, targets))


Midpopular Business
RMSE 1.260606719032065


In [56]:
print('Popular Business')
targets=test5['rating_review'] 

print('RMSE',rmse(answer, targets))


Popular Business
RMSE 1.1786618155754103
