# Foodie Recommender Data Model (V0.1)
We'll choose a set of starter features and attempt to train a two-tower recommender system.

## Data Models

In [1]:
import dateutil
from pydantic import BaseModel, validator
import os
from typing import Any, List

import json


# NOTE(review): this module-level dict looks unused -- every function below
# that mentions `genres` works on its own local variable or parameter of the
# same name, and the mapping itself is persisted in a JSON file.
genres = {}



def save_model_data(model: BaseModel, filename: str) -> str:
    """Write a model's JSON representation to ``filename``.

    ``model.json()`` already returns a serialized JSON string, so it is written
    as-is. Passing that string through ``json.dump`` a second time would store
    one quoted, escaped string literal instead of a JSON object.

    Args:
        model: Object exposing ``.json()`` (a pydantic model in this notebook).
        filename: Destination path; an existing file is overwritten.

    Returns:
        ``filename``, unchanged, so callers can keep track of the saved path.
    """
    with open(filename, 'w') as file:
        file.write(model.json())
    return filename



def create_empty_genres_file(filepath: str) -> dict:
    """Create (or overwrite) ``filepath`` with an empty JSON object.

    Args:
        filepath: Path of the genres file to initialise.

    Returns:
        A fresh empty dict, matching what was just written to disk, so the
        caller can use it as the initial genre mapping.
    """
    with open(filepath, 'w') as file:
        json.dump({}, file)
    return {}
        
        

def load_or_create_genres(genres_file='genres.json') -> dict:
    """Return the genre mapping stored in ``genres_file``.

    When the path is not an existing file, a notice is printed and a new,
    empty genres file is created at that path instead.
    """
    # Guard clause: nothing on disk yet, so start a fresh mapping.
    if not os.path.isfile(genres_file):
        print('file not found', genres_file)
        return create_empty_genres_file(genres_file)

    with open(genres_file, 'r') as handle:
        return json.load(handle)


    
def save_genres(genres: dict, genres_file='genres.json') -> dict:
    """Overwrite ``genres_file`` with ``genres`` serialized as JSON.

    Returns the very same ``genres`` object that was passed in.
    """
    with open(genres_file, 'w+') as out:
        json.dump(genres, fp=out)
    return genres

    
        
def add_element_to_genres(element: str) -> int:
    """Return the integer id of ``element``, registering it if it is new.

    The mapping is read from the default genres file on every call. A new
    element receives ``len(mapping) + 1`` as its id (so ids start at 1) and
    the enlarged mapping is written back to the file.
    """
    known = load_or_create_genres()
    if element in known:
        return known[element]

    # First time this element is seen: assign the next id and persist it.
    known[element] = len(known) + 1
    save_genres(known)
    return known[element]
        
        
        
def map_genre(genre_list: list) -> list:
    """Translate each entry of ``genre_list`` into its integer genre id.

    Entries are processed in order through ``add_element_to_genres``, so
    unseen entries are registered as a side effect.
    """
    return [add_element_to_genres(entry) for entry in genre_list]
        

    
class RestaurantUser(BaseModel):
    """User profile for the query side of the recommender.

    ``user_genres`` is supplied as a list of genres and stored as the list of
    integer ids produced by ``map_genre``.
    """
    user_birth_date: int
    # Ids handed out by add_element_to_genres start at 1, so the default 0
    # presumably stands for "no genre given".
    user_genres: List[Any] = [0]
    user_id: int
    user_occupation: str
    user_gender: bool  # 0: male, 1: female
    user_zip_code: int

    @validator('user_genres')
    def index_or_add(cls, v):
        """Map the supplied genres to integer ids, registering unseen ones.

        Raises:
            ValueError: if the list is empty. An explicit raise is used
                instead of ``assert`` so the check also runs under ``-O``.
        """
        if not v:
            raise ValueError('Must provide list of genre > 0')
        return map_genre(v)

    def save(self, prefix='user') -> str:
        """Write this user to ``<prefix>_<user_id>.json`` and return that name."""
        name = prefix + f"_{self.user_id}.json"
        return save_model_data(self, name)

    
        
class Restaurant(BaseModel):
    """Restaurant profile for the candidate side of the recommender.

    ``restaurant_genres`` is supplied as a list of genres and stored as the
    list of integer ids produced by ``map_genre``.
    """
    restaurant_id: int
    restaurant_title: str
    restaurant_genres: List[Any]
    restaurant_zip_code: int

    @validator('restaurant_genres')
    def index_or_add(cls, v):
        """Map the supplied genres to integer ids, registering unseen ones.

        Raises:
            ValueError: if the list is empty. An explicit raise is used
                instead of ``assert`` so the check also runs under ``-O``.
        """
        if not v:
            raise ValueError('Must provide list of genre > 0')
        return map_genre(v)

    def save(self, prefix='restaurant') -> str:
        """Write this restaurant to ``<prefix>_<restaurant_id>.json``; return the name."""
        name = prefix + f"_{self.restaurant_id}.json"
        return save_model_data(self, name)
    
    
    
class RestaurantRating(BaseModel):
    """One rating given by a user to a restaurant at a point in time."""
    user: RestaurantUser
    restaurant: Restaurant
    # Integer timestamp; intended to be POSIX time, but no conversion is
    # performed yet (see convert_valid_time).
    timestamp: int
    restaurant_rating: int

    @validator('timestamp')
    def convert_valid_time(cls, v: int) -> int:
        """Return the timestamp unchanged.

        NOTE(review): placeholder -- the name suggests date/time inputs should
        be converted here, but the value is currently passed straight through.
        """
        return v

    def save(self, prefix='rating') -> str:
        """Write this rating to disk and return the file name.

        The name is ``<prefix>_<restaurant_id>_<user_id>_<timestamp>.json``.
        The ids are used rather than the nested models themselves, whose
        string form would put every field of both models into the file name.
        """
        name = prefix + (
            f"_{self.restaurant.restaurant_id}"
            f"_{self.user.user_id}"
            f"_{self.timestamp}.json"
        )
        return save_model_data(self, name)

    def flatten(self) -> dict:
        """Merge user fields, restaurant fields and rating fields into one dict.

        Later entries win on key clashes: restaurant fields override user
        fields, and the rating's own two keys override both.
        """
        return {
            **self.user.dict(),
            **self.restaurant.dict(),
            'timestamp': self.timestamp,
            'restaurant_rating': self.restaurant_rating,
        }

    
    

In [2]:
# Sample user. Building the model runs the genre validator, which registers
# any unseen genre in the genres file.
user_profile = dict(
    user_birth_date=20220101,
    user_genres=['vegetarian', 'thai'],
    user_id=1001,
    user_occupation="student",
    user_gender=0,  # 0: male, 1: female
    user_zip_code=84104,
)
user = RestaurantUser(**user_profile)

# Sample restaurant; its genres go through the same id mapping.
restaurant_profile = dict(
    restaurant_id=1,
    restaurant_title="skinnyfats",
    restaurant_genres=['vegetarian', 'thai', 'healthy', 'fried'],
    restaurant_zip_code=84104,
)
restaurant = Restaurant(**restaurant_profile)

# Sample rating tying the two models above together.
rating_profile = dict(
    user=user,
    restaurant=restaurant,
    timestamp=202201012200,  # Converting all date/time to posix integer
    restaurant_rating=10,
)
rating = RestaurantRating(**rating_profile)

In [3]:
# The rating object nests the user and restaurant models.
# flatten() merges their fields with the rating's own timestamp and score
# into a single flat dict, which is easier to handle as one document.
rating.flatten()

{'user_birth_date': 20220101,
 'user_genres': [1, 2],
 'user_id': 1001,
 'user_occupation': 'student',
 'user_gender': False,
 'user_zip_code': 84104,
 'restaurant_id': 1,
 'restaurant_title': 'skinnyfats',
 'restaurant_genres': [1, 2, 3, 4],
 'restaurant_zip_code': 84104,
 'timestamp': 202201012200,
 'restaurant_rating': 10}

In [4]:
# The user model can also be exported as a plain dictionary by calling its
# dict() method directly.
user.dict()

{'user_birth_date': 20220101,
 'user_genres': [1, 2],
 'user_id': 1001,
 'user_occupation': 'student',
 'user_gender': False,
 'user_zip_code': 84104}

## Convert to Tensorflow Dataset
We'll now arbitrarily copy the data and create TensorFlow datasets with which to train new embedding models.

In [5]:
import tensorflow as tf
import numpy as np

In [7]:
# We'll convert the dict values to np arrays first

def values_to_array(input_dict: dict) -> dict:
    """Wrap every value of ``input_dict`` in a NumPy array with a leading axis of 1.

    Each value is placed in a one-element list before conversion, so a scalar
    becomes shape ``(1,)`` and a length-k list becomes shape ``(1, k)``. That
    leading axis is what lets several such dicts be concatenated into rows.

    Args:
        input_dict: Mapping of feature name to a single record's value.

    Returns:
        A new dict with the same keys and ``np.ndarray`` values.
    """
    return {key: np.array([value]) for key, value in input_dict.items()}

# Once the data is in array format, elementwise concatenation will emulate a row
def concatenate_dicts(dicts: List[dict]) -> dict:
    """Concatenate, key by key, the arrays held in a list of dicts.

    The keys of the first dict decide which keys appear in the result; every
    other dict must provide those keys as well.

    Args:
        dicts: Dicts mapping feature name to an array (e.g. the output of
            ``values_to_array``).

    Returns:
        A dict mapping each key to the ``np.concatenate`` of that key's arrays
        in input order. An empty input list yields an empty dict.

    Raises:
        KeyError: if a later dict lacks a key present in the first one.
    """
    if not dicts:
        # Nothing to concatenate; avoid indexing into an empty list.
        return {}

    return {
        key: np.concatenate([d[key] for d in dicts])
        for key in dicts[0]
    }

# Convert to a TensorFlow dataset
def convert_dict_to_tflow(d: dict) -> tf.data.Dataset:
    """Turn a dict of arrays into a dataset via ``Dataset.from_tensor_slices``."""
    dataset = tf.data.Dataset.from_tensor_slices(d)
    return dataset

        
# Array-ify the single sample user and the single flattened sample rating.
arrayed_user = values_to_array(user.dict())
arrayed_rating = values_to_array(rating.flatten())

# Repeat each sample 10000 times to obtain datasets of a usable size.
users = convert_dict_to_tflow(concatenate_dicts([arrayed_user] * 10000))
ratings = convert_dict_to_tflow(concatenate_dicts([arrayed_rating] * 10000))

## Data Manipulation for Embedding
We still need to generate embeddings for each tower.  We can use similar transformations as in the base tutorial for our data.

In [8]:
import pprint
# Pretty-print the first element of the ratings dataset as NumPy values...
for x in ratings.take(1).as_numpy_iterator():
    pprint.pprint(x)
# ...and the first element of the users dataset.
for u in users.take(1).as_numpy_iterator():
    pprint.pprint(u)

{'restaurant_genres': array([1, 2, 3, 4]),
 'restaurant_id': 1,
 'restaurant_rating': 10,
 'restaurant_title': b'skinnyfats',
 'restaurant_zip_code': 84104,
 'timestamp': 202201012200,
 'user_birth_date': 20220101,
 'user_gender': False,
 'user_genres': array([1, 2]),
 'user_id': 1001,
 'user_occupation': b'student',
 'user_zip_code': 84104}
{'user_birth_date': 20220101,
 'user_gender': False,
 'user_genres': array([1, 2]),
 'user_id': 1001,
 'user_occupation': b'student',
 'user_zip_code': 84104}


In [30]:
# Map categorical features to embeddings for models

def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Build an encoder for the categorical feature ``name``.

    A lookup layer (``StringLookup`` when ``dtype == 'string'``, otherwise
    ``IntegerLookup``) is adapted on that feature of ``dataset`` and chained
    with a ``CategoryEncoding`` layer sized to the learned vocabulary.

    Returns:
        A callable taking feature values and returning their encoding.
    """
    # Pick the lookup layer that matches the feature's type.
    lookup_cls = (
        tf.keras.layers.StringLookup
        if dtype == 'string'
        else tf.keras.layers.IntegerLookup
    )
    lookup = lookup_cls(max_tokens=max_tokens)

    # Learn the vocabulary from a dataset that yields only this feature.
    lookup.adapt(dataset.map(lambda row: row[name]))

    category_encoder = tf.keras.layers.CategoryEncoding(
        num_tokens=lookup.vocabulary_size())

    def encode(feature):
        # Value -> integer index -> category encoding. The closure keeps both
        # layers alive so they can be reused in a Keras model later.
        return category_encoder(lookup(feature))

    return encode


def get_text_tokenization_layer(name, dataset, max_features=1000, max_len=12):
    """Build a tokenizer for the text feature ``name``.

    A ``TextVectorization`` layer (integer output, at most ``max_features``
    tokens, sequences of length ``max_len``) is adapted on that feature of
    ``dataset`` in batches of 64 and wrapped in a small Sequential model.

    Returns:
        A callable that runs ``model.predict`` on the given feature values.
    """
    tokenizer = tf.keras.layers.TextVectorization(
        max_tokens=max_features,
        output_mode='int',
        output_sequence_length=max_len)

    # Learn the vocabulary from a dataset that yields only this feature.
    tokenizer.adapt(dataset.map(lambda row: row[name]).batch(64))

    # Explicit string input followed by the vectorization layer.
    model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(1,), dtype=tf.string),
        tokenizer,
    ])

    def tokenize(feature):
        return model.predict(feature)

    return tokenize

    
# Encoder for the user's occupation, fitted on the users dataset.
user_occ_layer = get_category_encoding_layer(
    name='user_occupation', dataset=users, dtype='string')
# Tokenizer for restaurant titles, fitted on the ratings dataset.
rest_title_layer = get_text_tokenization_layer(
    name='restaurant_title', dataset=ratings)

In [31]:
# Tokenize two sample titles. The only title in `ratings` is 'skinnyfats', so
# neither word was seen during adapt() -- presumably why both rows come back
# as index 1 followed by padding zeros.
rest_title_layer(['text', 'test'])



array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [14]:
# Embedding Layers
# Width (number of dimensions) intended for the embedding vectors of the towers.
embedding_dimension = 64

### Query Tower

Given a user (or a set of users), the query tower yields the corresponding embedding.

In [None]:
# NOTE(review): `unique_occupations` is not defined in the visible cells; it
# must hold the occupation vocabulary before this cell can run.
user_model = tf.keras.Sequential([
    # Occupation string -> integer index (no mask token is reserved).
    tf.keras.layers.StringLookup(
        vocabulary=unique_occupations, mask_token=None),
    # One extra row for the out-of-vocabulary index that StringLookup emits.
    tf.keras.layers.Embedding(len(unique_occupations) + 1, embedding_dimension),
])