In [1]:
# Import Tecton and other libraries
import logging
import os
import tecton
from dotenv import load_dotenv
import pandas as pd
import snowflake.connector
from datetime import datetime, timedelta
from pprint import pprint

# Read the variables defined in .env into the process environment.
load_dotenv()

# Only surface WARNING and above from the Snowflake client loggers.
for _snowflake_logger in ('snowflake.connector', 'snowflake.snowpark'):
    logging.getLogger(_snowflake_logger).setLevel(logging.WARNING)

# Credentials come from the environment; warehouse, database and schema are fixed for this demo.
connection_parameters = {
    "user": os.environ['SNOWFLAKE_USER'],
    "password": os.environ['SNOWFLAKE_PASSWORD'],
    "account": os.environ['SNOWFLAKE_ACCOUNT'],
    "warehouse": "DEMO_WH",
    # Tecton needs a database and a schema in which to create its temporary objects
    "database": "TECTON",
    "schema": "PUBLIC",
}
conn = snowflake.connector.connect(**connection_parameters)

# Hand the connection to Tecton: it will use it for all interactive queries.
tecton.snowflake_context.set_connection(conn)


# Quick helper function to query Snowflake from a notebook.
# It runs on the module-level `conn`; make sure that connection was created with
# the appropriate details for your own account.
def query_snowflake(query):
    """Execute a SQL statement on the shared connection and return its result.

    Args:
        query: SQL text to execute.

    Returns:
        Whatever the cursor's ``fetch_pandas_all()`` returns for the full
        result set (a pandas DataFrame with the Snowflake connector).
    """
    # Scope the cursor so it is closed even when execute/fetch raises;
    # the original one-liner left every cursor open.
    with conn.cursor() as cursor:
        return cursor.execute(query).fetch_pandas_all()

# Show which Tecton SDK build (version, git commit, build time) this notebook runs against.
tecton.version.summary()

Version: 99.99.99
Git Commit: 11e51bc73fa38baf0f86660959ced57e415ec010
Build Datetime: 2022-05-04T21:29:38


# Generate Training Data with Tecton

In [2]:
# Look up the demo workspace, then the feature service that is queried for
# historical features further down.
ws = tecton.get_workspace('apply-2022-demo')
fs = ws.get_feature_service('batch_movie_recommendations_feature_service')

In [3]:
# Preview the data directly
# The query selects the two ID columns, the rating and the event timestamp for a
# two-year window. The same SQL text is passed later as the spine of
# get_historical_features, so keep the column names in sync with that call.
ratings_query = """
SELECT 
    USER_ID,
    MOVIE_ID,
    RATING,
    TIMESTAMP
FROM 
    DEV_DAVID.MOVIELENS_25M.RATINGS
where timestamp between '2020-05-19 00:00:01' and '2022-05-19 23:59:59'
"""
# Materialise the result in pandas purely for inspection.
ratings = query_snowflake(ratings_query)
ratings.head(5)


Unnamed: 0,USER_ID,MOVIE_ID,RATING,TIMESTAMP
0,72315,96993,4,2021-08-09 13:50:01.775617
1,72315,97002,3,2021-02-13 16:11:27.775617
2,72315,97059,3,2021-02-10 06:42:10.775617
3,72315,97070,3,2021-02-10 06:41:36.775617
4,72315,97092,3,2022-03-17 12:20:53.775617


In [4]:
# Row count of the previewed ratings, i.e. the number of spine rows.
print(f"Size of training data: {len(ratings)}")

Size of training data: 2614105


In [5]:
# Pass the SQL text itself (not the previewed DataFrame) as the spine, with
# TIMESTAMP as the timestamp key, and pull the result into pandas.
# NOTE(review): presumably feature values are joined as of each row's TIMESTAMP —
# confirm against the Tecton documentation.
training_data = fs.get_historical_features(spine=ratings_query, timestamp_key="TIMESTAMP").to_pandas()
# Summarise the columns and dtypes of the joined frame.
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2614105 entries, 0 to 2614104
Data columns (total 41 columns):
 #   Column                                                Dtype         
---  ------                                                -----         
 0   MOVIE_ID                                              object        
 1   USER_ID                                               object        
 2   TIMESTAMP                                             datetime64[ns]
 3   RATING                                                int8          
 4   MOVIE_GENRE_INFO__IS_ACTION                           int8          
 5   MOVIE_GENRE_INFO__IS_ANIMATION                        int8          
 6   MOVIE_GENRE_INFO__IS_ADVENTURE                        int8          
 7   MOVIE_GENRE_INFO__IS_CHILDREN                         int8          
 8   MOVIE_GENRE_INFO__IS_COMEDY                           int8          
 9   MOVIE_GENRE_INFO__IS_CRIME                            int8          

# Train TabNet

TabNet is a deep learning architecture designed for tabular data. See the [paper](https://arxiv.org/abs/1908.07442) and the [PyTorch implementation](https://github.com/dreamquark-ai/tabnet) used below.

In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the exported training set, then replace every missing value with the
# sentinel -1.0.
training_data = pd.read_parquet('training_data/movielens_25m_5_5.parquet')
training_data = training_data.fillna(-1.)


In [3]:
# Collect the rating-history columns first, then cast each one to float64.
rating_history_columns = [
    name for name in training_data.columns if "RATING_HISTORY" in name
]
for name in rating_history_columns:
    training_data[name] = training_data[name].astype('float64')
training_data.head(5)

Unnamed: 0,MOVIE_ID,USER_ID,TIMESTAMP,RATING,MOVIE_GENRE_INFO__IS_ACTION,MOVIE_GENRE_INFO__IS_ANIMATION,MOVIE_GENRE_INFO__IS_ADVENTURE,MOVIE_GENRE_INFO__IS_CHILDREN,MOVIE_GENRE_INFO__IS_COMEDY,MOVIE_GENRE_INFO__IS_CRIME,...,USER_FANTASY_RATING_HISTORY__RATING_MEAN_730D_1D,USER_FILM_NOIR_RATING_HISTORY__RATING_MEAN_730D_1D,USER_HORROR_RATING_HISTORY__RATING_MEAN_730D_1D,USER_MUSICAL_RATING_HISTORY__RATING_MEAN_730D_1D,USER_MYSTERY_RATING_HISTORY__RATING_MEAN_730D_1D,USER_ROMANCE_RATING_HISTORY__RATING_MEAN_730D_1D,USER_SCI_FI_RATING_HISTORY__RATING_MEAN_730D_1D,USER_THRILLER_RATING_HISTORY__RATING_MEAN_730D_1D,USER_WAR_RATING_HISTORY__RATING_MEAN_730D_1D,USER_WESTERN_RATING_HISTORY__RATING_MEAN_730D_1D
0,54001,161460,2021-07-03 21:56:31.775617,5,0,0,1,0,0,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,68157,19887,2020-08-08 20:14:33.775617,4,1,0,0,0,0,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,148478,160243,2021-12-23 04:38:53.775617,3,0,1,0,0,1,0,...,3.625,-1.0,3.3,3.821429,3.5,3.584906,3.565217,3.428571,4.461538,3.4
3,1221,93658,2020-08-08 17:13:03.775617,4,0,0,0,0,0,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,138210,19886,2020-06-09 01:04:55.775617,3,0,0,0,0,0,0,...,4.126761,4.333333,3.942529,4.142857,4.25,4.234375,3.868421,3.904908,4.102564,4.181818


In [None]:
# preprocess_for_tabnet is a project-local helper (its source is not part of this notebook).
from preprocess import preprocess_for_tabnet
# Judging by how the results are used below: x and y are pandas objects (.values,
# x.columns), categorical_columns supports membership tests on column names, and
# categorical_dims is indexed by column name. TODO confirm against preprocess.py.
x, y, categorical_columns, categorical_dims = preprocess_for_tabnet(training_data, train=True)

In [None]:
# Hold out part of the data for evaluation. No test_size is passed, so
# scikit-learn's default split proportion applies.
# The fixed seed makes the split, and therefore the reported metrics,
# reproducible across kernel restarts; without it every run drew a different split.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=SPLIT_SEED)

# Convert the feature splits from pandas objects to their underlying arrays.
X_train = X_train.values
X_test = X_test.values

# Reshape the targets into single-column 2-D arrays, the form passed to clf.fit.
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

In [None]:
# Ordered list of model input columns; also pickled later as the model schema.
features = list(x.columns)

# (position, name) of every input column that is flagged as categorical.
categorical_features = [
    (position, name)
    for position, name in enumerate(features)
    if name in categorical_columns
]
cat_idxs = [position for position, _ in categorical_features]
cat_dims = [categorical_dims[name] for _, name in categorical_features]

# Embedding width per categorical column; the values are an arbitrary choice.
embedding_sizes = {
    "USER_ID": 64,
    "MOVIE_ID": 64,
}
cat_emb_dim = [embedding_sizes[name] for _, name in categorical_features]

In [None]:
# Build the regressor. The three cat_* lists are parallel: entry i of each one
# describes the same categorical column. Only n_steps and gamma are set
# explicitly; every other hyperparameter keeps the library default.
clf = TabNetRegressor(
    cat_dims=cat_dims, 
    cat_emb_dim=cat_emb_dim, 
    cat_idxs=cat_idxs,
    n_steps=2,
    gamma=1.5
)

In [None]:
# Train on the training split; eval_set / eval_name label the training and
# held-out splits that the listed metrics are computed on.
# NOTE(review): patience equals max_epochs, so early stopping presumably can never
# trigger before the epoch limit — confirm this is intended.
clf.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_name=['train', 'test'],
    eval_metric=['rmsle', 'mae', 'rmse', 'mse'],
    max_epochs=10,
    patience=10,
    batch_size=1024, virtual_batch_size=64,
    num_workers=0,
    drop_last=False,
) 


In [None]:
# save tabnet model
model_output_path = "models/batch_movie_recommender_5_9/batch_movie_recommender_5_9"
saved_path = clf.save_model(model_output_path)

# Persist the ordered feature-name list alongside the model. The file is opened
# in a with-block so the handle is flushed and closed; the original passed a
# bare open(...) to pickle.dump and never closed it.
with open('models/batch_movie_recommender_5_9/schema.p', 'wb') as schema_file:
    pickle.dump(features, schema_file)

In [None]:
# Pair each feature name with its importance score, then order the pairs from
# most to least important.
features_and_importances = [
    (features[position], score)
    for position, score in enumerate(clf.feature_importances_)
]
features_and_importances.sort(key=lambda pair: pair[1], reverse=True)
features_and_importances

In [None]:
# Write the ranked (feature, importance) pairs to disk. The with-block closes the
# file handle, which the original bare open(...) left open.
with open('feature_importances.p', 'wb') as importances_file:
    pickle.dump(features_and_importances, importances_file)