In [156]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, KBinsDiscretizer

In [3]:
books = pd.read_parquet("books_data/books.parquet")
users = pd.read_parquet("books_data/users.parquet")
ratings = pd.read_parquet("books_data/ratings.parquet")

In [4]:
users.dtypes

user_id                 int64
location               object
age                   float64
city                   object
state                  object
country                object
signup_date    datetime64[ns]
dtype: object

In [41]:
books["Category"].nunique()

6447

In [5]:
books.head(10)

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,Summary,Language,Category,created_at
948518,0000913154,The Way Things Work: An Illustrated Encycloped...,C. van Amerongen (translator),1967,Simon & Schuster,"Scientific principles, inventions, and chemica...",en,Technology & Engineering,1967-01-01 12:00:00
826650,0001055607,Cereus Blooms At Night,Shani Mootoo,1994,Fairmount Books Ltd Remainders,"When Mala, old and notoriously crazy, arrives ...",en,Adult child abuse victims,1994-01-01 12:00:00
908526,0001061127,CHESS FOR YOUNG BEGINNERS,William T. McLeod,1975,HarperCollins Publishers,A step by step guide to playing chess,en,Chess,1975-01-01 12:00:00
1025127,0001374362,When It's Time for Bed (Collins Baby & Toddler...,Nick Butterworth,1994,Collins,Shows baby and his animal friends preparing fo...,en,Animals,1994-01-01 12:00:00
935227,0001711253,The Big Honey Hunt,Stan Berenstein,1942,HarperCollins Publishers,Father Bear takes Small Bear on a honey hunt. ...,en,Bears,1942-01-01 12:00:00
1028471,000171421X,It's Not Easy Being a Bunny (A Beginner Book),Marilyn Sadler,1984,HarperCollins Publishers,P.J. Funnybunny did not like being a bunny.,en,Animals,1984-01-01 12:00:00
907750,000184251X,February's Road,John Verney,1987,HarperCollins Publishers,The new road is to go right through the Callen...,en,"Children's stories, English",1987-01-01 12:00:00
917054,0001850121,A place called Lantern Light,Ellen Miller,1975,Collins,They were nearing the end of their long journey.,en,Children's stories,1975-01-01 12:00:00
907137,0001856367,The Collins Book of Ballet and Dance,Jean Ure,1996,HarperCollins Publishers,A collection of short stories about young peop...,en,Ballet,1996-01-01 12:00:00
999997,0001935968,The Brambly Hedge Treasury,Jill Barklem,1991,HarperCollins Publishers,For this is the home of mice of Brambly Hills&...,en,Country life,1991-01-01 12:00:00


In [6]:
ratings.dtypes

user_id                      int64
isbn                        object
rating                       int64
rating_timestamp    datetime64[ns]
dtype: object

In [7]:
from feast import FeatureStore

In [119]:
store = FeatureStore(repo_path="feast_repo/feature_repo")

In [213]:
ratings.groupby("user_id").()

Unnamed: 0_level_0,isbn,rating,rating_timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,0195153448,0,2022-08-04 16:03:16.862
8,0002005018,5,2020-08-15 16:00:36.000
9,0440234743,0,2022-09-02 01:44:06.535
10,1841721522,0,2020-07-04 18:18:19.970
12,1879384493,10,2020-07-23 04:19:17.666
...,...,...,...
278846,0060809833,8,2020-03-06 11:36:34.759
278849,0445210214,0,2020-05-11 14:32:33.494
278851,1558531025,8,2020-07-28 17:31:12.181
278852,0449907597,8,2021-03-15 17:04:45.672


In [121]:
ratings["rating_timestamp"].max()

Timestamp('2023-01-01 23:59:07.561000')

In [122]:
ratings["rating_timestamp"].min()

Timestamp('2020-01-01 00:01:29.484000')

In [333]:
features_raw = store.get_historical_features(entity_df=ratings, features=store.get_feature_service("model_v1")).to_df()
features = features_raw

Using rating_timestamp as the event timestamp. To specify a column explicitly, please name it event_timestamp.


In [334]:
features.head(10)

Unnamed: 0,user_id,isbn,rating,rating_timestamp,book_title,book_author,year_of_publication,publisher,Summary,Language,Category,location,age,city,state,country
0,151265,0425045927,9,2022-08-24 07:44:02.092000+00:00,,,,,,,,"las vegas, nevada, usa",67.0,las vegas,nevada,usa
1,151265,067976402X,0,2021-03-16 04:18:45.561000+00:00,Snow Falling on Cedars,David Guterson,1995.0,Vintage Books USA,"In 1954, Ishmael Chambers, a local reporter wh...",en,Fiction,"las vegas, nevada, usa",67.0,las vegas,nevada,usa
2,265834,0802133908,0,2020-12-14 00:00:56.926000+00:00,Pedro Paramo,Juan Rulfo,1994.0,Grove Press,"In one such village of the mind, Comala, he se...",en,Fiction,"madrid, madrid, spain",51.0,madrid,madrid,spain
3,265834,0140153195,0,2022-09-12 03:21:35.054000+00:00,Justine (Alexandria Quartet),Lawrence Durrell,1991.0,Penguin Books,On the eve of World War II in the Egyptian cit...,en,Fiction,"madrid, madrid, spain",51.0,madrid,madrid,spain
4,265834,8483062461,0,2021-08-18 15:15:56.585000+00:00,,,,,,,,"madrid, madrid, spain",51.0,madrid,madrid,spain
5,265834,8408022563,0,2020-04-06 19:22:17.430000+00:00,,,,,,,,"madrid, madrid, spain",51.0,madrid,madrid,spain
6,265834,8420478725,0,2020-04-02 12:49:01.442000+00:00,,,,,,,,"madrid, madrid, spain",51.0,madrid,madrid,spain
7,265834,0141180633,0,2020-05-24 13:49:51.878000+00:00,Vineland,Thomas Pynchon,1997.0,Penguin Books,Zoyd Wheeler&#39;s old nemesis arrives in Vine...,en,Fiction,"madrid, madrid, spain",51.0,madrid,madrid,spain
8,265834,0345354907,0,2021-08-23 15:53:38.991000+00:00,,,,,,,,"madrid, madrid, spain",51.0,madrid,madrid,spain
9,265834,2070360342,0,2021-06-24 22:38:04.729000+00:00,Les Caves Du Vatican,AndrÃ© Gide,1994.0,Folio,Avec cette &quot;sotie&quot; de 1914 qui scand...,fr,Fiction,"madrid, madrid, spain",51.0,madrid,madrid,spain


In [335]:
encoder = OrdinalEncoder()
# transform data
features["Category"] = encoder.fit_transform(features[["Category"]])
features["Language"] = encoder.fit_transform(features[["Language"]])
features["state"] = encoder.fit_transform(features[["state"]])
features["city"] = encoder.fit_transform(features[["city"]])
features["country"] = encoder.fit_transform(features[["country"]])
features["book_author"] = encoder.fit_transform(features[["book_author"]])
features["publisher"] = encoder.fit_transform(features[["publisher"]])

In [336]:
one_hot = OneHotEncoder(min_frequency=0.01, handle_unknown='ignore')
transformed = one_hot.fit_transform(features[["Category", "Language", "state", "city", "country", "book_author", "publisher"]]).astype("int64").toarray()
ohe_df = pd.DataFrame(transformed, columns=one_hot.get_feature_names_out())
ohe_df.head(20)

Unnamed: 0,Category_1257.0,Category_3538.0,Category_4497.0,Category_nan,Category_infrequent_sklearn,Language_6.0,Language_nan,Language_infrequent_sklearn,state_9.0,state_67.0,...,publisher_744.0,publisher_747.0,publisher_917.0,publisher_3293.0,publisher_5715.0,publisher_5920.0,publisher_6871.0,publisher_8038.0,publisher_nan,publisher_infrequent_sklearn
0,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [337]:
features = pd.concat([features, ohe_df], axis=1)

In [338]:
discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
features["age_binned"] = discretizer.fit_transform(features[["age"]]).astype("int64")

In [339]:
features = features.sort_values(['user_id', 'rating_timestamp'], ascending=[True, True])

In [340]:
features = features.set_index(keys=['user_id', 'rating_timestamp'])

In [363]:
features.groupby(by="user_id", group_keys=True)["Category"].shift(1).rolling(1).sum().rename("last_category").reset_index()

Unnamed: 0,index,last_category
0,0,
1,1,
2,2,
3,3,3538.0
4,4,103.0
...,...,...
1030477,1030477,
1030478,1030478,3538.0
1030479,1030479,3538.0
1030480,1030480,3538.0


In [362]:
features["last_category"] = features.groupby(by="user_id", group_keys=True)["Category"].shift(1).rolling(3).agg(lambda rows: rows[2]).rename("last_category")
features["last_2_category"] = features.groupby(by="user_id", group_keys=True)["Category"].shift(1).rolling(3).agg(lambda rows: rows[1]).rename("last_2_category")
features["last_3_category"] = features.groupby(by="user_id", group_keys=True)["Category"].shift(1).rolling(3).agg(lambda rows: rows[0]).rename("last_3_category")

KeyError: 2

In [343]:
features[["Category", "last_category"]].head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Category,last_category
user_id,rating_timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1
2,2022-08-04 16:03:16.862000+00:00,5999.0,
8,2020-08-04 07:22:59.858000+00:00,,
8,2020-08-04 12:31:50.798000+00:00,3538.0,
8,2020-08-15 16:00:36+00:00,103.0,3538.0
8,2020-11-03 11:22:49.083000+00:00,,103.0
8,2020-12-04 10:12:55.667000+00:00,,
8,2020-12-05 03:56:53.049000+00:00,4111.0,
8,2021-01-08 18:51:55.942000+00:00,,4111.0
8,2021-07-15 02:44:56.907000+00:00,3538.0,
8,2021-08-16 16:01:37.436000+00:00,5090.0,3538.0


In [348]:
features = features.reset_index()

In [349]:
# Instantiation
train_X_all = features[features["rating_timestamp"] <= "2022-01-01"]
train_Y = train_X_all["rating"]
train_X = train_X_all.drop(columns=["rating", "isbn", "rating_timestamp", "age", "state", "city"])

test_X_all = features[features["rating_timestamp"] > "2022-01-01"]
test_Y = test_X_all["rating"]
test_X = test_X_all.drop(columns=["rating", "isbn", "rating_timestamp", "age", "state", "city"])

In [198]:
books["created_at"].min()

Timestamp('1901-01-01 12:00:00')

In [199]:
books[books["isbn"] == "0887841740"]

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,Summary,Language,Category,created_at


In [350]:
# train_features["Category"] = train_features["Category"].astype("category")
# train_features["Language"] = train_features["Language"].astype("category")
# train_features["state"] = train_features["state"].astype("category")
# train_features["city"] = train_features["city"].astype("category")
# train_features["country"] = train_features["country"].astype("category")
# train_features["book_author"] = train_features["book_author"].astype("category")
# train_features["publisher"] = train_features["publisher"].astype("category")
train_features = train_X.drop(columns=["book_title", "Summary", "location"])

In [355]:
# test_features["Category"] = test_features["Category"].astype("category")
# test_features["Language"] = test_features["Language"].astype("category")
# test_features["state"] = test_features["state"].astype("category")
# test_features["city"] = test_features["city"].astype("category")
# test_features["country"] = test_features["country"].astype("category")
# test_features["book_author"] = test_features["book_author"].astype("category")
# test_features["publisher"] = test_features["publisher"].astype("category")
test_features = test_X.drop(columns=["book_title", "Summary", "location"])

In [202]:
test_features.dtypes

user_id                           int64
year_of_publication             float64
Category_1257.0                   int64
Category_3538.0                   int64
Category_4497.0                   int64
                                 ...   
publisher_6871.0                  int64
publisher_8038.0                  int64
publisher_nan                     int64
publisher_infrequent_sklearn      int64
age_binned                        int64
Length: 67, dtype: object

In [358]:
test_features

Unnamed: 0,user_id,book_author,year_of_publication,publisher,Language,Category,country,Category_1257.0,Category_3538.0,Category_4497.0,...,publisher_917.0,publisher_3293.0,publisher_5715.0,publisher_5920.0,publisher_6871.0,publisher_8038.0,publisher_nan,publisher_infrequent_sklearn,age_binned,last_category
0,2,38792.0,2002.0,5505.0,6.0,5999.0,383.0,0,0,0,...,0,0,0,0,0,0,0,1,1,
11,8,20401.0,1999.0,2621.0,6.0,4843.0,64.0,0,0,0,...,0,0,0,0,0,0,0,1,3,2837.0
12,8,11818.0,1988.0,2471.0,6.0,3538.0,64.0,0,1,0,...,0,0,0,0,0,0,0,1,3,4843.0
13,8,,,,,,64.0,0,0,0,...,0,0,0,0,0,0,1,0,3,3538.0
14,8,7448.0,1991.0,3353.0,6.0,10.0,64.0,0,0,0,...,0,0,0,0,0,0,0,1,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1030469,278851,24255.0,1986.0,8392.0,6.0,4497.0,383.0,0,0,1,...,0,0,0,0,0,0,0,1,2,
1030470,278851,,,,,,383.0,0,0,0,...,0,0,0,0,0,0,1,0,2,4497.0
1030471,278851,38880.0,1994.0,747.0,6.0,3538.0,383.0,0,1,0,...,0,0,0,0,0,0,0,0,2,
1030472,278851,,,,,,383.0,0,0,0,...,0,0,0,0,0,0,1,0,2,3538.0


In [352]:
train_features

Unnamed: 0,user_id,book_author,year_of_publication,publisher,Language,Category,country,Category_1257.0,Category_3538.0,Category_4497.0,...,publisher_917.0,publisher_3293.0,publisher_5715.0,publisher_5920.0,publisher_6871.0,publisher_8038.0,publisher_nan,publisher_infrequent_sklearn,age_binned,last_category
1,8,,,,,,64.0,0,0,0,...,0,0,0,0,0,0,1,0,3,
2,8,1984.0,1991.0,6146.0,6.0,3538.0,64.0,0,1,0,...,0,0,0,0,0,0,0,1,3,
3,8,48571.0,2001.0,3350.0,6.0,103.0,64.0,0,0,0,...,0,0,0,0,0,0,0,1,3,3538.0
4,8,,,,,,64.0,0,0,0,...,0,0,0,0,0,0,1,0,3,103.0
5,8,,,,,,64.0,0,0,0,...,0,0,0,0,0,0,1,0,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1030476,278854,,,,,,383.0,0,0,0,...,0,0,0,0,0,0,1,0,3,3538.0
1030477,278854,49290.0,1988.0,747.0,6.0,3538.0,383.0,0,1,0,...,0,0,0,0,0,0,0,0,3,
1030478,278854,56144.0,2000.0,744.0,6.0,3538.0,383.0,0,1,0,...,0,0,0,0,0,0,0,0,3,3538.0
1030479,278854,35199.0,1990.0,4032.0,6.0,3538.0,383.0,0,1,0,...,0,0,0,0,0,0,0,1,3,3538.0


In [353]:
xgb_v1 = xgb.XGBRegressor(objective='reg:squarederror', random_state=233)
 
# Fitting the model
xgb_v1.fit(train_features[["user_id", "age_binned"]], train_Y)

In [357]:
# Full model
xgb_r = xgb.XGBRegressor(objective='reg:squarederror', random_state=233)
 
# Fitting the model
xgb_r.fit(train_features, train_Y)

In [57]:
from sklearn.metrics import mean_squared_error as MSE

pred = xgb_r.predict(test_features)
# RMSE Computation
rmse = np.sqrt(MSE(test_Y, pred))
print("RMSE : % f" %(rmse))

RMSE :  3.540545


In [211]:
from sklearn.metrics import mean_squared_error as MSE

pred = xgb_v1.predict(test_features[["user_id", "age_binned"]])
# RMSE Computation
rmse = np.sqrt(MSE(test_Y, pred))
print("RMSE : % f" %(rmse))

RMSE :  3.578522


In [359]:
from sklearn.metrics import mean_squared_error as MSE

pred = xgb_r.predict(test_features)
# RMSE Computation
rmse = np.sqrt(MSE(test_Y, pred))
print("RMSE : % f" %(rmse))

RMSE :  3.577680


In [72]:
pred

array([3.118087 , 1.8542459, 2.3469923, ..., 3.317025 , 3.2924807,
       2.655191 ], dtype=float32)