<a href="https://colab.research.google.com/github/vincm1/RecSys_Implicit/blob/master/Neural_Collaborative_Filtering_(NCF).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [217]:
!pip install tensorflow-ranking



# Neural Collaborative Filtering (NCF)

This notebook tries to implement a NCF based RecSys for implicit transaction data of IT Hard- and Software purchases. Based on 6 months transaction data.

In [218]:
import os
import datetime
import zipfile

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

# Data Preprocessing

In [219]:
df_zip = zipfile.ZipFile('/content/drive/MyDrive/RecSys/Orders_Nov22_Jun23.zip')
df = pd.read_csv(df_zip.open('Bericht 1.csv'), delimiter=";")

In [220]:
df.head()

Unnamed: 0,Order Nbr,Entry Date,Entry DateTime,FiscalMonth,BranchCustomerNbr,CustomerName,BusinessUnitLevel2,KDGroup,Sku,Product Descr1,Product Descr2,ProductGroup,ProductGroupMasterDescription,ProductGroupDescription,ProductGroup2ndDescription,Sales,Qty Shipped
0,1547606,01.11.22,,2022FM11,15515778,NET-S M. CHMIELEWSKI,Export Channel (DE),,9433B9X,INK CARTRIDGE SPS,BLACK 370ML 600 DPI INKJET BULK,1037,Consumables,Ink,Supplies,-1533,-1.0
1,1547615,01.11.22,,2022FM11,15509465,DIGITAL RIVER IRELAND LIMITED,Export Channel (DE),DIRL,CB31510,LENOVO KEYBOARD PACK,FOR TAB P11-DE,641,Input Devices,Keyboards & Keypads,Printers & Peripherals,-10461,-1.0
2,1547616,01.11.22,,2022FM11,15509465,DIGITAL RIVER IRELAND LIMITED,Export Channel (DE),DIRL,CE63791,TP L13 YOGA G3 R7P 5875U 16GB,512GB SSD 13.3 WUXGA W10PDG,11,Computer Systems,Portable/Notebook Computers,System,"-1.232,13",-1.0
3,1547617,01.11.22,,2022FM11,15509465,DIGITAL RIVER IRELAND LIMITED,Export Channel (DE),DIRL,CC36816,THINKBOOK 13S G3 R5 5600U 16GB,512GB SSD 13.3 WUXGA W11P,11,Computer Systems,Portable/Notebook Computers,System,-82473,-1.0
4,1547688,01.11.22,,2022FM11,15865338,DISTRELEC SCHWEIZ AG,Export Channel (DE),,J151410,USB2.0 A TO B CABLE 5M BLACK,M/M 100PCT COPPER CONDUCTOR .,1206,Cables,Usb Cables & Adapters,Printers & Peripherals,-660,-3.0


In [221]:
# converting the customerid to string
df["BranchCustomerNbr"] = df["BranchCustomerNbr"].astype(str)
# converting the skuid to string
df["Sku"] = df["Sku"].astype(str)
# drop retour shipment
df = df[df["Qty Shipped"]> 0]

In [222]:
# dropping backlog invoices
df = df[df["Entry Date"] >= '2022-10-01 00:00:00']

In [223]:
# implementing a purchase frequency counter helper
df.loc[:, "purchase"] = 1

In [224]:
# Create a numeric user_id and artist_id column
df['BranchCustomerNbr'] = df['BranchCustomerNbr'].astype("category")
df['Sku'] = df['Sku'].astype("category")
df['bcn_id'] = df['BranchCustomerNbr'].cat.codes
df['sku_id'] = df['Sku'].cat.codes

In [225]:
grouped_df = df.groupby(["bcn_id", "sku_id"]).agg({
    "Qty Shipped": "sum",
    "purchase": "sum"
}).reset_index()

In [226]:
grouped_df_binary = grouped_df[["bcn_id", "sku_id", "purchase"]].copy()
grouped_df_binary["purchase"] = 1
grouped_df_binary

Unnamed: 0,bcn_id,sku_id,purchase
0,0,2276,1
1,0,2277,1
2,0,2796,1
3,0,4417,1
4,0,5535,1
...,...,...,...
284193,9885,1816,1
284194,9885,28315,1
284195,9886,26057,1
284196,9887,28303,1


In [227]:
users = grouped_df_binary["bcn_id"].unique()
items = grouped_df_binary["sku_id"].unique()
print(len(users), len(items))

9889 49112


## LOOCV Test set

In [228]:
def train_test_split(df, holdout_num):
    """ perform training testing split

    @param df: dataframe
    @param holdhout_num: number of items to be held out per user as testing items

    @return df_train: training data
    @return df_test testing data

    """
    # perform deep copy to avoid modification on the original dataframe
    df_train = grouped_df_binary.copy(deep=True)
    df_test = grouped_df_binary.copy(deep=True)

    # get test set
    df_test = df_test.groupby(['bcn_id']).head(holdout_num).reset_index()

    # get train set
    df_train = df_train.merge(
        df_test[['bcn_id', 'sku_id']].assign(remove=1),
        how='left'
    ).query('remove != 1').drop('remove', 1).reset_index(drop=True)

    # drop index
    df_test = df_test.drop(columns=["index"])

    # sanity check to make sure we're not duplicating/losing data
    assert len(df) == len(df_train) + len(df_test)

    return df_train, df_test

In [229]:
df_train, df_test = train_test_split(grouped_df_binary, 1)

In [230]:
len(df_train[df_train["bcn_id"] == 0])

51

In [231]:
df_test

Unnamed: 0,bcn_id,sku_id,purchase
0,0,2276,1
1,1,1424,1
2,2,1908,1
3,3,17537,1
4,4,1083,1
...,...,...,...
9884,9884,43922,1
9885,9885,1816,1
9886,9886,26057,1
9887,9887,28303,1


## Negative samples

In [232]:
def negative_sampling(bcn_ids, sku_ids, items, n_neg):
    """This function creates n_neg negative labels for every positive label

    @param user_ids: list of user ids
    @param sku_ids: list of sku ids
    @param items: unique list of sku ids
    @param n_neg: number of negative labels to sample

    @return df_neg: negative sample dataframe

    """

    neg = []
    ui_pairs = zip(bcn_ids, sku_ids)
    records = set(ui_pairs)

    # for every positive label case
    for (u, i) in records:
        # generate n_neg negative labels
        for _ in range(n_neg):
            j = np.random.choice(items)
            # resample if the movie already exists for that user
            while (u, j) in records:
                j = np.random.choice(items)
            neg.append([u, j, 0])

    # convert to pandas dataframe for concatenation later
    df_neg = pd.DataFrame(neg, columns=['bcn_id', 'sku_id', 'purchase'])

    return df_neg

In [233]:
neg_train = negative_sampling(
    bcn_ids=df_train.bcn_id.unique(),
    sku_ids=df_train.sku_id.unique(),
    items=grouped_df_binary.sku_id.unique(),
    n_neg=5
)


In [234]:
len(items)

49112

In [235]:
neg_train.sort_values(by="bcn_id", ascending=True)

Unnamed: 0,bcn_id,sku_id,purchase
10940,0,31270,0
10943,0,38104,0
10942,0,40490,0
10944,0,2377,0
10941,0,43980,0
...,...,...,...
6558,9885,28626,0
6557,9885,35029,0
6559,9885,19809,0
6556,9885,19045,0


In [236]:
len(neg_train[neg_train["bcn_id"] == 0])

5

In [237]:
# create final training and testing sets
#df_train = df_train[['bcn_id', 'sku_id']].assign(purchase=1)
df_train = pd.concat([df_train, neg_train], ignore_index=True)

In [238]:
len(df_train[(df_train["bcn_id"] == 0) & (df_train["purchase"] == 0)])

5

In [239]:
len(df_train[(df_train["purchase"] == 1) & (df_train["bcn_id"] == 1)])

28

In [240]:
len(df_train[df_train["bcn_id"] == 1])

33

In [241]:
df_train[df_train.bcn_id == 0]

Unnamed: 0,bcn_id,sku_id,purchase
0,0,2277,1
1,0,2796,1
2,0,4417,1
3,0,5535,1
4,0,6557,1
5,0,7179,1
6,0,7182,1
7,0,7223,1
8,0,7262,1
9,0,8466,1


# NCF

In [242]:
from typing import List
import tensorflow as tf
import tensorflow_ranking as tfr

import tensorflow.keras as keras
from tensorflow.keras.layers import (
    Concatenate,
    Dense,
    Embedding,
    Flatten,
    Input,
    Multiply,
)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [243]:
TOP_K = 10
N_EPOCHS = 10

In [244]:
def create_ncf(
    number_of_users: int,
    number_of_items: int,
    latent_dim_mf: int = 4,
    latent_dim_mlp: int = 32,
    reg_mf: int = 0,
    reg_mlp: int = 0.01,
    dense_layers: List[int] = [8, 4],
    reg_layers: List[int] = [0.01, 0.01],
    activation_dense: str = "relu",
) -> keras.Model:

    # input layer
    user = Input(shape=(), dtype="int32", name="bcn_id")
    item = Input(shape=(), dtype="int32", name="sku_id")

    # embedding layers
    mf_user_embedding = Embedding(
        input_dim=number_of_users,
        output_dim=latent_dim_mf,
        name="mf_user_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mf),
        input_length=1,
    )
    mf_item_embedding = Embedding(
        input_dim=number_of_items,
        output_dim=latent_dim_mf,
        name="mf_item_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mf),
        input_length=1,
    )

    mlp_user_embedding = Embedding(
        input_dim=number_of_users,
        output_dim=latent_dim_mlp,
        name="mlp_user_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mlp),
        input_length=1,
    )
    mlp_item_embedding = Embedding(
        input_dim=number_of_items,
        output_dim=latent_dim_mlp,
        name="mlp_item_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mlp),
        input_length=1,
    )

    # MF vector
    mf_user_latent = Flatten()(mf_user_embedding(user))
    mf_item_latent = Flatten()(mf_item_embedding(item))
    mf_cat_latent = Multiply()([mf_user_latent, mf_item_latent])

    # MLP vector
    mlp_user_latent = Flatten()(mlp_user_embedding(user))
    mlp_item_latent = Flatten()(mlp_item_embedding(item))
    mlp_cat_latent = Concatenate()([mlp_user_latent, mlp_item_latent])

    mlp_vector = mlp_cat_latent

    # build dense layers for model
    for i in range(len(dense_layers)):
        layer = Dense(
            dense_layers[i],
            activity_regularizer=l2(reg_layers[i]),
            activation=activation_dense,
            name="layer%d" % i,
        )
        mlp_vector = layer(mlp_vector)

    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])

    result = Dense(
        1, activation="sigmoid", kernel_initializer="lecun_uniform", name="purchase"
    )

    output = result(predict_layer)

    model = Model(
        inputs=[user, item],
        outputs=[output],
    )

    return model

In [245]:
n_users, n_items = df_train.shape
ncf_model = create_ncf(n_users, n_items)

ncf_model.compile(
    optimizer=Adam(),
    loss="binary_crossentropy",
    metrics=[
        tf.keras.metrics.TruePositives(name="tp"),
        tf.keras.metrics.FalsePositives(name="fp"),
        tf.keras.metrics.TrueNegatives(name="tn"),
        tf.keras.metrics.FalseNegatives(name="fn"),
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall"),
        tf.keras.metrics.AUC(name="auc"),
        tfr.keras.metrics.MeanAveragePrecisionMetric(name="meanprecision")
    ],
)
ncf_model._name = "neural_collaborative_filtering"
ncf_model.summary()

Model: "neural_collaborative_filtering"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 bcn_id (InputLayer)            [(None,)]            0           []                               
                                                                                                  
 sku_id (InputLayer)            [(None,)]            0           []                               
                                                                                                  
 mlp_user_embedding (Embedding)  (None, 32)          9958368     ['bcn_id[0][0]']                 
                                                                                                  
 mlp_item_embedding (Embedding)  (None, 32)          96          ['sku_id[0][0]']                 
                                                                     

In [246]:
def make_tf_dataset(
    df: pd.DataFrame,
    targets: List[str],
    val_split: float = 0.1,
    batch_size: int = 512,
    seed=42,
):
    """Make TensorFlow dataset from Pandas DataFrame.
    :param df: input DataFrame - only contains features and target(s)
    :param targets: list of columns names corresponding to targets
    :param val_split: fraction of the data that should be used for validation
    :param batch_size: batch size for training
    :param seed: random seed for shuffling data - `None` won't shuffle the data"""

    n_val = round(df.shape[0] * val_split)
    if seed:
        # shuffle all the rows
        x = df.sample(frac=1, random_state=seed).to_dict("series")
    else:
        x = df.to_dict("series")
    y = dict()
    for t in targets:
        y[t] = x.pop(t)
    ds = tf.data.Dataset.from_tensor_slices((x, y))

    ds_val = ds.take(n_val).batch(batch_size)
    ds_train = ds.skip(n_val).batch(batch_size)
    return ds_train, ds_val

In [247]:
# create train and validation datasets
ds_train, ds_val = make_tf_dataset(df_train, ["purchase"])

In [248]:
%%time
# define logs and callbacks
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=2
)

train_hist = ncf_model.fit(
    ds_train,
    validation_data=ds_val,
    epochs=N_EPOCHS,
    callbacks=[tensorboard_callback, early_stopping_callback],
    verbose=1,
)

Epoch 1/10


Cause: Unable to locate the source code of <function Model.make_train_function.<locals>.train_function at 0x7cb4d30cf640>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


Cause: Unable to locate the source code of <function Model.make_train_function.<locals>.train_function at 0x7cb4d30cf640>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code

Cause: Unable to locate the source code of <function Model.make_test_function.<locals>.test_function at 0x7cb4cd5c6290>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


Cause: Unable to locate the source code of <function Model.make_test_function.<locals>.test_function at 0x7cb4cd5c6290>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
CPU times: user 1min 21s, sys: 19.1 s, total: 1min 40s
Wall time: 57.1 s


In [249]:
ds_test, _ = make_tf_dataset(df_test, ["purchase"], val_split=0, seed=None)
ncf_predictions = ncf_model.predict(ds_test)

Cause: Unable to locate the source code of <function Model.make_predict_function.<locals>.predict_function at 0x7cb4cd57c1f0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


Cause: Unable to locate the source code of <function Model.make_predict_function.<locals>.predict_function at 0x7cb4cd57c1f0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


In [250]:
ncf_predictions[1]

array([0.82334673], dtype=float32)