In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [3]:
# Load the raw search log; later cells read its 'user_id' and 'search' columns.
search_history = pd.read_csv(filepath_or_buffer='expanded_search_history.csv')

In [4]:
# `.cat.codes` encodes a missing value as -1, which is not a valid row index
# for the Embedding layers that consume these ids. Drop rows lacking either
# key first so every code falls in 0..n_categories-1. `.copy()` gives the
# column assignments below a frame of their own to write into.
search_history = search_history.dropna(subset=['user_id', 'search']).copy()

# Map each distinct search term and each distinct user to a dense integer id.
search_history['item_id'] = search_history['search'].astype('category').cat.codes
search_history['user_id'] = search_history['user_id'].astype('category').cat.codes

In [5]:
# Full-table id arrays; used here to report the largest encoded ids.
user_ids = search_history['user_id'].to_numpy()
item_ids = search_history['item_id'].to_numpy()

print(f"Max user_id: {user_ids.max()}, Max item_id: {item_ids.max()}")

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(
    search_history,
    test_size=0.2,
    random_state=30,
)

# Per-split id arrays that feed the tf.data pipelines.
train_user_ids, train_item_ids = (
    train_data[id_column].to_numpy() for id_column in ('user_id', 'item_id')
)
test_user_ids, test_item_ids = (
    test_data[id_column].to_numpy() for id_column in ('user_id', 'item_id')
)

Max user_id: 6486, Max item_id: 19


In [6]:
# Embedding table sizes: the number of distinct ids plus one spare row.
num_users = 1 + search_history['user_id'].nunique()
num_items = 1 + search_history['item_id'].nunique()
print(f"Number of users: {num_users}, Number of items: {num_items}")

# How often each user / item appears in the training split.
for id_column in ('user_id', 'item_id'):
    print(train_data[id_column].value_counts())

keras_layers = tf.keras.layers

# One scalar id per example for each of the two inputs.
input_users = keras_layers.Input(shape=(1,), name='user_input')
input_items = keras_layers.Input(shape=(1,), name='item_input')

# 16-dimensional learned vector per user id and per item id.
embed_users = keras_layers.Embedding(
    input_dim=num_users, output_dim=16, name='user_embedding'
)(input_users)
embed_items = keras_layers.Embedding(
    input_dim=num_items, output_dim=16, name='item_embedding'
)(input_items)

flat_users = keras_layers.Flatten()(embed_users)
flat_items = keras_layers.Flatten()(embed_items)

concat = keras_layers.Concatenate()([flat_users, flat_items])

# Two hidden stages (8 then 4 units), each: L2-regularised ReLU Dense ->
# BatchNormalization -> Dropout(0.5). Layers are created in the same order as
# a hand-unrolled version, so auto-generated layer names are unchanged.
hidden = concat
for units in (8, 4):
    dense = keras_layers.Dense(
        units,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
    )(hidden)
    dense = keras_layers.BatchNormalization()(dense)
    dropout = keras_layers.Dropout(0.5)(dense)
    hidden = dropout

# Single sigmoid unit: a score in (0, 1) for the (user, item) pair.
output = keras_layers.Dense(1, activation='sigmoid')(hidden)

model = tf.keras.Model(inputs=[input_users, input_items], outputs=output)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Number of users: 6488, Number of items: 21
user_id
227     8
5217    6
1607    6
3268    6
903     6
       ..
2618    1
6136    1
896     1
2125    1
3303    1
Name: count, Length: 5619, dtype: int64
item_id
16    471
14    434
17    429
1     428
5     422
18    421
19    420
0     419
8     412
11    410
4     407
10    406
12    403
7     401
3     400
9     400
2     394
15    389
13    387
6     387
Name: count, dtype: int64
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 user_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 item_input (InputLayer)     [(None, 1)]                  0         []                            
                                                       

In [7]:
# Every observed (user, item) row is a positive example. Training a sigmoid /
# binary-crossentropy model on positives alone lets it minimise the loss by
# always predicting 1, and makes accuracy meaningless. Each positive is
# therefore paired with one sampled item the same user has NOT searched for,
# labelled 0.
NEGATIVE_SAMPLING_SEED = 30


def _sample_negatives(users, observed_keys, n_items, rng, max_rounds=20):
    """Draw one unobserved item id for each entry of `users`.

    Args:
        users: 1-D array of encoded user ids (one negative is drawn per entry).
        observed_keys: array of `user * n_items + item` keys for every
            (user, item) pair present in the data; these are never returned.
        n_items: item ids are drawn uniformly from `range(n_items)`.
        rng: `numpy.random.Generator` used for all draws.
        max_rounds: number of resampling passes for draws that hit an
            observed pair. Draws still colliding afterwards (e.g. a user who
            searched every item) are dropped instead of looping forever.

    Returns:
        `(neg_users, neg_items)` int64 arrays of equal length, at most
        `len(users)` long.
    """
    users = np.asarray(users, dtype=np.int64)
    items = rng.integers(0, n_items, size=len(users))
    for _ in range(max_rounds):
        clash = np.isin(users * n_items + items, observed_keys)
        if not clash.any():
            break
        # Redraw only the candidates that collided with an observed pair.
        items[clash] = rng.integers(0, n_items, size=int(clash.sum()))
    unseen = ~np.isin(users * n_items + items, observed_keys)
    return users[unseen], items[unseen]


def _labelled_pairs(users, items, observed_keys, n_items, rng):
    """Return `(features, labels)`: the given positives plus sampled negatives.

    `features` is a dict keyed by the model's input names; `labels` holds 1.0
    for the original rows followed by 0.0 for the sampled negatives.
    """
    neg_users, neg_items = _sample_negatives(users, observed_keys, n_items, rng)
    features = {
        'user_input': np.concatenate([users, neg_users]),
        'item_input': np.concatenate([items, neg_items]),
    }
    labels = np.concatenate([np.ones(len(users)), np.zeros(len(neg_users))])
    return features, labels


_n_items = int(search_history['item_id'].max()) + 1
# Keys are built from the full table so a negative drawn for one split can
# never be a positive of the other split.
_observed_keys = np.unique(
    search_history['user_id'].to_numpy(dtype=np.int64) * _n_items
    + search_history['item_id'].to_numpy(dtype=np.int64)
)
_neg_rng = np.random.default_rng(NEGATIVE_SAMPLING_SEED)

train_features, train_labels = _labelled_pairs(
    train_user_ids, train_item_ids, _observed_keys, _n_items, _neg_rng
)
test_features, test_labels = _labelled_pairs(
    test_user_ids, test_item_ids, _observed_keys, _n_items, _neg_rng
)

train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
# Positives and negatives are concatenated back to back, so the shuffle buffer
# must span the whole dataset; a smaller buffer would yield single-class batches.
train_dataset = train_dataset.shuffle(buffer_size=max(len(train_labels), 1)).batch(32)

test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels))
test_dataset = test_dataset.batch(32)

history = model.fit(train_dataset, epochs=10, validation_data=test_dataset)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
