In [1]:
!pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd

In [4]:
# Define Encoder Classes
class UserEncoder(nn.Module):
    def __init__(self, user_feature_dim, embedding_dim):
        super(UserEncoder, self).__init__()

        # Architecture of User encoder
        self.fc = nn.Sequential(
            nn.Linear(user_feature_dim, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, embedding_dim))

    def forward(self, user_features):
        user_embedding = self.fc(user_features)

        # Normalize for similarity
        return F.normalize(user_embedding, p=2, dim=-1)

class BookEncoder(nn.Module):
    def __init__(self, book_feature_dim, embedding_dim):
        super(BookEncoder, self).__init__()

        # Architecture of Book Encoder
        self.fc = nn.Sequential(
            nn.Linear(book_feature_dim, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, embedding_dim))

    def forward(self, book_features):
        book_embedding = self.fc(book_features)

        # Normalize for similarity
        return F.normalize(book_embedding, p=2, dim=-1)

# Simply combine both into one class
class DualEncoder(nn.Module):
    def __init__(self, user_feature_dim, book_feature_dim, embedding_dim):
        super(DualEncoder, self).__init__()
        # Define both encoders
        self.user_encoder = UserEncoder(user_feature_dim, embedding_dim)
        self.book_encoder = BookEncoder(book_feature_dim, embedding_dim)

        # Perform forward pass on both
    def forward(self, user_features, book_features):
        user_embedding = self.user_encoder(user_features)
        book_embedding = self.book_encoder(book_features)
        return user_embedding, book_embedding


In [5]:
# Define loss function
class CosineSimilarityLoss(nn.Module):
    def __init__(self):
        super(CosineSimilarityLoss, self).__init__()

    def forward(self, user_embedding, book_embedding, labels):
        # Calculate the cosine similarity of the embeddings
        cosine_sim = F.cosine_similarity(user_embedding, book_embedding)
        # We want them to be similar i.e close to 1 - as we only include relevant pairs, the expected similarity is 1
        loss = F.mse_loss(cosine_sim, labels)
        return loss

In [6]:
# Data preparation - USERS
user_data = pd.read_csv('full_user_train_rep.csv')

user_data['Age'] = pd.to_numeric(user_data['Age'], errors='coerce')
# There was 12 users whose age could not be converted to a number
user_data_clean = user_data.dropna()

# Update column names for clarity

# range is 13580
new_columns = [f'User_{i}' for i in range(13581)]
correct_columns = ['User-ID'] + new_columns + ['Age']

user_data_clean.columns = correct_columns
# display(user_data_clean.head(10))

# Double each user - as we will have 2 relevant books per user
users_doubled = user_data_clean.loc[user_data_clean.index.repeat(2)].reset_index(drop=True)

# We don't want to keep the user id
no_user_id = users_doubled.drop(columns=['User-ID'])

# Final Users tensor to be passed during training
users_tensor = torch.tensor(no_user_id.to_numpy(), dtype=torch.float32)

  user_data = pd.read_csv('full_user_train_rep.csv')


In [7]:
# Data Preparation - Books
train_set_2 = pd.read_csv('train_set_2.csv')
# display(train_set_2)
books = pd.read_csv('Books_BX_10_5_FINAL.csv')

# Columns to drop
columns=["Title", "Image-URL-S", "Image-URL-M", "Image-URL-L", "Best-Image-URL"]

# Hash Author and Publisher

def hash_encode(value):
    return hash(value) % (10**9)

categorical_columns = ['Author', 'Publisher']

for col in categorical_columns:
    books[col] = books[col].astype(str).apply(hash_encode)

# Drop columns which won't be used in training at all
books_rep_visual = books.drop(columns=columns)

visual_feature_cols = [str(i) for i in range(2048)]
# Drop visual features
books_rep_basic = books_rep_visual.drop(columns=visual_feature_cols, errors="ignore")
# display(books_rep_basic.head(10))

# Connect the books to users
train_set_selected = train_set_2[['User-ID','ISBN']]
books_merged_basic = train_set_selected.merge(books_rep_basic, on='ISBN',how = 'inner')

all_data = user_data_clean[['User-ID']].merge(books_merged_basic, on='User-ID',how='inner')
# display(all_data.head(20))

# Drop column and create book tensor
books_no_ids = all_data.drop(columns = ['User-ID','ISBN'])
books_tensor =  torch.tensor(books_no_ids.to_numpy(), dtype=torch.float32)

# Check the shapes
print(users_tensor.shape)
print(books_tensor.shape)

torch.Size([3974, 13582])
torch.Size([3974, 387])


In [8]:
from torch.utils.data import DataLoader, TensorDataset
# Training
user_feature_dim = users_tensor.shape[1]
book_feature_dim = books_tensor.shape[1]
labels =torch.ones(users_tensor.shape[0])
num_epochs = 5


dataset = TensorDataset(users_tensor, books_tensor, torch.ones(users_tensor.shape[0], dtype=torch.float))
batch_size = 16
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)


embedding_dim = 256

model = DualEncoder(user_feature_dim, book_feature_dim, embedding_dim)
loss_fn = CosineSimilarityLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(num_epochs):
  for user_batch, book_batch, label_batch in data_loader:

    model.train()

    #Get embeddings
    usr_embd, book_embd = model(user_batch,book_batch)

    loss = loss_fn(usr_embd,book_embd,label_batch)

    # Backward propagation
    loss.backward()

    #Update parameters
    optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")



Epoch [1/5], Loss: 1.0569169521331787
Epoch [1/5], Loss: 0.8247098922729492
Epoch [1/5], Loss: 0.41860368847846985
Epoch [1/5], Loss: 0.2219589501619339
Epoch [1/5], Loss: 0.12274421006441116
Epoch [1/5], Loss: 0.07032165676355362
Epoch [1/5], Loss: 0.04257480800151825
Epoch [1/5], Loss: 0.027114387601614
Epoch [1/5], Loss: 0.0184348002076149
Epoch [1/5], Loss: 0.01329159177839756
Epoch [1/5], Loss: 0.010172995738685131
Epoch [1/5], Loss: 0.008261635899543762
Epoch [1/5], Loss: 0.00708016287535429
Epoch [1/5], Loss: 0.006351304240524769
Epoch [1/5], Loss: 0.005920032970607281
Epoch [1/5], Loss: 0.0056839738972485065
Epoch [1/5], Loss: 0.005578560754656792
Epoch [1/5], Loss: 0.00557099049910903
Epoch [1/5], Loss: 0.005633586086332798
Epoch [1/5], Loss: 0.005748359486460686
Epoch [1/5], Loss: 0.00588560663163662
Epoch [1/5], Loss: 0.0060531869530677795
Epoch [1/5], Loss: 0.006207178346812725
Epoch [1/5], Loss: 0.006404654588550329
Epoch [1/5], Loss: 0.006573192309588194
Epoch [1/5], Loss

In [9]:
# Prepare the data for LTR task
user_embd_train = []
book_embd_train = []

model.eval()
with torch.no_grad():
  # Same dataset
  for user_batch,book_batch,_ in data_loader:
    # Pass through the model
    user_embedding, book_embedding = model(user_batch, book_batch)

    user_embd_train.append(user_embedding.cpu().numpy())
    book_embd_train.append(book_embedding.cpu().numpy())

user_embeddings = np.concatenate(user_embd_train, axis=0)
book_embeddings = np.concatenate(book_embd_train, axis=0)

user_emb_df = pd.DataFrame(user_embeddings, columns=[f'User_{i}' for i in range(user_embeddings.shape[1])])
book_emb_df = pd.DataFrame(book_embeddings, columns=[f'Book_{i}' for i in range(user_embeddings.shape[1])])



In [10]:
# Connect ISBN and user Ids
user_emb_df['User-ID'] = users_doubled['User-ID']
book_emb_df['ISBN'] = all_data['ISBN']

# display(user_emb_df.head())
# display(book_emb_df.head())

# Merge on index

final_train = pd.concat([user_emb_df, book_emb_df], axis=1)

# ADD RELEVANCE OF 1
final_train['Relevance'] = 1


display(final_train[['ISBN','User-ID']].head())

final_train.to_csv('training_embeddings_basic_rel1.csv',index=False)



Unnamed: 0,ISBN,User-ID
0,037582345X,254
1,0380973634,254
2,0316769487,638
3,0385504209,638
4,2253043974,643


In [22]:
# Read 3 random books

irrelevant_train = pd.read_csv('train_sample_irrelevant_3.csv')
# display(irrelevant_train.head())

# Encode Author and Publisher

def hash_encode(value):
    return hash(value) % (10**9)

categorical_columns = ['Author', 'Publisher']

for col in categorical_columns:
    irrelevant_train[col] = irrelevant_train[col].astype(str).apply(hash_encode)

# Merge with User representation
full_data_merged = user_data_clean.merge(irrelevant_train,on='User-ID')
display(full_data_merged.head())

user_columns = user_data_clean.columns
book_columns = irrelevant_train.columns



Unnamed: 0,User-ID,User_0,User_1,User_2,User_3,User_4,User_5,User_6,User_7,User_8,...,Title_Embed_374,Title_Embed_375,Title_Embed_376,Title_Embed_377,Title_Embed_378,Title_Embed_379,Title_Embed_380,Title_Embed_381,Title_Embed_382,Title_Embed_383
0,254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.035193,-0.039807,-0.027373,-0.026081,-0.0373,-0.010246,0.084221,-0.020611,-0.007792,-0.019594
1,254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.008007,-0.025034,0.000346,0.043401,0.014144,-0.023983,-0.055745,0.08068,0.026684,0.006322
2,254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.005636,0.062488,0.010356,-0.004994,0.039333,-0.040253,0.063151,0.014033,0.014097,-0.029388
3,638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.021229,-0.032242,-0.009617,0.020536,-0.070838,-0.021525,0.039163,-0.016212,-0.089582,0.061062
4,638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.000118,0.002336,0.026127,0.010337,-0.156506,-0.011518,0.041648,-0.016254,0.022333,0.048653


In [23]:
user_info = full_data_merged[user_columns]
no_id_user = user_info.drop(columns=['User-ID'])

user_irr_tensor = torch.tensor(no_id_user.to_numpy(), dtype=torch.float32)


torch.Size([5961, 13582])


In [29]:
book_info = full_data_merged[book_columns]

book_no_id = book_info.drop(columns=['ISBN','User-ID'])
display(book_no_id.head())

book_irr_tensor = torch.tensor(book_no_id.to_numpy(), dtype=torch.float32)

Unnamed: 0,Author,Year,Publisher,Title_Embed_0,Title_Embed_1,Title_Embed_2,Title_Embed_3,Title_Embed_4,Title_Embed_5,Title_Embed_6,...,Title_Embed_374,Title_Embed_375,Title_Embed_376,Title_Embed_377,Title_Embed_378,Title_Embed_379,Title_Embed_380,Title_Embed_381,Title_Embed_382,Title_Embed_383
0,734294063,2000,419680172,-0.071472,-0.017195,-0.022739,0.044248,-0.087759,0.019732,0.047838,...,-0.035193,-0.039807,-0.027373,-0.026081,-0.0373,-0.010246,0.084221,-0.020611,-0.007792,-0.019594
1,402379821,2000,706369737,-0.123304,-0.016257,0.006948,-0.025575,-0.040283,-0.007436,0.068902,...,-0.008007,-0.025034,0.000346,0.043401,0.014144,-0.023983,-0.055745,0.08068,0.026684,0.006322
2,330192281,1996,272862081,0.023019,0.018677,0.06494,0.056871,-0.121409,0.022455,0.061694,...,-0.005636,0.062488,0.010356,-0.004994,0.039333,-0.040253,0.063151,0.014033,0.014097,-0.029388
3,43050484,1997,351448452,-0.035431,0.112442,0.057332,0.084723,0.005372,-0.008475,0.010908,...,0.021229,-0.032242,-0.009617,0.020536,-0.070838,-0.021525,0.039163,-0.016212,-0.089582,0.061062
4,621702691,1998,729430617,-0.042678,0.053652,-0.027692,0.003111,-0.086905,0.025198,0.081422,...,-0.000118,0.002336,0.026127,0.010337,-0.156506,-0.011518,0.041648,-0.016254,0.022333,0.048653


In [30]:
print(user_irr_tensor.shape)
print(book_irr_tensor.shape)

torch.Size([5961, 13582])
torch.Size([5961, 387])


In [31]:
# Prepare the data for LTR task
user_embd_train_irr = []
book_embd_train_irr = []


dataset_irr = TensorDataset(user_irr_tensor, book_irr_tensor, torch.zeros(user_irr_tensor.shape[0], dtype=torch.float))
batch_size = 16
data_loader_irr = DataLoader(dataset_irr, batch_size=batch_size, shuffle=False)

model.eval()
with torch.no_grad():
  # Same dataset
  for user_batch,book_batch,_ in data_loader_irr:
    # Pass through the model
    user_embedding, book_embedding = model(user_batch, book_batch)

    user_embd_train_irr.append(user_embedding.cpu().numpy())
    book_embd_train_irr.append(book_embedding.cpu().numpy())

user_embeddings_irr = np.concatenate(user_embd_train_irr, axis=0)
book_embeddings_irr = np.concatenate(book_embd_train_irr, axis=0)

user_emb_df_irr = pd.DataFrame(user_embeddings_irr, columns=[f'User_{i}' for i in range(user_embeddings_irr.shape[1])])
book_emb_df_irr = pd.DataFrame(book_embeddings_irr, columns=[f'Book_{i}' for i in range(book_embeddings_irr.shape[1])])



In [39]:
# Connect ISBN and user Ids
user_emb_df_irr['User-ID'] = user_info['User-ID']
book_emb_df_irr['ISBN'] = book_info['ISBN']

# display(user_emb_df_irr.head())
# display(book_emb_df_irr.head())

# Merge on index

final_train_irr = pd.concat([user_emb_df_irr, book_emb_df_irr], axis=1)

# ADD RELEVANCE OF 1
final_train_irr['Relevance'] = 0


display(final_train_irr[['ISBN','User-ID']].head())

final_train_irr.to_csv('training_embeddings_basic_rel0.csv',index=False)

Unnamed: 0,ISBN,User-ID
0,0440224675,254
1,0671042858,254
2,039914255X,254
3,039304016X,638
4,0385490992,638


In [42]:
# Concatenate both irrelevant and relevant books for training
final_train = pd.read_csv('training_embeddings_basic_rel1.csv')

training_basic_final = pd.concat([final_train_irr,final_train],axis=0).sort_values(by='User-ID')
display(training_basic_final[['User-ID','Relevance',]].head(20))
training_basic_final.to_csv('training_final_basic_embd.csv',index=False)

Unnamed: 0,User-ID,Relevance
0,254,0
1,254,0
2,254,0
1,254,1
0,254,1
5,638,0
2,638,1
3,638,1
4,638,0
3,638,0


In [45]:
# Data preparation - USERS TEST
user_data_test = pd.read_csv('full_user_test_rep.csv')

user_data_test['Age'] = pd.to_numeric(user_data_test['Age'], errors='coerce')
# There was 12 users whose age could not be converted to a number
user_data_test_clean = user_data_test.dropna()

# Update column names for clarity

# range is 13580
new_columns = [f'User_{i}' for i in range(13581)]
correct_columns = ['User-ID'] + new_columns + ['Age']

user_data_test_clean.columns = correct_columns
# display(user_data_clean.head(10))

# Double each user - as we will have 2 relevant books per user
users_doubled_test = user_data_test_clean.loc[user_data_test_clean.index.repeat(2)].reset_index(drop=True)

# We don't want to keep the user id
no_user_id_test = users_doubled_test.drop(columns=['User-ID'])

# Final Users tensor to be passed during training
users_tensor_test = torch.tensor(no_user_id_test.to_numpy(), dtype=torch.float32)


  user_data_test = pd.read_csv('full_user_test_rep.csv')


In [47]:
# Data Preparation - Books
test_set_2 = pd.read_csv('test_set_2.csv')
# display(train_set_2)
books = pd.read_csv('Books_BX_10_5_FINAL.csv')

# Columns to drop
columns=["Title", "Image-URL-S", "Image-URL-M", "Image-URL-L", "Best-Image-URL"]

# Hash Author and Publisher

def hash_encode(value):
    return hash(value) % (10**9)

categorical_columns = ['Author', 'Publisher']

for col in categorical_columns:
    books[col] = books[col].astype(str).apply(hash_encode)

# Drop columns which won't be used in training at all
books_rep_visual = books.drop(columns=columns)

visual_feature_cols = [str(i) for i in range(2048)]
# Drop visual features
books_rep_basic = books_rep_visual.drop(columns=visual_feature_cols, errors="ignore")
# display(books_rep_basic.head(10))

# Connect the books to users - TEST
test_set_selected = test_set_2[['User-ID','ISBN']]
books_merged_basic_test = test_set_selected.merge(books_rep_basic, on='ISBN',how = 'inner')

all_data_test = user_data_test_clean[['User-ID']].merge(books_merged_basic_test, on='User-ID',how='inner')
# display(all_data.head(20))

# Drop column and create book tensor
books_no_ids_test = all_data_test.drop(columns = ['User-ID','ISBN'])
books_tensor_test =  torch.tensor(books_no_ids_test.to_numpy(), dtype=torch.float32)


In [48]:
print(books_tensor_test.shape)
print(users_tensor_test.shape)

torch.Size([1102, 387])
torch.Size([1102, 13582])


In [49]:
# Testing tensors are prepared, now we pass it through the model to obtain the embeddings

# New tensor dataset from the testing set
dataset_test = TensorDataset(users_tensor_test, books_tensor_test, torch.ones(users_tensor_test.shape[0], dtype=torch.float))
batch_size = 16
data_loader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

# Prepare the data for LTR task
user_embd_test = []
book_embd_test = []

model.eval()
with torch.no_grad():
  # Same dataset
  for user_batch,book_batch,_ in data_loader_test:
    # Pass through the model
    user_embedding, book_embedding = model(user_batch, book_batch)

    user_embd_test.append(user_embedding.cpu().numpy())
    book_embd_test.append(book_embedding.cpu().numpy())

user_embeddings_test = np.concatenate(user_embd_test, axis=0)
book_embeddings_test = np.concatenate(book_embd_test, axis=0)

user_emb_df_test = pd.DataFrame(user_embeddings_test, columns=[f'User_{i}' for i in range(user_embeddings.shape[1])])
book_emb_df_test = pd.DataFrame(book_embeddings_test, columns=[f'Book_{i}' for i in range(user_embeddings.shape[1])])

In [50]:
# Connect ISBN and user Ids
user_emb_df_test['User-ID'] = users_doubled_test['User-ID']
book_emb_df_test['ISBN'] = all_data_test['ISBN']

# Merge on index

final_test = pd.concat([user_emb_df_test, book_emb_df_test], axis=1)
final_test ['Relevance'] = 1
display(final_test[['ISBN','User-ID']].head())

final_test.to_csv('testing_embeddings_basic_rel1.csv',index=False)

Unnamed: 0,ISBN,User-ID
0,345370805,1131
1,60934417,1131
2,375727345,3373
3,61099368,3373
4,385497288,3827


In [53]:
# Now the irrelevant books for the testing set

# Read 3 random books

irrelevant_test = pd.read_csv('test_sample_irrelevant_3.csv')
# display(irrelevant_train.head())

# Encode Author and Publisher

def hash_encode(value):
    return hash(value) % (10**9)

categorical_columns = ['Author', 'Publisher']

for col in categorical_columns:
    irrelevant_test[col] = irrelevant_test[col].astype(str).apply(hash_encode)

# Merge with User representation

full_data_merged_test = user_data_test_clean.merge(irrelevant_test,on='User-ID')
display(full_data_merged_test.head())

user_columns = user_data_test_clean.columns
book_columns = irrelevant_test.columns



Unnamed: 0,User-ID,User_0,User_1,User_2,User_3,User_4,User_5,User_6,User_7,User_8,...,Title_Embed_374,Title_Embed_375,Title_Embed_376,Title_Embed_377,Title_Embed_378,Title_Embed_379,Title_Embed_380,Title_Embed_381,Title_Embed_382,Title_Embed_383
0,1131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.014865,-0.056009,0.050711,-0.056293,-0.057939,-0.001921,0.106657,-0.060671,0.013855,0.032901
1,1131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.044968,0.033977,-0.052604,0.073571,-0.096708,0.030385,0.073928,0.039965,0.048389,0.050965
2,1131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.092119,0.065753,-0.031898,0.086305,-0.012491,0.01466,0.107215,-0.067487,-0.114209,0.032433
3,3373,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.052249,0.004999,0.020069,0.025216,0.01668,-0.07544,0.072254,0.01992,0.024912,0.043283
4,3373,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.076594,-0.03864,-0.041297,0.06124,-0.038876,-0.001752,0.007765,0.051157,-0.038728,-0.08273


In [54]:
user_info = full_data_merged_test[user_columns]
no_id_user = user_info.drop(columns=['User-ID'])

user_irr_tensor = torch.tensor(no_id_user.to_numpy(), dtype=torch.float32)

book_info = full_data_merged_test[book_columns]

book_no_id = book_info.drop(columns=['ISBN','User-ID'])
display(book_no_id.head())

book_irr_tensor = torch.tensor(book_no_id.to_numpy(), dtype=torch.float32)
print(user_irr_tensor.shape)
print(book_irr_tensor.shape)

Unnamed: 0,Author,Year,Publisher,Title_Embed_0,Title_Embed_1,Title_Embed_2,Title_Embed_3,Title_Embed_4,Title_Embed_5,Title_Embed_6,...,Title_Embed_374,Title_Embed_375,Title_Embed_376,Title_Embed_377,Title_Embed_378,Title_Embed_379,Title_Embed_380,Title_Embed_381,Title_Embed_382,Title_Embed_383
0,640744729,1998,612844110,-0.068912,0.149072,-0.048364,0.020375,-0.002855,0.037009,0.102814,...,-0.014865,-0.056009,0.050711,-0.056293,-0.057939,-0.001921,0.106657,-0.060671,0.013855,0.032901
1,758894149,2002,371322279,-0.015563,-0.01137,-0.049947,0.040009,-0.006936,0.078584,0.020533,...,0.044968,0.033977,-0.052604,0.073571,-0.096708,0.030385,0.073928,0.039965,0.048389,0.050965
2,328856332,1996,748869035,-0.025395,0.013658,0.090005,0.03403,-0.001295,-0.033201,-0.061194,...,0.092119,0.065753,-0.031898,0.086305,-0.012491,0.01466,0.107215,-0.067487,-0.114209,0.032433
3,457302700,2000,923621230,0.000481,0.063785,-0.022737,-0.020254,0.044928,-0.012015,0.038725,...,0.052249,0.004999,0.020069,0.025216,0.01668,-0.07544,0.072254,0.01992,0.024912,0.043283
4,229880291,1995,430773639,-0.048831,-0.009859,0.077872,0.020068,0.001736,-0.024832,0.118934,...,0.076594,-0.03864,-0.041297,0.06124,-0.038876,-0.001752,0.007765,0.051157,-0.038728,-0.08273


torch.Size([1653, 13582])
torch.Size([1653, 387])


In [56]:
# Prepare the data for LTR task
user_embd_test_irr = []
book_embd_test_irr = []


dataset_irr = TensorDataset(user_irr_tensor, book_irr_tensor, torch.zeros(user_irr_tensor.shape[0], dtype=torch.float))
batch_size = 16
data_loader_irr = DataLoader(dataset_irr, batch_size=batch_size, shuffle=False)

model.eval()
with torch.no_grad():
  # Same dataset
  for user_batch,book_batch,_ in data_loader_irr:
    # Pass through the model
    user_embedding, book_embedding = model(user_batch, book_batch)

    user_embd_test_irr.append(user_embedding.cpu().numpy())
    book_embd_test_irr.append(book_embedding.cpu().numpy())

user_embeddings_irr = np.concatenate(user_embd_test_irr, axis=0)
book_embeddings_irr = np.concatenate(book_embd_test_irr, axis=0)

user_emb_df_irr = pd.DataFrame(user_embeddings_irr, columns=[f'User_{i}' for i in range(user_embeddings_irr.shape[1])])
book_emb_df_irr = pd.DataFrame(book_embeddings_irr, columns=[f'Book_{i}' for i in range(book_embeddings_irr.shape[1])])



In [57]:
# Connect ISBN and user Ids
user_emb_df_irr['User-ID'] = user_info['User-ID']
book_emb_df_irr['ISBN'] = book_info['ISBN']

# display(user_emb_df_irr.head())
# display(book_emb_df_irr.head())

# Merge on index

final_test_irr = pd.concat([user_emb_df_irr, book_emb_df_irr], axis=1)

# ADD RELEVANCE OF 0
final_test_irr['Relevance'] = 0


display(final_test_irr[['ISBN','User-ID']].head())

final_test_irr.to_csv('testing_embeddings_basic_rel0.csv',index=False)

Unnamed: 0,ISBN,User-ID
0,0374216495,1131
1,0446610038,1131
2,089480829X,1131
3,0743212002,3373
4,0345401123,3373


In [58]:
# Concatenate both irrelevant and relevant books for testing
test_basic_final = pd.concat([final_test_irr,final_test],axis=0).sort_values(by='User-ID')
display(test_basic_final[['User-ID','Relevance',]].head(20))
test_basic_final.to_csv('testing_final_basic_embd.csv',index=False)

Unnamed: 0,User-ID,Relevance
0,1131,0
1,1131,0
1,1131,1
0,1131,1
2,1131,0
5,3373,0
3,3373,1
2,3373,1
4,3373,0
3,3373,0
