In [1]:
!pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd

In [3]:
# Define Encoder Classes
class UserEncoder(nn.Module):
    def __init__(self, user_feature_dim, embedding_dim):
        super(UserEncoder, self).__init__()

        # Architecture of User encoder
        self.fc = nn.Sequential(
            nn.Linear(user_feature_dim, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, embedding_dim))

    def forward(self, user_features):
        user_embedding = self.fc(user_features)

        # Normalize for similarity
        return F.normalize(user_embedding, p=2, dim=-1)

class BookEncoder(nn.Module):
    def __init__(self, book_feature_dim, embedding_dim):
        super(BookEncoder, self).__init__()

        # Architecture of Book Encoder
        self.fc = nn.Sequential(
            nn.Linear(book_feature_dim, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, embedding_dim))

    def forward(self, book_features):
        book_embedding = self.fc(book_features)

        # Normalize for similarity
        return F.normalize(book_embedding, p=2, dim=-1)

# Simply combine both into one class
class DualEncoder(nn.Module):
    def __init__(self, user_feature_dim, book_feature_dim, embedding_dim):
        super(DualEncoder, self).__init__()
        # Define both encoders
        self.user_encoder = UserEncoder(user_feature_dim, embedding_dim)
        self.book_encoder = BookEncoder(book_feature_dim, embedding_dim)

        # Perform forward pass on both
    def forward(self, user_features, book_features):
        user_embedding = self.user_encoder(user_features)
        book_embedding = self.book_encoder(book_features)
        return user_embedding, book_embedding


In [4]:
# Define loss function
class CosineSimilarityLoss(nn.Module):
    def __init__(self):
        super(CosineSimilarityLoss, self).__init__()

    def forward(self, user_embedding, book_embedding, labels):
        # Calculate the cosine similarity of the embeddings
        cosine_sim = F.cosine_similarity(user_embedding, book_embedding)
        # We want them to be similar i.e close to 1 - as we only include relevant pairs, the expected similarity is 1
        loss = F.mse_loss(cosine_sim, labels)
        return loss

In [6]:
# Data preparation - USERS
user_data = pd.read_csv('full_user_train_rep.csv')

user_data['Age'] = pd.to_numeric(user_data['Age'], errors='coerce')
# There was 12 users whose age could not be converted to a number
user_data_clean = user_data.dropna()

# Update column names for clarity

# range is 13580
new_columns = [f'User_{i}' for i in range(13581)]
correct_columns = ['User-ID'] + new_columns + ['Age']

user_data_clean.columns = correct_columns
# display(user_data_clean.head(10))

# Double each user - as we will have 2 relevant books per user
users_doubled = user_data_clean.loc[user_data_clean.index.repeat(2)].reset_index(drop=True)

# We don't want to keep the user id
no_user_id = users_doubled.drop(columns=['User-ID'])

# Final Users tensor to be passed during training
users_tensor = torch.tensor(no_user_id.to_numpy(), dtype=torch.float32)

  user_data = pd.read_csv('full_user_train_rep.csv')


In [7]:
# Data Preparation - Books
train_set_2 = pd.read_csv('train_set_2.csv')
# display(train_set_2)
books = pd.read_csv('Books_BX_10_5_FINAL.csv')

# Columns to drop
columns=["Title", "Image-URL-S", "Image-URL-M", "Image-URL-L", "Best-Image-URL"]

# Hash Author and Publisher

def hash_encode(value):
    return hash(value) % (10**9)

categorical_columns = ['Author', 'Publisher']

for col in categorical_columns:
    books[col] = books[col].astype(str).apply(hash_encode)

# Drop columns which won't be used in training at all
books_rep_visual = books.drop(columns=columns)

# Connect the books to users
train_set_selected = train_set_2[['User-ID','ISBN']]
books_merged_visual = train_set_selected.merge(books_rep_visual, on='ISBN',how = 'inner')

all_data = user_data_clean[['User-ID']].merge(books_merged_visual, on='User-ID',how='inner')
# display(all_data.head(20))

# Drop column and create book tensor
books_no_ids = all_data.drop(columns = ['User-ID','ISBN'])
books_tensor =  torch.tensor(books_no_ids.to_numpy(), dtype=torch.float32)

# Check the shapes
print(users_tensor.shape)
print(books_tensor.shape)

torch.Size([3974, 13582])
torch.Size([3974, 2435])


In [8]:
from torch.utils.data import DataLoader, TensorDataset
# Training
user_feature_dim = users_tensor.shape[1]
book_feature_dim = books_tensor.shape[1]
labels =torch.ones(users_tensor.shape[0])
num_epochs = 5


dataset = TensorDataset(users_tensor, books_tensor, torch.ones(users_tensor.shape[0], dtype=torch.float))
batch_size = 16
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)


embedding_dim = 512

model = DualEncoder(user_feature_dim, book_feature_dim, embedding_dim)
loss_fn = CosineSimilarityLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(num_epochs):
  for user_batch, book_batch, label_batch in data_loader:

    model.train()

    #Get embeddings
    usr_embd, book_embd = model(user_batch,book_batch)

    loss = loss_fn(usr_embd,book_embd,label_batch)

    # Backward propagation
    loss.backward()

    #Update parameters
    optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")



Epoch [1/5], Loss: 1.0465359687805176
Epoch [1/5], Loss: 0.8518728017807007
Epoch [1/5], Loss: 0.47274473309516907
Epoch [1/5], Loss: 0.26963984966278076
Epoch [1/5], Loss: 0.15741612017154694
Epoch [1/5], Loss: 0.09421945363283157
Epoch [1/5], Loss: 0.058638058602809906
Epoch [1/5], Loss: 0.03763062134385109
Epoch [1/5], Loss: 0.02558017708361149
Epoch [1/5], Loss: 0.018133247271180153
Epoch [1/5], Loss: 0.0133892260491848
Epoch [1/5], Loss: 0.010499819181859493
Epoch [1/5], Loss: 0.008585115894675255
Epoch [1/5], Loss: 0.007352353539317846
Epoch [1/5], Loss: 0.006572023965418339
Epoch [1/5], Loss: 0.006049525924026966
Epoch [1/5], Loss: 0.005752257537096739
Epoch [1/5], Loss: 0.005578045733273029
Epoch [1/5], Loss: 0.0055156853049993515
Epoch [1/5], Loss: 0.005529386922717094
Epoch [1/5], Loss: 0.005600281059741974
Epoch [1/5], Loss: 0.005713175516575575
Epoch [1/5], Loss: 0.0058409241028130054
Epoch [1/5], Loss: 0.006020063534379005
Epoch [1/5], Loss: 0.006191236432641745
Epoch [1/5

In [20]:
# Prepare the data for LTR task
user_embd_train = []
book_embd_train = []

model.eval()
with torch.no_grad():
  # Same dataset
  for user_batch,book_batch,_ in data_loader:
    # Pass through the model
    user_embedding, book_embedding = model(user_batch, book_batch)

    user_embd_train.append(user_embedding.cpu().numpy())
    book_embd_train.append(book_embedding.cpu().numpy())

user_embeddings = np.concatenate(user_embd_train, axis=0)
book_embeddings = np.concatenate(book_embd_train, axis=0)

user_emb_df = pd.DataFrame(user_embeddings, columns=[f'User_{i}' for i in range(user_embeddings.shape[1])])
book_emb_df = pd.DataFrame(book_embeddings, columns=[f'Book_{i}' for i in range(user_embeddings.shape[1])])



In [21]:
# Connect ISBN and user Ids
user_emb_df['User-ID'] = users_doubled['User-ID']
book_emb_df['ISBN'] = all_data['ISBN']

# display(user_emb_df.head())
# display(book_emb_df.head())

# Merge on index

final_train = pd.concat([user_emb_df, book_emb_df], axis=1)

# ADD RELEVANCE OF 1
final_train['Relevance'] = 1


display(final_train[['ISBN','User-ID']].head())

final_train.to_csv('training_embeddings_visual_rel1.csv',index=False)



Unnamed: 0,ISBN,User-ID
0,037582345X,254
1,0380973634,254
2,0316769487,638
3,0385504209,638
4,2253043974,643


In [22]:
# Read 3 random books

irrelevant_train = pd.read_csv('train_sample_irrelevant_visual_3.csv')
# display(irrelevant_train.head())

# Encode Author and Publisher

def hash_encode(value):
    return hash(value) % (10**9)

categorical_columns = ['Author', 'Publisher']

for col in categorical_columns:
    irrelevant_train[col] = irrelevant_train[col].astype(str).apply(hash_encode)

# Merge with User representation
full_data_merged = user_data_clean.merge(irrelevant_train,on='User-ID')
display(full_data_merged.head())

user_columns = user_data_clean.columns
book_columns = irrelevant_train.columns



Unnamed: 0,User-ID,User_0,User_1,User_2,User_3,User_4,User_5,User_6,User_7,User_8,...,Title_Embed_374,Title_Embed_375,Title_Embed_376,Title_Embed_377,Title_Embed_378,Title_Embed_379,Title_Embed_380,Title_Embed_381,Title_Embed_382,Title_Embed_383
0,254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.009798,-0.003879,-0.055282,-1.5e-05,-0.148504,0.075159,0.030464,-0.075416,0.016292,-0.022645
1,254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.072952,0.010924,-0.072501,0.025365,-0.056311,0.016682,0.085305,-0.010718,0.078291,-0.05611
2,254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.037816,-0.016315,0.128883,0.068807,-0.061976,0.016288,0.092884,-0.106677,-0.045714,0.001492
3,638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.030799,0.029716,-0.053587,-0.011605,0.031136,0.005537,0.046982,-0.016018,0.01635,-0.031829
4,638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.049521,0.009942,0.020799,0.036301,-0.009031,0.069625,0.023198,-0.069425,0.016261,0.050965


In [23]:
user_info = full_data_merged[user_columns]
no_id_user = user_info.drop(columns=['User-ID'])

user_irr_tensor = torch.tensor(no_id_user.to_numpy(), dtype=torch.float32)


In [24]:
book_info = full_data_merged[book_columns]

book_no_id = book_info.drop(columns=['ISBN','User-ID'])
display(book_no_id.head())

book_irr_tensor = torch.tensor(book_no_id.to_numpy(), dtype=torch.float32)

Unnamed: 0,Author,Year,Publisher,0,1,2,3,4,5,6,...,Title_Embed_374,Title_Embed_375,Title_Embed_376,Title_Embed_377,Title_Embed_378,Title_Embed_379,Title_Embed_380,Title_Embed_381,Title_Embed_382,Title_Embed_383
0,888215315,1996,168845742,0.004229,0.182942,0.071875,0.121134,0.335756,0.0,1.085266,...,-0.009798,-0.003879,-0.055282,-1.5e-05,-0.148504,0.075159,0.030464,-0.075416,0.016292,-0.022645
1,632681304,1995,530980957,0.062531,0.025113,0.191599,0.003823,0.135379,0.0,1.405679,...,0.072952,0.010924,-0.072501,0.025365,-0.056311,0.016682,0.085305,-0.010718,0.078291,-0.05611
2,344963607,1997,495567444,0.04527,0.063049,0.406684,0.023115,0.265711,0.0,0.657952,...,0.037816,-0.016315,0.128883,0.068807,-0.061976,0.016288,0.092884,-0.106677,-0.045714,0.001492
3,130520541,2002,932830971,0.050934,0.129651,0.333244,0.121564,0.198525,0.0,0.886906,...,0.030799,0.029716,-0.053587,-0.011605,0.031136,0.005537,0.046982,-0.016018,0.01635,-0.031829
4,150132929,2001,86226156,0.230289,0.052501,0.262423,0.036705,0.057197,0.0,1.498381,...,0.049521,0.009942,0.020799,0.036301,-0.009031,0.069625,0.023198,-0.069425,0.016261,0.050965


In [26]:
print(user_irr_tensor.shape)
print(book_irr_tensor.shape)

torch.Size([5961, 13582])
torch.Size([5961, 2435])


In [27]:
# Prepare the data for LTR task
user_embd_train_irr = []
book_embd_train_irr = []


dataset_irr = TensorDataset(user_irr_tensor, book_irr_tensor, torch.zeros(user_irr_tensor.shape[0], dtype=torch.float))
batch_size = 16
data_loader_irr = DataLoader(dataset_irr, batch_size=batch_size, shuffle=False)

model.eval()
with torch.no_grad():
  # Same dataset
  for user_batch,book_batch,_ in data_loader_irr:
    # Pass through the model
    user_embedding, book_embedding = model(user_batch, book_batch)

    user_embd_train_irr.append(user_embedding.cpu().numpy())
    book_embd_train_irr.append(book_embedding.cpu().numpy())

user_embeddings_irr = np.concatenate(user_embd_train_irr, axis=0)
book_embeddings_irr = np.concatenate(book_embd_train_irr, axis=0)

user_emb_df_irr = pd.DataFrame(user_embeddings_irr, columns=[f'User_{i}' for i in range(user_embeddings_irr.shape[1])])
book_emb_df_irr = pd.DataFrame(book_embeddings_irr, columns=[f'Book_{i}' for i in range(book_embeddings_irr.shape[1])])



In [28]:
# Connect ISBN and user Ids
user_emb_df_irr['User-ID'] = user_info['User-ID']
book_emb_df_irr['ISBN'] = book_info['ISBN']

# display(user_emb_df_irr.head())
# display(book_emb_df_irr.head())

# Merge on index

final_train_irr = pd.concat([user_emb_df_irr, book_emb_df_irr], axis=1)

# ADD RELEVANCE OF 1
final_train_irr['Relevance'] = 0


display(final_train_irr[['ISBN','User-ID']].head())

final_train_irr.to_csv('training_embeddings_visual_rel0.csv',index=False)

Unnamed: 0,ISBN,User-ID
0,812534867,254
1,671891677,254
2,440203430,254
3,804119902,638
4,61020648,638


In [29]:
# Concatenate both irrelevant and relevant books for training
final_train = pd.read_csv('training_embeddings_visual_rel1.csv')

training_basic_final = pd.concat([final_train_irr,final_train],axis=0).sort_values(by='User-ID')
display(training_basic_final[['User-ID','Relevance',]].head(20))
training_basic_final.to_csv('training_final_visual_embd.csv',index=False)

Unnamed: 0,User-ID,Relevance
0,254,0
1,254,0
2,254,0
1,254,1
0,254,1
5,638,0
2,638,1
3,638,1
4,638,0
3,638,0


In [42]:
# Data preparation - USERS TEST
user_data_test = pd.read_csv('full_user_test_rep.csv')

user_data_test['Age'] = pd.to_numeric(user_data_test['Age'], errors='coerce')
# There was 12 users whose age could not be converted to a number
user_data_test_clean = user_data_test.dropna()

# Update column names for clarity

# range is 13580
new_columns = [f'User_{i}' for i in range(13581)]
correct_columns = ['User-ID'] + new_columns + ['Age']

user_data_test_clean.columns = correct_columns
# display(user_data_clean.head(10))

# Double each user - as we will have 2 relevant books per user
users_doubled_test = user_data_test_clean.loc[user_data_test_clean.index.repeat(2)].reset_index(drop=True)

# We don't want to keep the user id
no_user_id_test = users_doubled_test.drop(columns=['User-ID'])

# Final Users tensor to be passed during training
users_tensor_test = torch.tensor(no_user_id_test.to_numpy(), dtype=torch.float32)


  user_data_test = pd.read_csv('full_user_test_rep.csv')


In [44]:
# Data Preparation - Books
test_set_2 = pd.read_csv('test_set_2.csv')
# display(train_set_2)
books = pd.read_csv('Books_BX_10_5_FINAL.csv')

# Columns to drop
columns=["Title", "Image-URL-S", "Image-URL-M", "Image-URL-L", "Best-Image-URL"]

# Hash Author and Publisher

def hash_encode(value):
    return hash(value) % (10**9)

categorical_columns = ['Author', 'Publisher']

for col in categorical_columns:
    books[col] = books[col].astype(str).apply(hash_encode)

# Drop columns which won't be used in training at all
books_rep_visual = books.drop(columns=columns)

visual_feature_cols = [str(i) for i in range(2048)]
# Drop visual features
books_rep_basic = books_rep_visual.drop(columns=visual_feature_cols, errors="ignore")
# display(books_rep_basic.head(10))

# Connect the books to users - TEST
test_set_selected = test_set_2[['User-ID','ISBN']]
books_merged_visual_test = test_set_selected.merge(books_rep_visual, on='ISBN',how = 'inner')

all_data_test = user_data_test_clean[['User-ID']].merge(books_merged_visual_test, on='User-ID',how='inner')
# display(all_data.head(20))

# Drop column and create book tensor
books_no_ids_test = all_data_test.drop(columns = ['User-ID','ISBN'])
books_tensor_test =  torch.tensor(books_no_ids_test.to_numpy(), dtype=torch.float32)


In [45]:
print(books_tensor_test.shape)
print(users_tensor_test.shape)

torch.Size([1102, 2435])
torch.Size([1102, 13582])


In [46]:
# Testing tensors are prepared, now we pass it through the model to obtain the embeddings

# New tensor dataset from the testing set
dataset_test = TensorDataset(users_tensor_test, books_tensor_test, torch.ones(users_tensor_test.shape[0], dtype=torch.float))
batch_size = 16
data_loader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

# Prepare the data for LTR task
user_embd_test = []
book_embd_test = []

model.eval()
with torch.no_grad():
  # Same dataset
  for user_batch,book_batch,_ in data_loader_test:
    # Pass through the model
    user_embedding, book_embedding = model(user_batch, book_batch)

    user_embd_test.append(user_embedding.cpu().numpy())
    book_embd_test.append(book_embedding.cpu().numpy())

user_embeddings_test = np.concatenate(user_embd_test, axis=0)
book_embeddings_test = np.concatenate(book_embd_test, axis=0)

user_emb_df_test = pd.DataFrame(user_embeddings_test, columns=[f'User_{i}' for i in range(user_embeddings.shape[1])])
book_emb_df_test = pd.DataFrame(book_embeddings_test, columns=[f'Book_{i}' for i in range(user_embeddings.shape[1])])

In [47]:
# Connect ISBN and user Ids
user_emb_df_test['User-ID'] = users_doubled_test['User-ID']
book_emb_df_test['ISBN'] = all_data_test['ISBN']

# Merge on index

final_test = pd.concat([user_emb_df_test, book_emb_df_test], axis=1)
final_test ['Relevance'] = 1
display(final_test[['ISBN','User-ID']].head())

final_test.to_csv('testing_embeddings_visual_rel1.csv',index=False)

Unnamed: 0,ISBN,User-ID
0,345370805,1131
1,60934417,1131
2,375727345,3373
3,61099368,3373
4,385497288,3827


In [48]:
# Now the irrelevant books for the testing set

# Read 3 random books

irrelevant_test = pd.read_csv('test_sample_irrelevant_visual_3.csv')
# display(irrelevant_train.head())

# Encode Author and Publisher

def hash_encode(value):
    return hash(value) % (10**9)

categorical_columns = ['Author', 'Publisher']

for col in categorical_columns:
    irrelevant_test[col] = irrelevant_test[col].astype(str).apply(hash_encode)

# Merge with User representation

full_data_merged_test = user_data_test_clean.merge(irrelevant_test,on='User-ID')
display(full_data_merged_test.head())

user_columns = user_data_test_clean.columns
book_columns = irrelevant_test.columns



Unnamed: 0,User-ID,User_0,User_1,User_2,User_3,User_4,User_5,User_6,User_7,User_8,...,Title_Embed_374,Title_Embed_375,Title_Embed_376,Title_Embed_377,Title_Embed_378,Title_Embed_379,Title_Embed_380,Title_Embed_381,Title_Embed_382,Title_Embed_383
0,1131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.026087,-0.042178,-0.024556,0.095353,-0.053833,-0.025326,0.02714,-0.063787,0.061184,-0.094539
1,1131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.006055,0.043981,0.04754,-0.038883,-0.026719,-0.017057,0.124818,0.041083,-0.054282,0.072225
2,1131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.132165,-0.119679,0.031301,0.028387,-0.029462,0.066633,0.0311,-0.003153,0.07906,-0.037849
3,3373,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.017247,0.067024,0.022827,0.074489,-0.011012,0.053565,-0.016778,0.047107,0.065905,-0.057205
4,3373,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.056519,0.033304,0.023662,0.062748,-0.01689,-0.026746,0.083145,-0.029056,0.024114,-0.017086


In [49]:
user_info = full_data_merged_test[user_columns]
no_id_user = user_info.drop(columns=['User-ID'])

user_irr_tensor = torch.tensor(no_id_user.to_numpy(), dtype=torch.float32)

book_info = full_data_merged_test[book_columns]

book_no_id = book_info.drop(columns=['ISBN','User-ID'])
display(book_no_id.head())

book_irr_tensor = torch.tensor(book_no_id.to_numpy(), dtype=torch.float32)
print(user_irr_tensor.shape)
print(book_irr_tensor.shape)

Unnamed: 0,Author,Year,Publisher,0,1,2,3,4,5,6,...,Title_Embed_374,Title_Embed_375,Title_Embed_376,Title_Embed_377,Title_Embed_378,Title_Embed_379,Title_Embed_380,Title_Embed_381,Title_Embed_382,Title_Embed_383
0,499655718,0,868981317,0.195775,0.043231,0.115906,0.014144,0.102552,0.0,1.787791,...,0.026087,-0.042178,-0.024556,0.095353,-0.053833,-0.025326,0.02714,-0.063787,0.061184,-0.094539
1,627942059,2002,160249803,0.215931,0.053879,0.186404,0.021348,0.159798,0.0,1.648644,...,-0.006055,0.043981,0.04754,-0.038883,-0.026719,-0.017057,0.124818,0.041083,-0.054282,0.072225
2,627521051,2002,857916999,0.149017,0.056776,0.355515,0.174194,0.185738,0.0,1.456707,...,0.132165,-0.119679,0.031301,0.028387,-0.029462,0.066633,0.0311,-0.003153,0.07906,-0.037849
3,54851540,0,357204062,0.215238,0.048762,0.12731,0.006236,0.163748,0.0,1.808379,...,0.017247,0.067024,0.022827,0.074489,-0.011012,0.053565,-0.016778,0.047107,0.065905,-0.057205
4,425339761,2001,103958107,0.198321,0.02558,0.233638,0.00401,0.122318,0.0,1.719076,...,0.056519,0.033304,0.023662,0.062748,-0.01689,-0.026746,0.083145,-0.029056,0.024114,-0.017086


torch.Size([1653, 13582])
torch.Size([1653, 2435])


In [50]:
# Prepare the data for LTR task
user_embd_test_irr = []
book_embd_test_irr = []


dataset_irr = TensorDataset(user_irr_tensor, book_irr_tensor, torch.zeros(user_irr_tensor.shape[0], dtype=torch.float))
batch_size = 16
data_loader_irr = DataLoader(dataset_irr, batch_size=batch_size, shuffle=False)

model.eval()
with torch.no_grad():
  # Same dataset
  for user_batch,book_batch,_ in data_loader_irr:
    # Pass through the model
    user_embedding, book_embedding = model(user_batch, book_batch)

    user_embd_test_irr.append(user_embedding.cpu().numpy())
    book_embd_test_irr.append(book_embedding.cpu().numpy())

user_embeddings_irr = np.concatenate(user_embd_test_irr, axis=0)
book_embeddings_irr = np.concatenate(book_embd_test_irr, axis=0)

user_emb_df_irr = pd.DataFrame(user_embeddings_irr, columns=[f'User_{i}' for i in range(user_embeddings_irr.shape[1])])
book_emb_df_irr = pd.DataFrame(book_embeddings_irr, columns=[f'Book_{i}' for i in range(book_embeddings_irr.shape[1])])



In [51]:
# Connect ISBN and user Ids
user_emb_df_irr['User-ID'] = user_info['User-ID']
book_emb_df_irr['ISBN'] = book_info['ISBN']

# display(user_emb_df_irr.head())
# display(book_emb_df_irr.head())

# Merge on index

final_test_irr = pd.concat([user_emb_df_irr, book_emb_df_irr], axis=1)

# ADD RELEVANCE OF 0
final_test_irr['Relevance'] = 0


display(final_test_irr[['ISBN','User-ID']].head())

final_test_irr.to_csv('testing_embeddings_visual_rel0.csv',index=False)

Unnamed: 0,ISBN,User-ID
0,3518368540,1131
1,3453210719,1131
2,045120753X,1131
3,0330300822,3373
4,0385335377,3373


In [52]:
# Concatenate both irrelevant and relevant books for testing
test_basic_final = pd.concat([final_test_irr,final_test],axis=0).sort_values(by='User-ID')
display(test_basic_final[['User-ID','Relevance',]].head(20))
test_basic_final.to_csv('testing_final_visual_embd.csv',index=False)

Unnamed: 0,User-ID,Relevance
0,1131,0
1,1131,0
1,1131,1
0,1131,1
2,1131,0
5,3373,0
3,3373,1
2,3373,1
4,3373,0
3,3373,0
