In [1]:
# Mount Google Drive into the Colab runtime and switch to the project folder so that
# the relative file names used by the following cells resolve against it.
from google.colab import drive
drive.mount('/content/drive')
# %cd is an IPython magic (not Python syntax); it changes the kernel's working directory.
%cd /content/drive/MyDrive/RecSys_Project/

Mounted at /content/drive
/content/drive/MyDrive/RecSys_Project


In [2]:
import pandas as pd

# Load the interaction table from the current working directory. The preview printed
# below shows three columns: user_id, app_id and is_recommended.
df_train = pd.read_csv('recommender_training_data.csv')
# Plain-text preview of the first rows as a quick sanity check of the load.
print(df_train.head())

    user_id  app_id  is_recommended
0    136433    6060            True
1   2189692  233980            True
2   2387112  204360            True
3  12383244     730            True
4  13097030  298630            True


In [3]:
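# Dependencies for the cells below (cornac, optuna). Uncomment and run once per fresh
# runtime; it is left commented so that re-running the notebook does not reinstall them.
#!pip install --quiet cornac==2.3.2 numpy optuna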

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.4/51.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.5/31.5 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:

# Save (disabled, kept for reference).
# NOTE(review): this would write 'item_image_features.npy', whereas the load below reads
# 'RN50_item_image_features.npy' — confirm which file name is the current one.
#np.save('/content/drive/MyDrive/RecSys_Project/item_image_features.npy', item_image_features)
import numpy as np
# Load the cached item image features. allow_pickle=True is passed because the file holds a
# pickled Python object; .item() unwraps the 0-d object array into that object, which the
# following cells use as a mapping (they call .keys() and index it by item id).
# SECURITY: unpickling can execute arbitrary code — only load .npy files you produced yourself.
item_image_features = np.load('RN50_item_image_features.npy', allow_pickle=True).item()

In [5]:
# Keep only the rows flagged as recommended; these are the positive interactions.
df_sampled = df_train.loc[df_train['is_recommended'].isin([1])]

# Build (user_id, item_id, rating) triples with string ids and float ratings.
ratings = list(
    df_sampled[['user_id', 'app_id', 'is_recommended']]
    .astype({'user_id': str, 'app_id': str, 'is_recommended': float})
    .itertuples(index=False, name=None)
)

# Every item that has an image embedding, and the matching feature matrix
# (row i of the matrix belongs to item_ids[i]; a dict keeps keys and values aligned).
item_ids = list(item_image_features)
image_features_matrix = np.vstack(list(item_image_features.values()))
print(len(item_ids))

36606


In [6]:
# print(ratings)
# print(item_ids)
# print(image_features_matrix)
# print(f"Num ratings: {len(ratings)}")
# print(f"Num image features: {image_features_matrix.shape[0]}")
# print(f"First 5 item_ids: {item_ids[:5]}")
# Items that have an image embedding; a set gives O(1) membership checks.
valid_items = set(item_ids)
# Drop every interaction whose item has no embedding.
filtered_ratings = [triple for triple in ratings if triple[1] in valid_items]

print(f"Filtered ratings count: {len(filtered_ratings)}")

# Make the types explicit for the recommender: string ids, float ratings.
filtered_ratings = [
    (str(user), str(item), float(score)) for user, item, score in filtered_ratings
]
item_ids = [str(item) for item in item_ids]


Filtered ratings count: 764443


## Visual Similarity Analysis

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
import seaborn as sns
from PIL import Image
import requests
from io import BytesIO

from sklearn.preprocessing import normalize

# Load the {item_id: embedding} mapping. The file stores a pickled object, hence
# allow_pickle=True — only load files you produced yourself.
item_image_features = np.load('RN50_item_image_features.npy', allow_pickle=True).item()

# Use an analysis-local id list so the global `item_ids` prepared for training
# (and aligned with image_features_matrix) is not overwritten by this cell.
sim_item_ids = list(item_image_features.keys())
embeddings_matrix = np.vstack([item_image_features[iid] for iid in sim_item_ids])

# A full N x N similarity matrix is quadratic in memory: with the ~36.6k items printed
# above that is more than 1.3e9 entries (plus another copy when flattened). Estimate the
# distribution from a seeded random sample of items instead.
SIM_SAMPLE_SIZE = 5000
SIM_SEED = 42
n_items = embeddings_matrix.shape[0]
if n_items > SIM_SAMPLE_SIZE:
    sample_rows = np.random.default_rng(SIM_SEED).choice(
        n_items, size=SIM_SAMPLE_SIZE, replace=False
    )
    embeddings_sample = embeddings_matrix[np.sort(sample_rows)]
else:
    embeddings_sample = embeddings_matrix

# Normalize embeddings
embeddings_normalized = normalize(embeddings_sample, norm='l2', axis=1)

# 1. Compute pairwise similarities (on the sample)
similarities = cosine_similarity(embeddings_normalized)

# Keep each unordered pair exactly once and drop the diagonal: self-similarities are
# always 1.0 and would otherwise inflate the mean and pin the maximum at 1.0.
pair_mask = np.triu(np.ones(similarities.shape, dtype=bool), k=1)
pairwise_similarities = similarities[pair_mask]
mean_similarity = pairwise_similarities.mean()

# Analyze similarity distribution
plt.figure(figsize=(10, 6))
plt.hist(pairwise_similarities, bins=100, alpha=0.7)
plt.title('Distribution of Pairwise Cosine Similarities')
plt.xlabel('Cosine Similarity')
plt.ylabel('Frequency')
plt.axvline(x=mean_similarity, color='r', linestyle='--', label=f'Mean: {mean_similarity:.3f}')
plt.legend()
plt.show()

print(f"Mean similarity: {mean_similarity:.3f}")
print(f"Std similarity: {pairwise_similarities.std():.3f}")
print(f"Min similarity: {pairwise_similarities.min():.3f}")
print(f"Max similarity: {pairwise_similarities.max():.3f}")

## VBPR Training

In [None]:
import torch

# Use the GPU when PyTorch can see one and fall back to the CPU otherwise. Previously the
# availability check was discarded and 'cuda' was hard-coded under a "Force CPU" comment.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

In [None]:
import os
import numpy as np
from cornac.eval_methods import BaseMethod
from cornac.models import VBPR, BPR
from cornac.eval_methods import RatioSplit
from cornac.data import ImageModality
import cornac
from cornac.metrics import Recall, NDCG, NCRR, FMeasure
import optuna

SEED = 42
VERBOSE = True

# Latent dimensionality passed to VBPR as both k and k2.
# NOTE: this is not the ranking cutoff of the metrics; that is TOP_K.
K = 10

# Ranking cutoff shared by all metrics, and the metric Optuna maximises.
TOP_K = 50
TUNING_METRIC = f"F1@{TOP_K}"
TUNING_EPOCHS = 2   # short runs keep each trial cheap
N_TRIALS = 30       # increase for a more thorough search

m = [Recall(k=TOP_K), NDCG(k=TOP_K), NCRR(k=TOP_K), FMeasure(k=TOP_K)]

item_image_modality = ImageModality(features=image_features_matrix, ids=item_ids, normalized=True)

# Hold out a validation split in addition to the test split. Hyperparameters are selected
# on validation only, so the test metrics reported for the final model are not biased by
# the search (previously the search optimised the test score directly).
ratio_split = RatioSplit(
    data=filtered_ratings,
    test_size=0.2,
    val_size=0.1,
    exclude_unknowns=True,
    item_image=item_image_modality,
    verbose=VERBOSE,
    seed=SEED,
)


def objective(trial):
    """Optuna objective: train a short VBPR run and score it on the validation split.

    Args:
        trial: Optuna trial used to sample the learning rate and the three
            regularisation strengths (lambda_w, lambda_b, lambda_e).

    Returns:
        The averaged TUNING_METRIC measured on the validation split (higher is better).
    """
    # Suggest hyperparameters (log-uniform ranges)
    lambda_w = trial.suggest_float("lambda_w", 1e-4, 1e-2, log=True)
    lambda_b = trial.suggest_float("lambda_b", 1e-4, 1e-2, log=True)
    lambda_e = trial.suggest_float("lambda_e", 1e-5, 1e-3, log=True)
    lr = trial.suggest_float("lr", 1e-4, 5e-3, log=True)

    vbpr = VBPR(k=K, k2=K, n_epochs=TUNING_EPOCHS, batch_size=128, learning_rate=lr,
                lambda_w=lambda_w, lambda_b=lambda_b, lambda_e=lambda_e, use_gpu=True, verbose=VERBOSE, seed=SEED)

    experiment = cornac.Experiment(eval_method=ratio_split, models=[vbpr], metrics=m)
    experiment.run()

    # Score the trial on the validation split only; the test split stays untouched
    # by model selection.
    val_scores = experiment.val_result[0].metric_avg_results
    return val_scores[TUNING_METRIC]


# Seed the sampler so the sequence of suggested hyperparameters is reproducible.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=N_TRIALS)

In [None]:
# Retrain VBPR with the best configuration found by the Optuna study, using more epochs
# than the search runs, then evaluate it and save the trained model.
best_params = study.best_params
print("Best hyperparameters:", best_params)

# Map the names used in the study onto VBPR's constructor arguments.
tuned_kwargs = {
    "learning_rate": best_params["lr"],
    "lambda_w": best_params["lambda_w"],
    "lambda_b": best_params["lambda_b"],
    "lambda_e": best_params["lambda_e"],
}
best_vbpr = VBPR(
    k=K, k2=K,
    n_epochs=10,
    batch_size=128,
    use_gpu=True,
    verbose=True,
    seed=SEED,
    **tuned_kwargs,
)

# save_dir makes the experiment write the trained model under "saved_models".
final_experiment = cornac.Experiment(
    eval_method=ratio_split, models=[best_vbpr], metrics=m, save_dir="saved_models"
)
final_experiment.run()

print("Final evaluation results:", final_experiment.result)

In [None]:
import shutil

# Zip the saved_models directory into saved_models.zip in the working directory.
shutil.make_archive('saved_models', 'zip', 'saved_models')

# Trigger a browser download of the archive from the Colab runtime.
from google.colab import files
files.download('saved_models.zip')

In [None]:
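# Example of reloading a saved model (disabled).
# NOTE(review): the path below points at an EASEᴿ checkpoint under /content/saved_models,
# not a VBPR one — presumably left over from another experiment. Point it at the VBPR
# file written by the final experiment before uncommenting.
#model = VBPR.load("/content/saved_models/EASEᴿ/2025-05-13_14-48-07-822783.pkl")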