# Multiple Recommendation Models Comparison
LightFM, SVD, NMF, KNN 비교

In [1]:
from pathlib import Path
PROJECT_ROOT = Path(__file__).parent.parent if '__file__' in globals() else Path.cwd().parent

In [2]:
import pandas as pd
import numpy as np
import pickle
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score
from sklearn.decomposition import NMF
from scipy.sparse.linalg import svds
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split



In [3]:
with open(PROJECT_ROOT / "data" / "stg" / "mxmh_with_spotify_preprocessed.pkl", "rb") as f:
    data = pickle.load(f)

In [4]:
interaction_matrix = data["interaction_matrix"]
item_features = data["item_features"]
user_features = data["user_features"]
genre_to_idx = data["genre_to_idx"]
idx_to_genre = data["idx_to_genre"]
mxmh_genres = data["mxmh_genres"]

In [5]:
print(f"Interaction matrix: {interaction_matrix.shape}")
print(f"Item features: {item_features.shape}")
print(f"User features: {user_features.shape}")
print(f"Genres: {mxmh_genres}")

Interaction matrix: (736, 14)
Item features: (14, 20)
User features: (736, 4)
Genres: ['Classical', 'Country', 'EDM', 'Folk', 'Gospel', 'Hip hop', 'Jazz', 'K pop', 'Latin', 'Lofi', 'Metal', 'Pop', 'R&B', 'Rock']


In [6]:
def create_train_test_split(interaction_matrix, test_ratio=0.2, random_state=42):
    """
    User-item interaction을 train/test로 분할
    """
    np.random.seed(random_state)
    
    train_matrix = interaction_matrix.copy().tolil()
    test_matrix = csr_matrix(interaction_matrix.shape, dtype='float32').tolil()
    
    for user_idx in range(interaction_matrix.shape[0]):
        user_items = interaction_matrix[user_idx].nonzero()[1]
        
        if len(user_items) > 1:
            n_test = max(1, int(len(user_items) * test_ratio))
            test_items = np.random.choice(user_items, size=n_test, replace=False)
            
            for item_idx in test_items:
                test_matrix[user_idx, item_idx] = interaction_matrix[user_idx, item_idx]
                train_matrix[user_idx, item_idx] = 0
    
    return train_matrix.tocsr(), test_matrix.tocsr()

In [7]:
train_matrix, test_matrix = create_train_test_split(interaction_matrix, test_ratio=0.2)

In [9]:
print(f"Train matrix: {train_matrix.shape}, nnz={train_matrix.nnz}")
print(f"Test matrix: {test_matrix.shape}, nnz={test_matrix.nnz}")

Train matrix: (736, 14), nnz=9264
Test matrix: (736, 14), nnz=1040


In [10]:
def calculate_metrics(model_name, predictions, test_matrix, k=5):
    """
    Precision@K, Recall@K, NDCG@K 계산
    """
    metrics = {
        'model': model_name,
        'precision@5': [],
        'recall@5': [],
        'ndcg@5': []
    }
    
    for user_idx in range(test_matrix.shape[0]):
        test_items = test_matrix[user_idx].nonzero()[1]
        
        if len(test_items) == 0:
            continue
        
        user_predictions = predictions[user_idx]
        top_k_items = np.argsort(user_predictions)[-k:][::-1]
        
        hits = len(set(top_k_items) & set(test_items))
        
        precision = hits / k
        recall = hits / len(test_items) if len(test_items) > 0 else 0
        
        dcg = sum([1 / np.log2(idx + 2) for idx, item in enumerate(top_k_items) if item in test_items])
        idcg = sum([1 / np.log2(idx + 2) for idx in range(min(k, len(test_items)))])
        ndcg = dcg / idcg if idcg > 0 else 0
        
        metrics['precision@5'].append(precision)
        metrics['recall@5'].append(recall)
        metrics['ndcg@5'].append(ndcg)
    
    return {
        'model': model_name,
        'precision@5': np.mean(metrics['precision@5']),
        'recall@5': np.mean(metrics['recall@5']),
        'ndcg@5': np.mean(metrics['ndcg@5'])
    }

## LightFM

In [12]:
lightfm_model = LightFM(
    loss='warp',
    no_components=30,
    learning_rate=0.05,
    random_state=42
)

In [13]:
lightfm_model.fit(
    train_matrix,
    user_features=user_features,
    item_features=item_features,
    epochs=50,
    num_threads=4,
    verbose=True
)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49


<lightfm.lightfm.LightFM at 0x11f0802f0>

In [14]:
lightfm_predictions = []
for user_idx in range(train_matrix.shape[0]):
    scores = lightfm_model.predict(
        user_idx,
        np.arange(train_matrix.shape[1]),
        user_features=user_features,
        item_features=item_features
    )
    lightfm_predictions.append(scores)

In [15]:
lightfm_predictions = np.array(lightfm_predictions)
lightfm_metrics = calculate_metrics("LightFM", lightfm_predictions, test_matrix)

In [17]:
print(f"LightFM Results:")
print(f"  Precision@5: {lightfm_metrics['precision@5']:.4f}")
print(f"  Recall@5: {lightfm_metrics['recall@5']:.4f}")
print(f"  NDCG@5: {lightfm_metrics['ndcg@5']:.4f}")

LightFM Results:
  Precision@5: 0.1005
  Recall@5: 0.3627
  NDCG@5: 0.2336


## SVD

In [18]:
n_factors = 30
train_dense = train_matrix.toarray()

U, sigma, Vt = svds(train_dense, k=n_factors)
sigma = np.diag(sigma)

svd_predictions = np.dot(np.dot(U, sigma), Vt)
svd_metrics = calculate_metrics("SVD", svd_predictions, test_matrix)

ValueError: `k` must be an integer satisfying `0 < k < min(A.shape)`.

## NMF

In [19]:
nmf_model = NMF(
    n_components=30,
    init='random',
    random_state=42,
    max_iter=200
)

W = nmf_model.fit_transform(train_matrix)
H = nmf_model.components_
nmf_predictions = np.dot(W, H)

nmf_metrics = calculate_metrics("NMF", nmf_predictions, test_matrix)

print(f"\nNMF Results:")
print(f"  Precision@5: {nmf_metrics['precision@5']:.4f}")
print(f"  Recall@5: {nmf_metrics['recall@5']:.4f}")
print(f"  NDCG@5: {nmf_metrics['ndcg@5']:.4f}")



NMF Results:
  Precision@5: 0.0049
  Recall@5: 0.0246
  NDCG@5: 0.0102


In [20]:
print("\n" + "="*60)
print("Model 4: User-based KNN")
print("="*60)

knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)
knn_model.fit(train_matrix)

distances, indices = knn_model.kneighbors(train_matrix, n_neighbors=20)

knn_predictions = np.zeros(train_matrix.shape)
for user_idx in range(train_matrix.shape[0]):
    neighbor_indices = indices[user_idx][1:]
    neighbor_ratings = train_matrix[neighbor_indices].toarray()
    knn_predictions[user_idx] = np.mean(neighbor_ratings, axis=0)

knn_metrics = calculate_metrics("KNN", knn_predictions, test_matrix)

print(f"\nKNN Results:")
print(f"  Precision@5: {knn_metrics['precision@5']:.4f}")
print(f"  Recall@5: {knn_metrics['recall@5']:.4f}")
print(f"  NDCG@5: {knn_metrics['ndcg@5']:.4f}")


Model 4: User-based KNN

KNN Results:
  Precision@5: 0.0290
  Recall@5: 0.1318
  NDCG@5: 0.0591
