## Učitaj potrebne libraryje

In [31]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise import SVD, Dataset, Reader, accuracy
from surprise import KNNWithMeans

from collections import defaultdict


# 1. Učitavanje i priprema podataka

### Učitaj podatke

In [32]:
# Load and prepare data
dataset_cf = pd.read_csv('user_artists.dat', sep='\t')
dataset_cf.columns = ['user_id', 'artist_id', 'plays']

### Transformacije podataka

In [33]:
# 1. Log transformation
dataset_cf["log_plays"] = np.log1p(dataset_cf["plays"])

# 2. Scale to 1-5 range
scaler = MinMaxScaler(feature_range=(1, 5))
dataset_cf["rating"] = scaler.fit_transform(dataset_cf[["log_plays"]])

# 3. Filter inactive users
user_counts = dataset_cf['user_id'].value_counts()
dataset_cf = dataset_cf[dataset_cf['user_id'].isin(user_counts[user_counts >= 5].index)]

### Stvaranje seta za treniranje i testiranje(65:35)


In [34]:
# Split data
def per_user_split(df, test_frac=0.2):
    train, test = [], []
    for user_id in df['user_id'].unique():
        user_data = df[df['user_id'] == user_id]
        n_test = max(1, int(len(user_data) * test_frac))
        test_data = user_data.sample(n_test)
        train_data = user_data.drop(test_data.index)
        train.append(train_data)
        test.append(test_data)
    return pd.concat(train), pd.concat(test)

train_cf, test_cf = per_user_split(dataset_cf[["user_id", "artist_id", "rating"]], test_frac=0.35)

### Prilagodi podatke

In [35]:
# Convert IDs to strings
train_cf["user_id"] = train_cf["user_id"].astype(str)
train_cf["artist_id"] = train_cf["artist_id"].astype(str)
test_cf["user_id"] = test_cf["user_id"].astype(str)
test_cf["artist_id"] = test_cf["artist_id"].astype(str)


reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(train_cf[["user_id", "artist_id", "rating"]], reader)
trainset = train_data.build_full_trainset()

# 2. Definicija KNN modela i treniranje

In [36]:
# Train model
algo = KNNWithMeans(k=15, sim_options={'name': 'cosine', 'user_based': False})
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7c86c4e3a590>

# 3. Testiranje

In [38]:
# Convert test_cf to Surprise-compatible testset
testset = list(zip(
    test_cf["user_id"].values,
    test_cf["artist_id"].values,
    test_cf["rating"].values
))

# Generate predictions
predictions = algo.test(testset)  # Now works!

### Precision@k and RMSE

In [39]:
# 1. Definiraj funkciju za Precision@k
def precision_at_k(predictions, k=10, threshold=3.0):
    """
    Računa Precision@k za liste predviđanja.

    Args:
        predictions: Surprise predictions lista (kroz algo.test())
        k: Broj preporuka koje evaluiramo
        threshold: Prag iznad kojeg smatramo da je stavka "relevantna"
    """
    # Grupiraj predviđanja po korisniku
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = []
    for uid, user_ratings in user_est_true.items():
        # Sortiraj predviđanja po procijenjenoj vrijednosti (silazno)
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Broj relevantnih stavki u top-k
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings[:k])

        # Precision@k za ovog korisnika
        precisions.append(n_rel / k)

    return np.mean(precisions)

In [40]:
print("\nEvaluation Metrics:")
print(f"RMSE: {accuracy.rmse(predictions, verbose=False):.4f}")
print(f"Precision@5: {precision_at_k(predictions, k=5):.4f}")
print(f"Precision@10: {precision_at_k(predictions, k=10):.4f}")


Evaluation Metrics:
RMSE: 0.3780
Precision@5: 0.2273
Precision@10: 0.2079
