In [2]:
# Prvo obrišite postojeće verzije ako postoje
!pip uninstall numpy scikit-surprise -y

# Instalirajte specifične verzije koje su kompatibilne
!pip install "numpy==1.26.4" "scipy<2.0" "joblib>=1.2.0"

# Instalirajte surprise bez dodatnih zavisnosti
!pip install --no-deps scikit-surprise

# Proverite instalaciju
!pip show scikit-surprise

Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
[0mCollecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.26.4


Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2469559 sha256=5462d01024a2d9bb2a853206e477e1ff340da9c26c1edfd20ef82c6e662c7239
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Succes

## Učitaj potrebne biblioteke

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise import SVD, Dataset, Reader, accuracy
from surprise import KNNWithMeans

from collections import defaultdict


# 1. Učitavanje i priprema podataka

### Učitaj podatke

In [3]:
# Read the tab-separated interaction file and relabel its columns in one step.
# set_axis raises if the file does not have exactly three columns, just like
# assigning to .columns would.
dataset_cf = pd.read_csv('user_artists.dat', sep='\t').set_axis(
    ['user_id', 'artist_id', 'plays'], axis=1
)

### Transformacije podataka

In [4]:
# Step 1: store log(1 + plays) next to the raw play counts.
dataset_cf["log_plays"] = np.log1p(dataset_cf["plays"].to_numpy())

# Step 2: rescale the log-counts linearly into the 1-5 interval.
# The scaler is fitted on all rows, i.e. before inactive users are removed.
scaler = MinMaxScaler(feature_range=(1, 5))
scaler.fit(dataset_cf[["log_plays"]])
dataset_cf["rating"] = scaler.transform(dataset_cf[["log_plays"]])

# Step 3: keep only users with at least 8 rows; everyone else is dropped.
user_counts = dataset_cf['user_id'].value_counts()
dataset_cf = dataset_cf[dataset_cf['user_id'].map(user_counts) >= 8]

### Stvaranje seta za treniranje i testiranje (65:35)


In [5]:
# Split data
def per_user_split(df, test_frac=0.35, random_state=None):
    """Split interactions into train/test sets separately for every user.

    For each distinct ``user_id`` a random subset of that user's rows is held
    out for testing; the remaining rows go to the training set.

    Args:
        df: DataFrame with a ``user_id`` column. Rows whose ``user_id`` is
            missing are not assigned to either split.
        test_frac: Fraction of each user's rows to hold out. The count is
            ``int(n_rows * test_frac)`` but never less than 1, so a user with
            a single row ends up entirely in the test set.
        random_state: ``None`` (fresh randomness on every call, the default),
            an integer seed, or a ``numpy.random.RandomState`` to draw from.
            Pass a seed to make the split reproducible.

    Returns:
        ``(train, test)`` DataFrames that keep the original index labels and
        columns. Both are empty when ``df`` has no rows.

    Raises:
        ValueError: if more rows are requested for a user than that user has
            (``test_frac`` greater than 1).
    """
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    train_parts, test_parts = [], []
    # One pass over the frame; sort=False keeps users in order of first
    # appearance, the same order df['user_id'].unique() would give.
    for _, user_rows in df.groupby('user_id', sort=False):
        n_rows = len(user_rows)
        n_test = max(1, int(n_rows * test_frac))

        # Select by position rather than by index label, so repeated index
        # labels cannot remove extra rows from the training part.
        held_out = rng.choice(n_rows, size=n_test, replace=False)
        keep = np.ones(n_rows, dtype=bool)
        keep[held_out] = False

        test_parts.append(user_rows.iloc[held_out])
        train_parts.append(user_rows.iloc[keep])

    if not test_parts:
        # pd.concat([]) raises; return two empty frames with the same columns.
        return df.iloc[0:0].copy(), df.iloc[0:0].copy()

    return pd.concat(train_parts), pd.concat(test_parts)

# Hold out roughly 35% of every user's rows (at least one) for testing.
# Only the id and rating columns are passed on to the split.
train_cf, test_cf = per_user_split(
    dataset_cf.loc[:, ["user_id", "artist_id", "rating"]],
    test_frac=0.35,
)

### Prilagodi podatke

In [6]:
# Cast both id columns to strings in the train and test frames.
train_cf = train_cf.astype({"user_id": str, "artist_id": str})
test_cf = test_cf.astype({"user_id": str, "artist_id": str})

# Wrap the training frame in a Surprise dataset declared on a 1-5 rating scale,
# then build a trainset from all of it (no further hold-out at this stage).
reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(
    train_cf.loc[:, ["user_id", "artist_id", "rating"]],
    reader,
)
trainset = train_data.build_full_trainset()

# 2. Definicija KNN modela i treniranje

In [7]:
# KNNWithMeans with up to 50 neighbours and Pearson similarity.
# 'user_based': False asks Surprise for item-item rather than user-user similarities.
algo = KNNWithMeans(
    k=50,
    sim_options={'name': 'pearson', 'user_based': False},
)
# Left as the last expression, so the cell still displays the fitted estimator.
algo.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7d1f1f380750>

# 3. Testiranje

In [8]:
# Turn the held-out frame into a list of (user_id, artist_id, rating) tuples,
# the form passed to algo.test below.
testset = list(zip(*(
    test_cf[column].values
    for column in ("user_id", "artist_id", "rating")
)))

# Score every held-out interaction with the fitted model.
predictions = algo.test(testset)

### Precision@k, Recall@k and RMSE

In [9]:
def precision_at_k(predictions, k=10, threshold=3.0):
    """Mean Precision@k over all users that appear in ``predictions``.

    For each user the predictions are ranked by estimated rating (highest
    first) and the number of top-``k`` items whose true rating is at least
    ``threshold`` is divided by ``k``. The denominator is always ``k``, even
    for users with fewer than ``k`` predictions, so such users can never
    reach a precision of 1.0.

    Args:
        predictions: Iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        k: Length of the recommendation list; must be at least 1.
        threshold: Minimum true rating for an item to count as relevant.

    Returns:
        The average of the per-user precisions, or 0.0 when ``predictions``
        is empty (the same convention ``recall_at_k`` uses).

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        # k == 0 would divide by zero and a negative k would slice from the
        # wrong end of the ranking; reject both up front.
        raise ValueError(f"k must be a positive integer, got {k!r}")

    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = []
    for user_ratings in user_est_true.values():
        # Rank by estimated rating (descending) and keep the first k entries.
        top_k = sorted(user_ratings, key=lambda x: x[0], reverse=True)[:k]

        # Number of relevant items among the top k.
        n_rel = sum((true_r >= threshold) for (_, true_r) in top_k)

        precisions.append(n_rel / k)

    # np.mean([]) would yield nan plus a RuntimeWarning; report 0.0 instead.
    return np.mean(precisions) if precisions else 0.0

In [10]:
def recall_at_k(predictions, k=10, threshold=3.0):
    """Mean Recall@k: the share of a user's relevant items found in the top k.

    An item is relevant when its true rating is at least ``threshold``.
    Users with no relevant item are left out of the average. Returns 0.0
    when no user contributes a value.
    """
    ratings_by_user = defaultdict(list)
    for prediction in predictions:
        uid, _, true_r, est, _ = prediction
        ratings_by_user[uid].append((est, true_r))

    recalls = []
    for ranked in ratings_by_user.values():
        # Highest estimated rating first.
        ranked.sort(key=lambda pair: pair[0], reverse=True)

        # One relevance flag per prediction, in ranked order.
        relevance = [true_r >= threshold for _, true_r in ranked]
        total_relevant = sum(relevance)

        if total_relevant == 0:
            # Recall is undefined without relevant items; skip this user.
            continue

        hits_in_top_k = sum(relevance[:k])
        recalls.append(hits_in_top_k / total_relevant)

    if not recalls:
        return 0.0
    return np.mean(recalls)

In [11]:
# Updated evaluation function
def evaluate_model(predictions, k_values=[5, 10], threshold=3.0):
    """Collect RMSE plus Precision@k and Recall@k for every requested k.

    Returns a dict with the key 'RMSE' and, for each k in ``k_values``,
    the keys 'P@<k>' and 'R@<k>'.
    """
    # RMSE is computed once; verbose=False keeps Surprise from printing it.
    results = {'RMSE': accuracy.rmse(predictions, verbose=False)}

    for k in k_values:
        results[f'P@{k}'] = precision_at_k(predictions, k=k, threshold=threshold)
        results[f'R@{k}'] = recall_at_k(predictions, k=k, threshold=threshold)

    return results

# Run the evaluation
print("\nEvaluation Metrics:")
eval_results = evaluate_model(predictions, k_values=[5, 10])

# Show the results, one metric group per line
print(
    f"RMSE: {eval_results['RMSE']:.4f}",
    f"Precision@5: {eval_results['P@5']:.4f} | Recall@5: {eval_results['R@5']:.4f} ",
    f"Precision@10: {eval_results['P@10']:.4f} | Recall@10: {eval_results['R@10']:.4f} ",
    sep="\n",
)


Evaluation Metrics:
RMSE: 0.4090
Precision@5: 0.2321 | Recall@5: 0.4221 
Precision@10: 0.2090 | Recall@10: 0.7135 
