In [None]:
# 1. Prvo obrišite sve postojeće instalacije
!pip uninstall numpy scikit-surprise -y

# 2. Instalirajte osnovne zavisnosti
!pip install "numpy>=1.26.0,<2.0.0" "scipy>=1.11.0" "joblib>=1.2.0"

# 3. Instalirajte surprise sa izbegavanjem konflikata
!pip install scikit-surprise --no-deps --ignore-installed

# 4. Restartujte runtime
from IPython.display import Javascript
Javascript('IPython.notebook.kernel.restart()')

# Nakon restarta, pokrenite:
import numpy as np
from surprise import Dataset, Reader, KNNWithMeans
print("Uspešna instalacija! Numpy verzija:", np.__version__)

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
[0mCollecting numpy<2.0.0,>=1.26.0
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.26.4


Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2469561 sha256=c6ce7322f058933852ac051228c83d01e65181fc4b30f279e3899bdc7099e6be
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4
Uspešna instalacija! Numpy verzija: 1.26.4


In [None]:

import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
from surprise import Dataset, Reader, KNNWithMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from tqdm import tqdm

# Hibridni sustav za preporuku glazbe (kombinira CF + CBF)

## 1. CF



In [16]:
# Ucitaj CF podatke
cf_df = pd.read_csv("user_artists.dat", sep="\t")
cf_df.columns = ["user_id", "artist_id", "plays"]
cf_df["log_plays"] = np.log1p(cf_df["plays"])
scaler = MinMaxScaler(feature_range=(1, 5))
cf_df["rating"] = scaler.fit_transform(cf_df[["log_plays"]])

# Filtriraj neaktivne korisnike
user_counts = cf_df["user_id"].value_counts()
cf_df = cf_df[cf_df["user_id"].isin(user_counts[user_counts >= 5].index)]

## 1.1 Train and test split

In [17]:
cf_train, cf_test = [], []
for user in cf_df["user_id"].unique():
    user_data = cf_df[cf_df["user_id"] == user]
    n_test = max(1, int(len(user_data) * 0.35))
    test = user_data.sample(n=n_test, random_state=42)
    train = user_data.drop(test.index)
    cf_train.append(train)
    cf_test.append(test)
cf_train = pd.concat(cf_train).reset_index(drop=True)
cf_test = pd.concat(cf_test).reset_index(drop=True)

## 1.2 Surprise CF model

In [18]:
# Surprise CF model
reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(cf_train[["user_id", "artist_id", "rating"]].astype(str), reader)
trainset = train_data.build_full_trainset()
cf_model = KNNWithMeans(k=50, sim_options={"name": "pearson", "user_based": False})
cf_model.fit(trainset)


Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x788e2e83ba90>

## 2. CBF

In [19]:
cb_df = pd.read_csv("user_taggedartists.dat", sep="\t")
cb_df.columns = ["user_id", "artist_id", "tag_id", "day", "month", "year"]
artist_tags = cb_df.groupby("artist_id")["tag_id"].apply(lambda x: " ".join(map(str, x))).reset_index()


## 2.1 TF-IDF

In [21]:

# TF-IDF i cosine similarity
tfidf = TfidfVectorizer(
    min_df=3,               # Manji prag za rijetke tagove
    max_df=0.85,            # Stroži filter za česte tagove
    ngram_range=(1, 3),     # Uključuje i trojke tagova
    stop_words="english"    # Uklanjanje stop reči
)
tag_matrix = tfidf.fit_transform(artist_tags["tag_id"])
tag_similarity = cosine_similarity(tag_matrix)

# Mape ID-eva
artist_id_to_idx = {aid: idx for idx, aid in enumerate(artist_tags["artist_id"])}
idx_to_artist_id = {idx: aid for aid, idx in artist_id_to_idx.items()}


## 3. Hybrid

In [22]:
def hybrid_score(user_id, artist_id, alpha=0.5):
    user_id_str = str(user_id)
    artist_id_str = str(artist_id)

    try:
        cf_pred = cf_model.predict(user_id_str, artist_id_str).est
    except:
        cf_pred = 0

    cb_score = 0
    listened = cf_train[cf_train["user_id"] == int(user_id)]["artist_id"].unique()
    listened_idxs = [artist_id_to_idx[aid] for aid in listened if aid in artist_id_to_idx]
    target_idx = artist_id_to_idx.get(int(artist_id))
    if target_idx is not None and listened_idxs:
        cb_score = tag_similarity[listened_idxs, target_idx].mean()

    return alpha * cf_pred + (1 - alpha) * cb_score


## 4. Evaluacija

In [23]:
def precision_at_k_hybrid(k=10, alpha=0.5, threshold=3):
    user_est_true = defaultdict(list)
    for row in cf_test.itertuples():
        pred_score = hybrid_score(row.user_id, row.artist_id, alpha)
        user_est_true[row.user_id].append((pred_score, row.rating))

    precisions = []
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum(true_r >= threshold for _, true_r in user_ratings[:k])
        precisions.append(n_rel / k)

    return np.mean(precisions)

def recall_at_k_hybrid(k=10, alpha=0.5, threshold=3):
    user_est_true = defaultdict(list)
    for row in cf_test.itertuples():
        pred_score = hybrid_score(row.user_id, row.artist_id, alpha)
        user_est_true[row.user_id].append((pred_score, row.rating))

    recalls = []
    for uid, user_ratings in user_est_true.items():
        n_rel_total = sum(true_r >= threshold for _, true_r in user_ratings)
        if n_rel_total == 0:
            continue
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel_at_k = sum(true_r >= threshold for _, true_r in user_ratings[:k])
        recalls.append(n_rel_at_k / n_rel_total)

    return np.mean(recalls)

## 5. Pokretanje evaluacije

In [24]:
for k in [5, 10]:
    p = precision_at_k_hybrid(k=k, alpha=0.3)
    r = recall_at_k_hybrid(k=k, alpha=0.3)
    print(f"\nHybrid model (alpha=0.3) — k={k}")
    print(f"Precision@{k}: {p:.4f}")
    print(f"Recall@{k}:    {r:.4f}")




Hybrid model (alpha=0.3) — k=5
Precision@5: 0.2566
Recall@5:    0.4856

Hybrid model (alpha=0.3) — k=10
Precision@10: 0.2177
Recall@10:    0.7648
