## Učitavanje potrebnih libraryja

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from scipy.sparse import csr_matrix

# 1. Učitavanje i priprema podataka

## Učitaj podatke

In [2]:
df = pd.read_csv("user_taggedartists.dat", sep="\t")
df.columns = ["user_id", "artist_id", "tag_id", "day", "month", "year"]

## Priprema tagova - grupiraj tagove po izvođaču

In [3]:
artist_tags = df.groupby('artist_id')['tag_id'].apply(lambda x: ' '.join(map(str, x))).reset_index()

## Filtriranje i transformacija

In [4]:
MIN_TAG_COUNT = 5  # Ukloni tagove s manje od 5 pojavljivanja
tag_counts = df['tag_id'].value_counts()
valid_tags = tag_counts[tag_counts >= MIN_TAG_COUNT].index
df = df[df['tag_id'].isin(valid_tags)]

MIN_ARTIST_TAGS = 3  # Ukloni izvođače s manje od 3 tagova
artist_tag_counts = df['artist_id'].value_counts()
valid_artists = artist_tag_counts[artist_tag_counts >= MIN_ARTIST_TAGS].index
df = df[df['artist_id'].isin(valid_artists)]

# 2. TF-IDF transformacija tagova

In [5]:
tfidf = TfidfVectorizer(
    min_df=5,               # Ignoriraj tagove s manje od 5 dokumenata
    max_df=0.90,            # Ignoriraj tagove prisutne u >85% izvođača
    ngram_range=(1, 2)      # Uključi i kombinacije od 2 tagova
)
tag_matrix = tfidf.fit_transform(artist_tags['tag_id'])

# Mape za pretvorbu ID-jeva
artist_id_to_idx = {aid: idx for idx, aid in enumerate(artist_tags['artist_id'])}
idx_to_artist_id = {idx: aid for aid, idx in artist_id_to_idx.items()}

# Matrica sličnosti temeljena na tagovima (spremamo kao rijetku matricu)
tag_similarity = cosine_similarity(tag_matrix)
tag_similarity = csr_matrix(tag_similarity)

# 3. Preporučivanje

## 3.1 Funkcija za traženje preporuke

In [6]:
def get_tag_based_recommendations(user_id, train_df, top_n=10):
    # Dohvati sve izvođače koje je korisnik slušao
    listened_artists = train_df[train_df["user_id"] == user_id]["artist_id"].unique()
    listened_indices = [artist_id_to_idx[aid] for aid in listened_artists if aid in artist_id_to_idx]

    if not listened_indices:
        return []

    # Pretvori u numpy array zbog brzine
    listened_indices = np.array(listened_indices)

    # Izračunaj prosječnu sličnost sa svim izvođačima
    sim_scores = tag_similarity[listened_indices].mean(axis=0).A1  # .A1 pretvara u 1D numpy array

    # Postavi sličnost za već slušane izvođače na -inf
    sim_scores[listened_indices] = -np.inf

    # Dohvati top N preporuka
    top_indices = np.argpartition(sim_scores, -top_n)[-top_n:]
    top_indices = top_indices[np.argsort(sim_scores[top_indices])[::-1]]

    return [(idx_to_artist_id[idx], sim_scores[idx]) for idx in top_indices]

## 3.2 Priprema podataka za evaluaciju

In [7]:
df_unique = df[["user_id", "artist_id"]].drop_duplicates()
user_counts = df_unique["user_id"].value_counts()
df_unique = df_unique[df_unique["user_id"].isin(user_counts[user_counts >= 10].index)]

## 3.3 Odvajanje skupa za treniranje i testiranje

In [8]:
def per_user_split_cb(df, test_frac=0.35):
    train, test = [], []
    for user_id in df["user_id"].unique():
        user_data = df[df["user_id"] == user_id]
        n_test = max(1, int(len(user_data) * test_frac))
        test_data = user_data.sample(n=n_test, random_state=42)
        train_data = user_data.drop(test_data.index)
        train.append(train_data)
        test.append(test_data)
    return pd.concat(train).reset_index(drop=True), pd.concat(test).reset_index(drop=True)

train_cb, test_cb = per_user_split_cb(df_unique)


## 3.4 Funkcija za računanje precizosti za k

In [9]:
def precision_at_k_cb(train_df, test_df, k=10):
    precisions = []
    user_ids = test_df["user_id"].unique()
    for user_id in tqdm(user_ids, desc="Evaluating Tag-Based Recommendations"):
        true_artists = set(test_df[test_df["user_id"] == user_id]["artist_id"].astype(int))
        recs = get_tag_based_recommendations(user_id, train_df, top_n=k)
        recommended_artists = [aid for aid, _ in recs]

        if not recommended_artists:
            continue

        n_rel = sum(artist in true_artists for artist in recommended_artists)
        precision = n_rel / k
        precisions.append(precision)

    return np.mean(precisions)

## 3.5 Recalll

In [11]:
def recall_at_k_cb(train_df, test_df, k=10):
    recalls = []
    user_ids = test_df["user_id"].unique()

    for user_id in tqdm(user_ids, desc="Evaluating Recall"):
        true_artists = set(test_df[test_df["user_id"] == user_id]["artist_id"].astype(int))
        if not true_artists:
            continue

        recs = get_tag_based_recommendations(user_id, train_df, top_n=k)
        recommended_artists = [aid for aid, _ in recs]

        n_rel = sum(artist in true_artists for artist in recommended_artists)
        recall = n_rel / len(true_artists)
        recalls.append(recall)

    return np.mean(recalls)

## 4. Evoluacija

In [12]:
def evaluate_model(train_df, test_df, k_values=[5, 10]):
    results = {}

    for k in k_values:
        print(f"\nEvaluating for k={k}...")
        precision = precision_at_k_cb(train_df, test_df, k=k)
        recall = recall_at_k_cb(train_df, test_df, k=k)

        results[f'P@{k}'] = precision
        results[f'R@{k}'] = recall

        print(f"Precision@{k}: {precision:.4f}")
        print(f"Recall@{k}: {recall:.4f}")

    return results

# Pokreni evaluaciju
results = evaluate_model(train_cb, test_cb)


Evaluating for k=5...


Evaluating Tag-Based Recommendations: 100%|██████████| 798/798 [00:03<00:00, 209.44it/s]
Evaluating Recall: 100%|██████████| 798/798 [00:04<00:00, 183.59it/s]


Precision@5: 0.2376
Recall@5: 0.0772

Evaluating for k=10...


Evaluating Tag-Based Recommendations: 100%|██████████| 798/798 [00:05<00:00, 147.21it/s]
Evaluating Recall: 100%|██████████| 798/798 [00:03<00:00, 241.32it/s]

Precision@10: 0.1914
Recall@10: 0.1168



