In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize

In [2]:
user_artists = pd.read_csv('/kaggle/input/version-1/user_artists.dat', sep='\t')
artists = pd.read_csv('/kaggle/input/version-1/artists.dat', sep='\t', usecols=['id', 'name'])

user_artists.columns = ['user_id', 'artist_id', 'weight']
artists.columns = ['artist_id', 'artist_name']

data = pd.merge(user_artists, artists, on='artist_id')
print(data.head())

# Clean dataset
data.isnull().sum()

   user_id  artist_id  weight    artist_name
0        2         51   13883    Duran Duran
1        2         52   11690      Morcheeba
2        2         53   11351            Air
3        2         54   10300   Hooverphonic
4        2         55    8983  Kylie Minogue


user_id        0
artist_id      0
weight         0
artist_name    0
dtype: int64

In [3]:
user_counts = data['user_id'].value_counts()
artist_counts = data['artist_id'].value_counts()

active_users = user_counts[user_counts > 20].index
popular_artists = artist_counts[artist_counts > 20].index

filtered_data = data[
    (data['user_id'].isin(active_users)) &
    (data['artist_id'].isin(popular_artists))
]

In [4]:
user_item_matrix = filtered_data.pivot_table(
    index='user_id',
    columns='artist_id',
    values='weight',
    fill_value=0
)
user_item_matrix.head()

artist_id,7,9,15,25,30,45,51,52,53,55,...,5150,5416,5926,5988,6120,6217,6453,7324,7340,8589
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,0.0,13883.0,11690.0,11351.0,8983.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,228.0,0.0,686.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,181.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
non_zero = user_item_matrix.astype(bool).sum().sum()
total = user_item_matrix.shape[0] * user_item_matrix.shape[1]

sparsity = 1 - (non_zero / total)
print(f"Sparsity: {sparsity:.4f} ({sparsity * 100:.2f}%)")

Sparsity: 0.9630 (96.30%)


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

In [7]:
def get_user_recommendations(target_user_id, n_recommendations=10):
    similar_users = user_similarity_df[target_user_id].sort_values(ascending=False)[1:11]

    weighted_scores = user_item_matrix.loc[similar_users.index].T.dot(similar_users)

    user_listened = user_item_matrix.loc[target_user_id]

    listened_artist_ids = user_listened[user_listened > 0].index

    weighted_scores = weighted_scores.drop(listened_artist_ids, errors='ignore')

    top_artists = weighted_scores.sort_values(ascending=False).head(n_recommendations)

    top_artist_ids = top_artists.index
    recommendations = artists[artists['artist_id'].isin(top_artist_ids)]

    return recommendations

In [8]:
get_user_recommendations(target_user_id=8)

Unnamed: 0,artist_id,artist_name
61,67,Madonna
288,294,Leona Lewis
293,299,Jennifer Lopez
319,325,Ashley Tisdale
327,333,Avril Lavigne
455,461,Miley Cyrus
460,466,Ke$ha
517,523,Lindsay Lohan
695,701,Shakira
2528,2548,Wanessa


In [9]:
from sklearn.model_selection import train_test_split
import numpy as np

interactions = filtered_data[['user_id', 'artist_id', 'weight']]

train, test = train_test_split(interactions, test_size=0.2, random_state=6)

train_matrix = train.pivot(index='user_id', columns='artist_id', values='weight').fillna(0)
test_matrix = test.pivot(index='user_id', columns='artist_id', values='weight').fillna(0)


In [10]:
from sklearn.metrics import mean_squared_error

user_similarity_train = cosine_similarity(train_matrix)
sim_df = pd.DataFrame(user_similarity_train, index=train_matrix.index, columns=train_matrix.index)

def predict(user_id):
    sim_users = sim_df[user_id].drop(user_id)
    sim_users = sim_users[sim_users > 0]
    if sim_users.empty:
        return np.zeros(train_matrix.shape[1])
    weighted_sum = train_matrix.loc[sim_users.index].T.dot(sim_users)
    norm = sim_users.sum()
    return weighted_sum / norm

user_id = train_matrix.index[0]
predicted = predict(user_id)
actual = test_matrix.loc[user_id]

common = actual[actual > 0].index
rmse = np.sqrt(mean_squared_error(actual[common], predicted[common]))
print("RMSE:", rmse)


RMSE: 985.5682734159731


In [11]:
for user_id in [46, 52, 51]:
    print(f"Рекомендации для пользователя {user_id}:")
    print(get_user_recommendations(user_id))
    print("------")

Рекомендации для пользователя 46:
      artist_id     artist_name
61           67         Madonna
251         257    Mariah Carey
287         293  Ashlee Simpson
288         294     Leona Lewis
304         310   Nelly Furtado
325         331      Kanye West
541         547        Kid Cudi
2077       2092            소녀시대
2079       2094             BoA
2970       2990       Shontelle
------
Рекомендации для пользователя 52:
      artist_id           artist_name
464         470      nevershoutnever!
677         683            John Mayer
681         687          All Time Low
770         779  Bring Me The Horizon
814         823      August Burns Red
1026       1035      Breathe Carolina
1091       1100               Chiodos
1171       1180       Escape The Fate
1844       1853          blessthefall
2599       2619           VersaEmerge
------
Рекомендации для пользователя 51:
      artist_id          artist_name
704         710  Black Label Society
834         843              Pantera
845