In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix, lil_matrix
import warnings
warnings.filterwarnings('ignore')


In [7]:
df = pd.read_csv('sample_data/kz.csv')

In [9]:
# if 'event_type' in df.columns:
#     df = df[df['event_type'] == 'purchase']
df = df.rename(columns={'user_id': 'userId', 'product_id': 'movieId', 'event_time': 'timestamp'})
df = df.drop_duplicates(subset=['order_id', 'movieId'])
df = df.dropna(subset=['userId', 'movieId'])
df['timestamp'] = pd.to_datetime(df['timestamp'])


<br1>Наивная рекомендация</br1>

In [10]:
# Определяем k на основе медианного числа товаров повторного клиента
repeat_customers = df.groupby('userId').size()
k = int(repeat_customers.median())
k = max(3, min(k, 10))  # Ограничиваем k от 3 до 10
print(f'Selected k: {k}')

# Разделяем данные по дате (2020-09-01)
train_data = df[df['timestamp'] < '2020-09-01']
test_data = df[df['timestamp'] >= '2020-09-01']

def naive_popularity_based(train_data, k):
    popular_items = train_data['movieId'].value_counts().head(k).index.tolist()
    return popular_items

# Наивный алгоритм 2: Любимый товар клиента + (k-1) самых популярных
def naive_hybrid_based(train_data, k):
    # Самые популярные товары
    popular_items = train_data['movieId'].value_counts().head(k).index.tolist()

    # Любимый товар каждого клиента (самый частый)
    user_favorites = train_data.groupby('userId')['movieId'].apply(
        lambda x: x.value_counts().index[0] if len(x.value_counts()) > 0 else popular_items[0]
    ).to_dict()

    recommendations = {}
    for user in set(train_data['userId'].unique()) | set(test_data['userId'].unique()):
        if user in user_favorites:
            favorite = user_favorites[user]
            # Убедимся, что favorite не входит в popular_items
            user_rec = [favorite] + [item for item in popular_items if item != favorite][:k-1]
        else:
            # Если пользователя нет в тренировочных данных, используем популярные товары
            user_rec = popular_items[:k]
        recommendations[user] = user_rec[:k]

    return recommendations

Selected k: 3


In [11]:
def evaluate_recommendations(recommendations, test_data, k):
    precisions = []
    recalls = []
    f1_scores = []

    # Группируем тестовые данные по пользователям
    test_user_items = test_data.groupby('userId')['movieId'].apply(set).to_dict()

    for user, actual_items in test_user_items.items():
        if user in recommendations:
            recommended_items = set(recommendations[user][:k])
        else:
            # Если пользователя нет в рекомендациях, пропускаем
            continue

        if len(recommended_items) == 0:
            precision = recall = f1 = 0
        else:
            # Вычисляем precision и recall
            true_positives = len(recommended_items & actual_items)
            precision = true_positives / len(recommended_items)
            recall = true_positives / len(actual_items) if len(actual_items) > 0 else 0

            # Вычисляем F1-score
            if precision + recall > 0:
                f1 = 2 * (precision * recall) / (precision + recall)
            else:
                f1 = 0

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

    # Усредняем метрики по всем пользователям
    avg_precision = np.mean(precisions) if precisions else 0
    avg_recall = np.mean(recalls) if recalls else 0
    avg_f1 = np.mean(f1_scores) if f1_scores else 0

    return avg_precision, avg_recall, avg_f1




In [12]:
popular_items = naive_popularity_based(train_data, k)
popular_recommendations = {user: popular_items for user in test_data['userId'].unique()}
hybrid_recommendations = naive_hybrid_based(train_data, k)
# hybrid_recommendations = naive_hybrid_based(train_data, k)
# random_recommendations = naive_random_based(train_data, k)

precision_pop, recall_pop, f1_pop  = evaluate_recommendations(popular_recommendations, test_data, 3)
# f1_hybrid = evaluate_naive_method(hybrid_recommendations, test_data)
# f1_random = evaluate_naive_method(random_recommendations, test_data)
precision_hyb, recall_hyb, f1_hyb = evaluate_recommendations(hybrid_recommendations, test_data, k)
print(f'Popularity Based - Precision: {precision_pop:.4f}, Recall: {recall_pop:.4f}, F1: {f1_pop:.4f}')
print(f'Hybrid Based - Precision: {precision_hyb:.4f}, Recall: {recall_hyb:.4f}, F1: {f1_hyb:.4f}')
# print(f'F1-Score Hybrid Based: {f1_hybrid:.4f}')
# print(f'F1-Score Random Based: {f1_random:.4f}')

Popularity Based - Precision: 0.0066, Recall: 0.0068, F1: 0.0052
Hybrid Based - Precision: 0.0114, Recall: 0.0155, F1: 0.0107


<br1>Продвинутые методы</br1>

In [13]:
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix, lil_matrix

In [14]:
def create_sparse_matrix(data, user_col='userId', item_col='movieId'):

    # Создаем mapping пользователей и товаров
    users = sorted(data[user_col].unique())
    items = sorted(data[item_col].unique())

    user_to_idx = {user: idx for idx, user in enumerate(users)}
    item_to_idx = {item: idx for idx, item in enumerate(items)}

    # Создаем разреженную матрицу в формате LIL (более эффективно для построения)
    matrix = lil_matrix((len(users), len(items)), dtype=np.float32)

    for _, row in data.iterrows():
        user_idx = user_to_idx[row[user_col]]
        item_idx = item_to_idx[row[item_col]]
        matrix[user_idx, item_idx] = 1  # Бинарные данные

    return matrix.tocsr(), users, items, user_to_idx, item_to_idx

In [15]:
train_sparse, train_users, train_items, train_user_map, train_item_map = create_sparse_matrix(train_data)
test_sparse, test_users, test_items, test_user_map, test_item_map = create_sparse_matrix(test_data)

print(f"Разреженная матрица обучения: {train_sparse.shape}, заполненность: {train_sparse.nnz / (train_sparse.shape[0] * train_sparse.shape[1]):.4f}%")
print(f"Разреженная матрица теста: {test_sparse.shape}, заполненность: {test_sparse.nnz / (test_sparse.shape[0] * test_sparse.shape[1]):.4f}%")

Разреженная матрица обучения: (80201, 16636), заполненность: 0.0002%
Разреженная матрица теста: (34942, 14976), заполненность: 0.0004%


In [None]:
def knn_recommendations(sparse_matrix, k_neighbors=50, k_recommendations=10):
    """Рекомендации на основе k ближайших соседей"""
    # Используем k-NN для нахождения похожих пользователей
    knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=k_neighbors)
    knn.fit(sparse_matrix)

    # Получаем расстояния и индексы соседей
    distances, indices = knn.kneighbors(sparse_matrix)

    # Преобразуем расстояния в веса (близость)
    weights = 1 - distances

    recommendations = []
    for i in range(sparse_matrix.shape[0]):
        # Взвешенная сумма оценок соседей
        neighbor_weights = weights[i]
        neighbor_indices = indices[i]

        # Суммируем покупки соседей с весами
        weighted_sum = sparse_matrix[neighbor_indices].multiply(neighbor_weights[:, np.newaxis]).sum(axis=0)

        # Исключаем товары, которые пользователь уже покупал
        user_purchases = sparse_matrix[i].nonzero()[1]
        if len(user_purchases) > 0:
            weighted_sum[0, user_purchases] = 0

        # Получаем топ-k рекомендаций
        if weighted_sum.sum() > 0:
            top_items = np.argsort(weighted_sum.A1)[-k_recommendations:][::-1]
        else:
            # Если нет рекомендаций, используем популярные товары
            item_popularity = sparse_matrix.sum(axis=0).A1
            top_items = np.argsort(item_popularity)[-k_recommendations:][::-1]

        recommendations.append(top_items)

    return recommendations

In [16]:
def simple_collaborative_filtering(train_sparse, k_recommendations=10):
        """Упрощенная коллаборативная фильтрация на основе общих покупок"""
        # Вычисляем similarity как количество общих покупок
        similarity = train_sparse.dot(train_sparse.T)

        recommendations = []
        for i in range(train_sparse.shape[0]):
            # Находим наиболее похожих пользователей
            user_similarities = similarity[i].toarray().flatten()

            # Исключаем самого пользователя
            user_similarities[i] = 0

            if user_similarities.sum() > 0:
                # Взвешенная сумма покупок похожих пользователей
                similar_users = user_similarities.argsort()[-50:][::-1]  # Топ-50 похожих
                weights = user_similarities[similar_users]

                # ИСПРАВЛЕНИЕ: убираем .A1, так как результат уже массив
                # Используем toarray().flatten() для преобразования разреженной матрицы в массив
                weighted_sum = train_sparse[similar_users].T.dot(weights).flatten()

                # Исключаем уже купленные товары
                user_purchases = train_sparse[i].nonzero()[1]
                if len(user_purchases) > 0:
                    weighted_sum[user_purchases] = 0

                # Получаем рекомендации
                if weighted_sum.sum() > 0:
                    top_items = np.argsort(weighted_sum)[-k_recommendations:][::-1]
                else:
                    item_popularity = train_sparse.sum(axis=0).flatten()
                    top_items = np.argsort(item_popularity)[-k_recommendations:][::-1]
            else:
                # Если нет похожих пользователей, используем популярные товары
                item_popularity = train_sparse.sum(axis=0).flatten()
                top_items = np.argsort(item_popularity)[-k_recommendations:][::-1]

            recommendations.append(top_items)

        return recommendations

In [17]:
def evaluate_advanced_method(recommendations, test_sparse, item_map, k):

    precisions = []
    recalls = []
    f1_scores = []

    for i in range(test_sparse.shape[0]):
        # Реальные покупки пользователя в тесте
        actual_items = set(test_sparse[i].nonzero()[1])

        if i < len(recommendations):
            recommended_items = set(recommendations[i][:k])
        else:
            recommended_items = set()

        if len(recommended_items) == 0:
            precision = recall = f1 = 0
        else:
            true_positives = len(recommended_items & actual_items)
            precision = true_positives / len(recommended_items)
            recall = true_positives / len(actual_items) if len(actual_items) > 0 else 0

            if precision + recall > 0:
                f1 = 2 * (precision * recall) / (precision + recall)
            else:
                f1 = 0

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

    avg_precision = np.mean(precisions) if precisions else 0
    avg_recall = np.mean(recalls) if recalls else 0
    avg_f1 = np.mean(f1_scores) if f1_scores else 0

    return avg_precision, avg_recall, avg_f1

In [None]:
knn_recs = knn_recommendations(train_sparse, k_recommendations=k)
precision_knn, recall_knn, f1_knn = evaluate_advanced_method(knn_recs, test_sparse, train_item_map, k)
print(f" f1 - k-NN: {f1_knn:.4f}")
print(f" prec - k-NN: {precision_knn:.4f}")


NameError: name 'knn_recommendations' is not defined

In [None]:
cf_recs = simple_collaborative_filtering(train_sparse, k_recommendations=k)
precision_cf, recall_cf, f1_cf = evaluate_advanced_method(cf_recs, test_sparse, train_item_map, k)
print(f" f1 - CF: {f1_cf:.4f}")
print(f" prec - CF: {precision_cf:.4f}")

In [14]:
!pip install "numpy<2"

Collecting numpy<2
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have

In [1]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp312-cp312-linux_x86_64.whl size=2611313 sha256=932f16851c155922ce5b9bd8dcb8e26878c2ad66a31ed748f9e3eb8223689178
  Stored in directory: /root/.cache

In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score
import warnings
warnings.filterwarnings('ignore')

# Установим необходимые библиотеки
try:
    import surprise
except ImportError:
    !pip install scikit-surprise
    import surprise

from surprise import Dataset, Reader, SVD, KNNBasic, accuracy
from surprise.model_selection import train_test_split

# Явно импортируем numpy и инициализируем для C-расширений


In [18]:
train_surprise_df = train_data[['userId', 'movieId']].copy()
train_surprise_df['rating'] = 1

test_surprise_df = test_data[['userId', 'movieId']].copy()
test_surprise_df['rating'] = 1

print(f"Surprise train size: {len(train_surprise_df)}")
print(f"Surprise test size: {len(test_surprise_df)}")


Surprise train size: 321824
Surprise test size: 241671


In [19]:
reader = Reader(rating_scale=(1, 1))  # у нас все рейтинги 1

# Создаем dataset из train_surprise_df
data = Dataset.load_from_df(train_surprise_df[['userId', 'movieId', 'rating']], reader)

# Разделяем на train и test внутри surprise
trainset = data.build_full_trainset()

# Обучаем модель SVD (как в примере с Kaggle)
print("Обучение модели SVD...")
algo_svd = SVD()
algo_svd.fit(trainset)

Обучение модели SVD...


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fa8e9a6e720>

In [21]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

In [23]:
def get_surprise_recommendations(algo, test_users, trainset, all_items, k=10):
    """Получает рекомендации с помощью алгоритма surprise"""
    recommendations = {}

    for user in test_users:
        # Получаем товары, которые пользователь уже покупал
        try:
            # Внутренний ID пользователя в trainset
            user_inner_id = trainset.to_inner_uid(user)
            # Товары, которые пользователь уже оценил (покупал)
            user_items = set([trainset.to_raw_iid(iid) for iid in trainset.ur[user_inner_id]])
        except ValueError:
            # Если пользователя нет в trainset
            user_items = set()

        # Предсказываем рейтинг для всех товаров, которые пользователь не покупал
        user_predictions = []
        for item in all_items:
            if item in user_items:
                continue  # Пропускаем уже купленные товары

            try:
                pred = algo.predict(user, item)
                user_predictions.append((item, pred.est))
            except Exception:
                continue

        # Сортируем по предсказанному рейтингу и берем топ-k
        user_predictions.sort(key=lambda x: x[1], reverse=True)
        top_k_items = [item for item, _ in user_predictions[:k]]
        recommendations[user] = top_k_items

    return recommendations

In [None]:
# Получаем список всех товаров
all_items = set(train_surprise_df['movieId'].unique()) | set(test_surprise_df['movieId'].unique())
test_users = test_surprise_df['userId'].unique()

print(f"Всего товаров: {len(all_items)}, пользователей в тесте: {len(test_users)}")

# Получаем рекомендации с помощью SVD
print("Получение рекомендаций SVD...")
svd_recommendations = get_surprise_recommendations(algo_svd, test_users, trainset, all_items, k)



# Оцениваем рекомендации для surprise
precision_svd, recall_svd, f1_svd = evaluate_recommendations(svd_recommendations, test_data, k)


print(f'Surprise SVD - Precision: {precision_svd:.4f}, Recall: {recall_svd:.4f}, F1: {f1_svd:.4f}')


# Альтернативный подход: использование тестового набора для оценки RMSE
# Создаем тестовый набор для surprise
testset = [(uid, iid, 1) for (uid, iid) in zip(test_surprise_df['userId'], test_surprise_df['movieId'])]

# Предсказания для тестового набора
svd_predictions = algo_svd.test(testset)


# Оценка RMSE
svd_rmse = accuracy.rmse(svd_predictions, verbose=False)


print(f'Surprise SVD RMSE: {svd_rmse:.4f}')


Всего товаров: 20964, пользователей в тесте: 34942
Получение рекомендаций SVD...
