In [24]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Load every input table from the current working directory.
# All files are read with the same delimiter/quote settings; books.csv is the
# only one read with on_bad_lines='skip', so rows pandas cannot parse there
# are dropped instead of raising.
print("Начинаю загрузку данных...")
csv_dialect = {'sep': ',', 'quotechar': '"'}
train = pd.read_csv("train.csv", **csv_dialect)
test = pd.read_csv("test.csv", **csv_dialect)
books = pd.read_csv("books.csv", on_bad_lines='skip', **csv_dialect)
users = pd.read_csv("users.csv", **csv_dialect)
genres = pd.read_csv("genres.csv", **csv_dialect)
book_genres = pd.read_csv("book_genres.csv", **csv_dialect)
print("Данные загружены")

print("Обрабатываю данные...")
# Keep only the interactions flagged has_read == 1; every statistic and the
# training target below are built from these rows alone.
train = train.loc[train['has_read'].eq(1)].copy()

# NOTE(review): the aggregates below are computed over all remaining train
# rows and later merged back onto those same rows, so each row's own rating
# contributes to its own features. Cross-validation scores computed on this
# frame are therefore optimistic; out-of-fold statistics would avoid that.
print("Создаю признаки пользователей...")
user_stats = (
    train.groupby('user_id')
    .agg(
        user_rating_mean=('rating', 'mean'),
        user_rating_std=('rating', 'std'),
        user_rating_count=('rating', 'count'),
        user_rating_min=('rating', 'min'),
        user_rating_max=('rating', 'max'),
        user_unique_books=('book_id', 'nunique'),
    )
    .reset_index()
)
# std is NaN for a user with a single rating; treat that as zero spread.
user_stats['user_rating_std'] = user_stats['user_rating_std'].fillna(0)

print("Создаю признаки книг...")
book_stats = (
    train.groupby('book_id')
    .agg(
        book_rating_mean=('rating', 'mean'),
        book_rating_std=('rating', 'std'),
        book_rating_count=('rating', 'count'),
        book_rating_min=('rating', 'min'),
        book_rating_max=('rating', 'max'),
        book_unique_users=('user_id', 'nunique'),
    )
    .reset_index()
)
# Same convention as for users: a single rating means zero spread.
book_stats['book_rating_std'] = book_stats['book_rating_std'].fillna(0)

print("Объединяю данные...")
# Attach book metadata to every training row. Rows whose book_id is absent
# from `books` keep NaN in the added columns (left join).
# NOTE(review): presumably this is where `author_id` comes from - confirm
# against the books.csv schema.
train = pd.merge(train, books, on='book_id', how='left')

print("Создаю признаки авторов...")
# Per-author rating statistics over the training interactions.
author_stats = (
    train.groupby('author_id')
    .agg(
        author_rating_mean=('rating', 'mean'),
        author_rating_std=('rating', 'std'),
        author_rating_count=('rating', 'count'),
    )
    .reset_index()
)
# std is NaN for an author with a single rating; treat that as zero spread.
author_stats['author_rating_std'] = author_stats['author_rating_std'].fillna(0)

# Placeholder for per-user reading-time statistics. The frame has no rows, so
# the left joins below only add its three non-key columns filled with NaN.
user_time_stats = pd.DataFrame(
    columns=['user_id', 'user_first_read', 'user_last_read', 'user_reading_span']
)

# `train` already carries the book metadata, so `books` is not joined again.
train_joins = [
    (user_stats, 'user_id'),
    (book_stats, 'book_id'),
    (author_stats, 'author_id'),
    (user_time_stats, 'user_id'),
    (users, 'user_id'),
]
for right_frame, join_key in train_joins:
    train = train.merge(right_frame, on=join_key, how='left')

# For `test`, `books` is joined before `author_stats`, which is keyed on
# author_id.
test_joins = [
    (user_stats, 'user_id'),
    (book_stats, 'book_id'),
    (books, 'book_id'),
    (author_stats, 'author_id'),
    (user_time_stats, 'user_id'),
    (users, 'user_id'),
]
for right_frame, join_key in test_joins:
    test = test.merge(right_frame, on=join_key, how='left')

# Number of rows each book has in book_genres, joined onto both frames.
book_genres_count = (
    book_genres.groupby('book_id')
    .size()
    .reset_index(name='book_genres_count')
)
train = train.merge(book_genres_count, on='book_id', how='left')
test = test.merge(book_genres_count, on='book_id', how='left')

# Reference year used to turn publication_year into an age.
current_year = 2025

# The same derived columns are added to both frames, in the same order.
for frame in (train, test):
    # Constant placeholders: both columns are always 0.
    frame['days_since_first_read'] = 0
    frame['days_since_last_read'] = 0

    # Products of the user's mean rating with the book / author mean rating.
    frame['user_book_intersection'] = frame['user_rating_mean'] * frame['book_rating_mean']
    frame['user_author_intersection'] = frame['user_rating_mean'] * frame['author_rating_mean']

    frame['book_age'] = current_year - frame['publication_year']

# Columns fed to the model, in the order the model will see them.
feature_cols = [
    'user_rating_mean', 'user_rating_std', 'user_rating_count', 'user_rating_min', 'user_rating_max', 'user_unique_books',
    'book_rating_mean', 'book_rating_std', 'book_rating_count', 'book_rating_min', 'book_rating_max', 'book_unique_users',
    'author_rating_mean', 'author_rating_std', 'author_rating_count',
    'user_reading_span', 'days_since_first_read', 'days_since_last_read',
    'user_book_intersection', 'user_author_intersection',
    'book_age', 'avg_rating', 'language', 'publisher',
    'gender', 'age', 'book_genres_count'
]

# Categorical features are handled separately from numeric ones: they must
# never go through median imputation.
cat_features = ['language', 'publisher', 'gender']
numeric_cols = [col for col in feature_cols if col not in cat_features]


def _as_category_labels(values):
    """Return ``values`` as string labels usable as a categorical feature.

    Missing entries become the label 'missing'. Whole-number floats are
    written without a trailing '.0', so a code gets the same label whether
    its column was read as integers or upcast to float because of NaNs.
    """
    def _label(value):
        if pd.isna(value):
            return 'missing'
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    return values.map(_label)


X = train[feature_cols].copy()
y = train['rating'].copy()
X_test = test[feature_cols].copy()

# Missing spread / genre count means "nothing observed", so use 0.
for col in ['user_rating_std', 'book_rating_std', 'author_rating_std', 'book_genres_count']:
    X[col] = X[col].fillna(0)
    X_test[col] = X_test[col].fillna(0)

# Impute remaining numeric gaps with the TRAIN median. The check covers both
# frames: users/books that appear only in `test` have no training statistics,
# so a column can be complete in X and still contain NaN in X_test.
for col in numeric_cols:
    if not (X[col].isnull().any() or X_test[col].isnull().any()):
        continue
    train_median = X[col].median()
    if pd.isna(train_median):
        # Column has no observed values at all (e.g. the placeholder
        # user_reading_span); there is nothing sensible to fill with.
        continue
    X[col] = X[col].fillna(train_median)
    X_test[col] = X_test[col].fillna(train_median)

# CatBoost accepts only integer or string values for categorical features,
# so convert them to strings with an explicit label for missing values.
for col in cat_features:
    X[col] = _as_category_labels(X[col])
    X_test[col] = _as_category_labels(X_test[col])

# Positional indices of the categorical columns, as passed to model.fit().
cat_idx = [X.columns.get_loc(col) for col in cat_features]

print("Начинаю обучение моделей...")
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mse_scores, test_predictions = [], []

# Hyper-parameters shared by the model trained on every fold.
catboost_params = {
    'iterations': 1000,
    'depth': 8,
    'learning_rate': 0.05,
    'loss_function': 'RMSE',
    'random_state': 42,
    'verbose': False,
    'early_stopping_rounds': 50,
}

for i, (train_index, val_index) in enumerate(kf.split(X)):
    print(f"Обучение модели {i+1}/5...")
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    model = CatBoostRegressor(**catboost_params)

    # The validation fold is also the eval_set that drives early stopping,
    # so the MSE reported below is measured on the data used to pick the
    # stopping iteration.
    model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        cat_features=cat_idx,
        verbose=False,
    )

    # Score this fold.
    val_preds = model.predict(X_val)
    mse = mean_squared_error(y_val, val_preds)
    mse_scores.append(mse)

    # Keep this fold's test predictions for averaging after the loop.
    test_pred = model.predict(X_test)
    test_predictions.append(test_pred)
    print(f"Модель {i+1} обучена, MSE: {mse:.4f}")

print("Создаю предсказания...")
# Average the per-fold test predictions, then clamp them to [0, 10].
# NOTE(review): the [0, 10] bounds presumably match the rating scale - confirm.
final_test_predictions = np.asarray(test_predictions).mean(axis=0).clip(0, 10)

# One row per test pair, keyed by the ids taken from `test`.
submission = test[['user_id', 'book_id']].assign(rating_predict=final_test_predictions)

submission.to_csv('submission.csv', index=False)
print("Файл submission.csv создан")

# Cross-validation summary: mean of the per-fold RMSE values.
rmse_score = np.sqrt(mse_scores).mean()
print(f"RMSE: {rmse_score}")

Начинаю загрузку данных...
Данные загружены
Обрабатываю данные...
Создаю признаки пользователей...
Создаю признаки книг...
Объединяю данные...
Создаю признаки авторов...
Начинаю обучение моделей...
Обучение модели 1/5...
Модель 1 обучена, MSE: 4.2240
Обучение модели 2/5...
Модель 2 обучена, MSE: 4.2091
Обучение модели 3/5...
Модель 3 обучена, MSE: 4.1528
Обучение модели 4/5...
Модель 4 обучена, MSE: 4.2306
Обучение модели 5/5...
Модель 5 обучена, MSE: 4.0776
Создаю предсказания...
Файл submission.csv создан
RMSE: 2.044161851308357


In [25]:
import os

# Debug helper: show where the kernel is running and which files it sees.
# NOTE(review): the printed listing exposes the contents of the working
# directory; clear this cell's output before sharing the notebook.
working_dir = os.getcwd()
visible_entries = os.listdir('.')
print("Текущая папка:", working_dir)
print("Файлы здесь:", visible_entries)

Текущая папка: C:\Users\Ivan
Файлы здесь: ['-1.14-windows.xml', '.anaconda', '.cache', '.conda', '.condarc', '.continuum', '.insomniac', '.ipynb_checkpoints', '.ipython', '.jupyter', '.VirtualBox', '3D Objects', 'anaconda3', 'anaconda_projects', 'ansel', 'AppData', 'Application Data', 'baseline (1).ipynb', 'books.csv', 'book_descriptions.csv', 'book_genres.csv', 'catboost_info', 'Contacts', 'Cookies', 'Desktop', 'Doctor Web', 'Documents', 'Downloads', 'Favorites', 'genres.csv', 'Links', 'Local Settings', 'Music', 'NetHood', 'NTUSER.DAT', 'ntuser.dat.LOG1', 'ntuser.dat.LOG2', 'NTUSER.DAT{53b39e88-18c4-11ea-a811-000d3aa4692b}.TM.blf', 'NTUSER.DAT{53b39e88-18c4-11ea-a811-000d3aa4692b}.TMContainer00000000000000000001.regtrans-ms', 'NTUSER.DAT{53b39e88-18c4-11ea-a811-000d3aa4692b}.TMContainer00000000000000000002.regtrans-ms', 'ntuser.ini', 'OneDrive', 'Oracle', 'Pictures', 'PrintHood', 'Recent', 'sample_submission.csv', 'Saved Games', 'Searches', 'SendTo', 'setup.dat', 'submission.csv', 'te