In [51]:
# Import necessary libraries
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool

# Load data
events = pd.read_csv("events.csv", index_col=0)     # merge of additional rates with original ones
user_features = pd.read_csv('user_features_our.csv')      # User features with 'user_id' column
item_features = pd.read_csv('item_features_our.csv')      # Item features with 'item_id' column
user_item_scores = pd.read_csv('data_new.csv')  # Interaction data with 'user_id', 'item_id', 'score'
# user_item_scores2 = pd.read_csv('sample.csv', index_col = 0)

# Example of merging on multiple columns
data = pd.merge(events, user_item_scores, on=['user_id', 'item_id'], how='left')

# Merge user features with interaction data
data = pd.merge(data, user_features, on='user_id', how='left')

# Merge item features
data = pd.merge(data, item_features, on='item_id', how='left')

def accumulate(s):
    out = [[]]
    for x in s:
        out.append(out[-1]+[x])
    return out[:-1]


data = data.sort_values(by=['user_id', 'timestamp'])
data['cumulative_items'] = data.groupby('user_id')['item_id'].transform(accumulate)
data["len_history"] = data["cumulative_items"].apply(lambda x: len(x))
data = data[data.len_history >= 4]
data = data.drop(columns=['cumulative_items', 'len_history'])

# display(data)
# Prepare training data
X = data.drop(columns=['user_id', 'item_id', 'rating', 'timestamp'])
y = data['rating']

# Identify categorical features
categorical_features_indices = np.where(X.dtypes == 'object')[0]

# Split data into training and validation sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create Pool objects for CatBoost
train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)

# Define the parameter grid for grid search
grid = {
    'iterations': [500, 1000],
    'learning_rate': [0.01, 0.05],
    'depth': [6, 8],
    'l2_leaf_reg': [1, 3, 5]
}

# Initialize CatBoostRegressor
model = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=42,
    early_stopping_rounds=50
)

# Perform grid search
grid_search_result = model.grid_search(
    grid,
    X=train_pool,
    cv=3,
    partition_random_seed=42,
    refit=True,
    verbose=2
)

print(grid_search_result['cv_results'])


0:	learn: 3.7182932	test: 3.7139673	best: 3.7139673 (0)	total: 55ms	remaining: 27.4s
1:	learn: 3.6838592	test: 3.6795640	best: 3.6795640 (1)	total: 114ms	remaining: 28.3s
2:	learn: 3.6497863	test: 3.6455068	best: 3.6455068 (2)	total: 173ms	remaining: 28.7s
3:	learn: 3.6161161	test: 3.6118453	best: 3.6118453 (3)	total: 225ms	remaining: 27.8s
4:	learn: 3.5828192	test: 3.5785539	best: 3.5785539 (4)	total: 279ms	remaining: 27.6s
5:	learn: 3.5498372	test: 3.5455969	best: 3.5455969 (5)	total: 337ms	remaining: 27.8s
6:	learn: 3.5175010	test: 3.5132737	best: 3.5132737 (6)	total: 393ms	remaining: 27.7s
7:	learn: 3.4852510	test: 3.4810371	best: 3.4810371 (7)	total: 458ms	remaining: 28.2s
8:	learn: 3.4533055	test: 3.4491019	best: 3.4491019 (8)	total: 525ms	remaining: 28.6s
9:	learn: 3.4217277	test: 3.4175436	best: 3.4175436 (9)	total: 589ms	remaining: 28.9s
10:	learn: 3.3905075	test: 3.3863436	best: 3.3863436 (10)	total: 649ms	remaining: 28.9s
11:	learn: 3.3595814	test: 3.3554401	best: 3.3554401 

In [52]:
for column in events.columns:
    print(column)

item_id
rating
timestamp


In [53]:
# Prepare data for prediction (as before)
# Get unique users and items
users = user_features['user_id'].unique()
items = item_features['item_id'].unique()


# Create a DataFrame of all possible user-item pairs
user_item_pairs = pd.MultiIndex.from_product(
    [users, items], names=['user_id', 'item_id']
).to_frame(index=False)

# Remove user-item pairs that already exist in interaction data
existing_pairs = user_item_scores[['user_id', 'item_id']]
new_user_item_pairs = pd.merge(
    user_item_pairs,
    existing_pairs,
    on=['user_id', 'item_id'],
    how='left',
    indicator=True
)
new_user_item_pairs = new_user_item_pairs[new_user_item_pairs['_merge'] == 'left_only']
new_user_item_pairs = new_user_item_pairs.drop(columns=['_merge'])

# Merge with user and item features
new_data = pd.merge(new_user_item_pairs, user_features, on='user_id', how='left')
new_data = pd.merge(new_data, item_features, on='item_id', how='left')
new_data = pd.merge(new_data, user_item_scores, on=['item_id', 'user_id'], how='left')

# Prepare prediction data
X_new = new_data.drop(columns=['user_id', 'item_id'])

# Ensure the same order of features and categorical features indices
X_new = X_new[X_train.columns]

# Predict scores using the tuned model
new_data['predicted_score'] = model.predict(X_new)

# For each user, recommend top 10 items based on predicted scores
recommendations = new_data.groupby('user_id').apply(
    lambda x: x.nlargest(10, 'predicted_score')
).reset_index(drop=True)

# Display the recommendations
print(recommendations[['user_id', 'item_id', 'predicted_score']])

# Optionally, save recommendations to a CSV file
recommendations[['user_id', 'item_id', 'predicted_score']].to_csv('recommendations.csv', index=False)

       user_id  item_id  predicted_score
0            0      586         1.826953
1            0     3491         1.826953
2            0      192         1.812326
3            0      327         1.812326
4            0      404         1.812326
...        ...      ...              ...
60395     6039     1244         1.462865
60396     6039     1522         1.462865
60397     6039     1566         1.462865
60398     6039     1620         1.462865
60399     6039     1874         1.462865

[60400 rows x 3 columns]


  recommendations = new_data.groupby('user_id').apply(


In [54]:
result = recommendations.groupby('user_id').aggregate({
    'item_id': lambda x: " ".join(x.astype(str))
})
display(result)
result.to_csv('recommendations.csv', index=True)

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
0,586 3491 192 327 404 608 775 1244 1522 1566
1,192 327 404 608 775 1244 1522 1566 1620 1874
2,2736 586 3491 192 327 404 608 775 1244 1522
3,192 327 404 608 775 1244 1522 1566 1620 1874
4,192 327 404 608 775 1244 1522 1566 1620 1874
...,...
6035,192 327 404 608 775 1244 1522 1566 1620 1874
6036,586 3491 192 327 404 608 775 1244 1522 1566
6037,586 3491 2736 192 327 404 608 775 1244 1522
6038,192 327 404 608 775 1244 1522 1566 1620 1874
