# Connecting to MLflow

In [17]:
import mlflow

In [18]:
# Point the MLflow client at the remote tracking server and select the experiment
# under which every run in this notebook is recorded.
# NOTE(review): the server address is hard-coded — confirm it is still valid before re-running.
mlflow.set_tracking_uri("http://51.250.35.156:5000/")
mlflow.set_experiment("homework-saumnov.ext")

<Experiment: artifact_location='s3://mlflow/60', creation_time=1747480729078, experiment_id='60', last_update_time=1747480729078, lifecycle_stage='active', name='homework-saumnov.ext', tags={}>

# Load data

In [19]:
from datetime import timedelta
import polars as pl
import implicit

In [20]:
DATA_DIR = 'data/'

# Interaction tables: the users to predict for and the raw clickstream.
df_test_users, df_clickstream = (
    pl.read_parquet(f'{DATA_DIR}/{table}.pq')
    for table in ('test_users', 'clickstream')
)

# Auxiliary tables: item features and the event dictionary.
df_cat_features, df_text_features, df_event = (
    pl.read_parquet(f'{DATA_DIR}/{table}.pq')
    for table in ('cat_features', 'text_features', 'events')
)

# Prepare train/eval split

In [21]:
# Length, in days, of the trailing clickstream window that is held out for evaluation.
EVAL_DAYS_TRESHOLD = 14

In [22]:
# Cut-off date: the latest event date in the clickstream minus the hold-out window.
treshhold = (
    df_clickstream.get_column('event_date').max()
    - timedelta(days=EVAL_DAYS_TRESHOLD)
)

In [23]:
# Chronological split: events up to and including the cut-off form the training set,
# everything after it becomes the evaluation candidates (only the columns needed later).
df_train = df_clickstream.filter(pl.col('event_date') <= treshhold)
df_eval = (
    df_clickstream
    .filter(pl.col('event_date') > treshhold)
    .select(['cookie', 'node', 'event'])
)

In [24]:
# Anti-join: drop evaluation rows whose (cookie, node) pair already occurs in the
# training data, so only pairs that are new relative to train remain.
df_eval = df_eval.join(df_train, on=['cookie', 'node'], how='anti')

In [25]:
# Keep only evaluation rows whose event is listed in df_event with is_contact == 1.
df_eval = df_eval.filter(
    pl.col('event').is_in(
        df_event.filter(pl.col('is_contact')==1)['event'].unique()
    )
)

In [26]:
# Restrict evaluation to cookies and nodes that both occur in the training data.
df_eval = df_eval.filter(
    pl.col('cookie').is_in(df_train.get_column('cookie').unique())
    & pl.col('node').is_in(df_train.get_column('node').unique())
)

In [27]:
# Collapse repeated interactions: keep a single row per (cookie, node) pair.
df_eval = df_eval.unique(['cookie', 'node'])

# Metric computation

In [28]:
def recall_at(df_true, df_pred, k=40):
    """Compute recall@k averaged over cookies.

    Args:
        df_true: frame with 'node' and 'cookie' columns holding the ground-truth pairs.
        df_pred: frame with 'node' and 'cookie' columns holding the predictions.
            Only the first ``k`` rows of each cookie (in frame order) are used.
        k: maximum number of predictions considered per cookie.

    Returns:
        The mean, over cookies present in ``df_true``, of the share of that
        cookie's ground-truth pairs found among its first ``k`` predictions.
    """
    # First k predictions per cookie, each flagged with value=1.
    top_predictions = (
        df_pred
        .group_by('cookie')
        .head(k)
        .with_columns(value=1)
        .select(['node', 'cookie', 'value'])
    )

    # Left join keeps every ground-truth pair; pairs that were not predicted get a null flag.
    matched = df_true.select(['node', 'cookie']).join(
        top_predictions,
        on=['cookie', 'node'],
        how='left',
    )
    hit_flags = matched.select([pl.col('value').fill_null(0), 'cookie'])

    # Per-cookie recall = hits / number of ground-truth pairs for that cookie.
    hit_share = pl.col('value').sum() / pl.col('value').count()
    per_cookie = hit_flags.group_by('cookie').agg([hit_share])

    return per_cookie['value'].mean()


# Experiments with different models

## ALS

In [45]:
from scipy.sparse import csr_matrix
import numpy as np
import implicit
import optuna

In [34]:
def get_als_pred(users,
                 nodes,
                 user_to_pred,
                 factors=60,
                 iterations=10,
                 regularization=0.01,
                 alpha=15,
                 random_state=42,
                 top_k=40):
    """Fit an implicit ALS model and return top-k recommendations for the given users.

    Args:
        users: polars Series of user ids, one entry per interaction.
        nodes: polars Series of item ids, aligned element-wise with ``users``.
        user_to_pred: iterable of user ids to recommend for; each must occur in ``users``.
        factors: passed to ``AlternatingLeastSquares(factors=...)``.
        iterations: passed to ``AlternatingLeastSquares(iterations=...)``.
        regularization: passed to ``AlternatingLeastSquares(regularization=...)``.
        alpha: passed to ``AlternatingLeastSquares(alpha=...)``.
        random_state: passed to ``AlternatingLeastSquares(random_state=...)``.
        top_k: number of items requested per user (previously hard-coded to 40).

    Returns:
        polars DataFrame with columns 'node', 'cookie' and 'scores', one row per
        (user, recommended item), in the order returned by ``model.recommend``.

    Raises:
        KeyError: if ``user_to_pred`` contains a user id that is absent from ``users``.
    """
    user_ids = users.unique().to_list()
    item_ids = nodes.unique().to_list()

    # Dense 0..n-1 indices for the sparse matrix, plus the reverse item mapping
    # needed to translate recommendations back to the original ids.
    user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
    item_id_to_index = {item_id: idx for idx, item_id in enumerate(item_ids)}
    index_to_item_id = {idx: item_id for item_id, idx in item_id_to_index.items()}

    rows = users.replace_strict(user_id_to_index).to_list()
    cols = nodes.replace_strict(item_id_to_index).to_list()

    # One unit of weight per interaction row; csr_matrix sums entries that share
    # the same (row, col), so repeated interactions accumulate.
    values = [1] * len(rows)

    sparse_matrix = csr_matrix((values, (rows, cols)), shape=(len(user_ids), len(item_ids)))

    model = implicit.als.AlternatingLeastSquares(factors=factors,
                                                 iterations=iterations,
                                                 regularization=regularization,
                                                 alpha=alpha,
                                                 random_state=random_state)
    model.fit(sparse_matrix)

    # Materialise once: the ids are needed both for the index lookup and for the
    # output column, so a one-shot iterable must not be consumed twice.
    pred_users = list(user_to_pred)
    user4pred = np.array([user_id_to_index[user_id] for user_id in pred_users])

    recommendations, scores = model.recommend(user4pred,
                                              sparse_matrix[user4pred],
                                              N=top_k,
                                              filter_already_liked_items=True)

    df_pred = pl.DataFrame(
        {
            'node': [
                [index_to_item_id[item_idx] for item_idx in user_recs]
                for user_recs in recommendations.tolist()
            ],
            'cookie': pred_users,
            'scores': scores.tolist(),
        }
    )
    # One row per (cookie, node, score) instead of one list-valued row per cookie.
    return df_pred.explode(['node', 'scores'])

In [42]:
def als_optuna_mlflow_log(trial):
    """Optuna objective: train ALS with sampled hyperparameters and log the run to MLflow.

    Reads the notebook globals ``df_train``, ``df_eval`` and ``EVAL_DAYS_TRESHOLD``.

    Args:
        trial: the Optuna trial used to sample 'factors', 'regularization',
            'iterations' and 'alpha'.

    Returns:
        Recall@40 of the fitted model on ``df_eval``.
    """
    with mlflow.start_run(run_name=f'als_optuna_trial_{trial.number}'):
        factors = trial.suggest_int('factors', 20, 200)
        regularization = trial.suggest_float('regularization', 1e-4, 1)
        iterations = trial.suggest_int('iterations', 5, 50)
        alpha = trial.suggest_float('alpha', 1, 40)
        random_state = 42

        # Log the configuration before the fit: training takes minutes, and a run
        # that crashes or is interrupted should still record what was tried.
        mlflow.log_params({
            'als_factors': factors,
            'als_regularization': regularization,
            'als_iterations': iterations,
            'als_alpha': alpha,
            'als_random_state': random_state,
            'eval_days_treshold': EVAL_DAYS_TRESHOLD,
            'model_type': 'als',
            'train_size': len(df_train),
            'eval_size': len(df_eval),
        })

        eval_users = df_eval['cookie'].unique().to_list()
        df_pred = get_als_pred(df_train['cookie'],
                               df_train['node'],
                               eval_users,
                               factors=factors,
                               iterations=iterations,
                               regularization=regularization,
                               alpha=alpha,
                               random_state=random_state)

        recall_40 = recall_at(df_eval, df_pred, k=40)
        mlflow.log_metric('Recall_40', recall_40)

        return recall_40

In [43]:
# Seed the sampler so the hyperparameter search is reproducible across kernel restarts
# (the ALS model itself is already trained with a fixed random_state).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(als_optuna_mlflow_log, n_trials=5)

[I 2025-05-17 15:10:53,859] A new study created in memory with name: no-name-3253031e-9658-4eef-b3e4-5a4d3f078296
100%|██████████| 28/28 [07:28<00:00, 16.01s/it]


🏃 View run als_optuna_trial_0 at: http://51.250.35.156:5000/#/experiments/60/runs/d3d6fc91ca79480ea90e6b1a519efa7f
🧪 View experiment at: http://51.250.35.156:5000/#/experiments/60


[I 2025-05-17 15:19:11,120] Trial 0 finished with value: 0.15075733334123784 and parameters: {'factors': 187, 'regularization': 0.18612568999929963, 'iterations': 28, 'alpha': 21.809302283908075}. Best is trial 0 with value: 0.15075733334123784.
100%|██████████| 42/42 [05:29<00:00,  7.85s/it]


🏃 View run als_optuna_trial_1 at: http://51.250.35.156:5000/#/experiments/60/runs/63f1ac9de7fa4cad8549dc77c3830bd6
🧪 View experiment at: http://51.250.35.156:5000/#/experiments/60


[I 2025-05-17 15:25:56,597] Trial 1 finished with value: 0.1548442179744957 and parameters: {'factors': 133, 'regularization': 0.8229253039930486, 'iterations': 42, 'alpha': 4.798793036980507}. Best is trial 1 with value: 0.1548442179744957.
100%|██████████| 44/44 [05:37<00:00,  7.66s/it]


🏃 View run als_optuna_trial_2 at: http://51.250.35.156:5000/#/experiments/60/runs/9cb85ec94b5748f7b922d3d4794f7697
🧪 View experiment at: http://51.250.35.156:5000/#/experiments/60


[I 2025-05-17 15:32:18,674] Trial 2 finished with value: 0.14520289088407642 and parameters: {'factors': 130, 'regularization': 0.5318149698356839, 'iterations': 44, 'alpha': 34.856096302533395}. Best is trial 1 with value: 0.1548442179744957.
100%|██████████| 37/37 [09:17<00:00, 15.06s/it]


🏃 View run als_optuna_trial_3 at: http://51.250.35.156:5000/#/experiments/60/runs/83d173bad8864831b519d6f609d2e4e0
🧪 View experiment at: http://51.250.35.156:5000/#/experiments/60


[I 2025-05-17 15:42:24,244] Trial 3 finished with value: 0.14629017796373628 and parameters: {'factors': 181, 'regularization': 0.5180131772067913, 'iterations': 37, 'alpha': 33.16937899619416}. Best is trial 1 with value: 0.1548442179744957.
100%|██████████| 41/41 [09:21<00:00, 13.70s/it]


🏃 View run als_optuna_trial_4 at: http://51.250.35.156:5000/#/experiments/60/runs/c354503a18464e41bba91d083fe1b234
🧪 View experiment at: http://51.250.35.156:5000/#/experiments/60


[I 2025-05-17 15:52:31,512] Trial 4 finished with value: 0.14369405675050986 and parameters: {'factors': 171, 'regularization': 0.24253951590180067, 'iterations': 41, 'alpha': 37.37736328147678}. Best is trial 1 with value: 0.1548442179744957.


## Popular

In [46]:
def get_popular(df, users=None, k=40):
    """Recommend the same k most frequent nodes to every user.

    Args:
        df: interactions frame with 'node' and 'cookie' columns.
        users: iterable of cookies to produce recommendations for. When omitted,
            the notebook-level ``eval_users`` variable is used, which keeps the
            original one-argument call working.
        k: number of nodes recommended to each user.

    Returns:
        polars DataFrame with columns 'node' and 'cookie', one row per
        (cookie, recommended node); within each cookie the most popular node comes first.
    """
    if users is None:
        users = eval_users
    users = list(users)

    # Rank nodes by interaction count, most popular first, so that a consumer that
    # keeps only the first rows per cookie gets the strongest candidates.
    popular_nodes = (
        df.group_by('node')
        .agg(pl.col('cookie').count())
        .sort('cookie', descending=True)
        .head(k)['node']
        .to_list()
    )

    df_pred_pop = pl.DataFrame({'node': [popular_nodes] * len(users), 'cookie': users})
    return df_pred_pop.explode('node')


In [48]:
# Baseline run: recommend the most popular training nodes to every evaluation user.
with mlflow.start_run(run_name='popular'):

    users = df_train.get_column("cookie")
    nodes = df_train.get_column("node")
    # get_popular reads this notebook-level variable, so it must be set before the call.
    eval_users = df_eval.get_column('cookie').unique().to_list()

    train_pop = get_popular(df_train)
    recall_40 = recall_at(df_eval, train_pop, k=40)

    mlflow.log_metric('Recall_40', recall_40)
    mlflow.log_params({
        'model_type': 'popular',
        'eval_days_treshold': EVAL_DAYS_TRESHOLD,
        "train_size": len(df_train),
        "eval_size": len(df_eval),
    })

🏃 View run popular at: http://51.250.35.156:5000/#/experiments/60/runs/6fc1a333eb0c4213bd213878437e9ad4
🧪 View experiment at: http://51.250.35.156:5000/#/experiments/60
