<a href="https://colab.research.google.com/github/yeyevtushenko/AI/blob/Lesson13.05.2024/Lesson13_05_2024_AI_H_22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import os
import contextlib

In [2]:
try:
    from surprise import Dataset, SVD, Reader, accuracy
except:
    !pip install -q surprise
    from surprise import Dataset, SVD, Reader, accuracy

from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise import BaselineOnly, SVD, KNNBasic, CoClustering

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/154.4 kB[0m [31m629.0 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/154.4 kB[0m [31m784.0 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m122.9/154.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone


In [3]:
try:
    import optuna
except:
    !pip install -q optuna
    import optuna

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
df = pd.read_csv("https://raw.githubusercontent.com/HalyshAnton/IT-Step-Pyton-AI/main/module7/data/ratings.csv")

In [5]:
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,172,94969,5.0,1396067836
1,172,98956,4.0,1396067879
2,176,73881,4.0,1499807147
3,221,1900,4.5,1288550866
4,333,33688,4.0,1412015122


In [6]:
df.describe()

Unnamed: 0,user_id,movie_id,rating,timestamp
count,17604.0,17604.0,17604.0,17604.0
mean,141251.609009,88913.605942,3.465945,1403256000.0
std,81731.882435,51285.238375,1.203144,151508300.0
min,172.0,139.0,0.5,828222400.0
25%,71256.0,55098.0,3.0,1395695000.0
50%,138487.0,89554.0,3.5,1456552000.0
75%,210264.5,134779.5,4.5,1494418000.0
max,283195.0,193843.0,5.0,1537934000.0


In [7]:
df['rating'].describe()

count    17604.000000
mean         3.465945
std          1.203144
min          0.500000
25%          3.000000
50%          3.500000
75%          4.500000
max          5.000000
Name: rating, dtype: float64

In [8]:
reader = Reader(rating_scale=(0, 5))

data = Dataset.load_from_df(df[["user_id", "movie_id", "rating"]], reader)

In [9]:
trainset, testset = train_test_split(data, train_size=0.8)

In [10]:
def objective(trial):
    algorithm = trial.suggest_categorical('algorithm', ['KNNBasic', 'SVD', 'BaselineOnly', 'CoClustering'])

    if algorithm == 'KNNBasic':
        k = trial.suggest_int('k', 10, 40)
        min_k = trial.suggest_int('min_k', 1, 10)
        sim_options = {'name': trial.suggest_categorical('similarity', ['cosine', 'msd', 'pearson']),
                       'user_based': trial.suggest_categorical('user_based', [True, False])}
        algo = KNNBasic(k=k, min_k=min_k, sim_options=sim_options)
    elif algorithm == 'SVD':
        n_factors = trial.suggest_int('n_factors', 50, 200)
        n_epochs = trial.suggest_int('n_epochs', 20, 100)
        algo = SVD(n_factors=n_factors, n_epochs=n_epochs)
    elif algorithm == 'BaselineOnly':
        bsl_options = {'method': trial.suggest_categorical('method', ['als', 'sgd']),
                       'reg_u': trial.suggest_loguniform('reg_u', 1e-6, 1e-2),
                       'reg_i': trial.suggest_loguniform('reg_i', 1e-6, 1e-2)}
        algo = BaselineOnly(bsl_options=bsl_options)
    else:
        algo = CoClustering(n_cltr_u=trial.suggest_int('n_cltr_u', 3, 10),
                            n_cltr_i=trial.suggest_int('n_cltr_i', 3, 10))

    results = cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)
    rmse_mean = results['test_rmse'].mean()

    return rmse_mean

In [11]:
study = optuna.create_study(direction='minimize', storage=None, load_if_exists=True)


with open(os.devnull, 'w') as devnull:
    with contextlib.redirect_stdout(devnull):
        study.optimize(objective, n_trials=100)

[I 2024-05-22 06:15:44,013] A new study created in memory with name: no-name-10aacda3-6183-4331-9ad3-0e8b6963677a
[I 2024-05-22 06:15:50,196] Trial 0 finished with value: 1.178445317142472 and parameters: {'algorithm': 'KNNBasic', 'k': 30, 'min_k': 8, 'similarity': 'cosine', 'user_based': True}. Best is trial 0 with value: 1.178445317142472.
[I 2024-05-22 06:15:57,454] Trial 1 finished with value: 0.9706626394043433 and parameters: {'algorithm': 'SVD', 'n_factors': 146, 'n_epochs': 81}. Best is trial 1 with value: 0.9706626394043433.
[I 2024-05-22 06:16:00,947] Trial 2 finished with value: 0.9815211867430597 and parameters: {'algorithm': 'SVD', 'n_factors': 184, 'n_epochs': 52}. Best is trial 1 with value: 0.9706626394043433.
[I 2024-05-22 06:16:03,776] Trial 3 finished with value: 0.9681354247939034 and parameters: {'algorithm': 'SVD', 'n_factors': 72, 'n_epochs': 98}. Best is trial 3 with value: 0.9681354247939034.
[I 2024-05-22 06:16:09,039] Trial 4 finished with value: 1.1850726094

In [12]:
best_params = study.best_params
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'algorithm': 'BaselineOnly', 'method': 'als', 'reg_u': 0.0003496807372657765, 'reg_i': 7.80320983995176e-05}
