# Ноутбук с тестированиями моделей, подбором гиперпараметров и версионированием в ML Flow

In [187]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, recall_score, precision_score, precision_recall_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import xgboost as xgb
from catboost import CatBoostClassifier

In [188]:
import mlflow

In [189]:
min_max_scaler = MinMaxScaler()
standart_scaler = StandardScaler()

## Импорт датасета и его преобразование

In [190]:
df = pd.read_csv("data_win_prediction.csv",sep=";")

In [191]:
df.Team_A_avg_win_percentage = df.Team_A_avg_win_percentage.str.replace(",",".").astype(float)
df.Team_B_avg_win_percentage = df.Team_B_avg_win_percentage.str.replace(",",".").astype(float)
df.Team_A_avg_KR = df.Team_A_avg_KR.str.replace(",",".").astype(float)
df.Team_A_avg_elo = df.Team_A_avg_elo.str.replace(",",".").astype(float)
df.Team_B_avg_KR = df.Team_B_avg_KR.str.replace(",",".").astype(float)

In [192]:
df.shape

(1465, 9)

In [193]:
values_to_remove = ['aim_map', 'awp_india', 'aim_crashz_dust_1on1', 'de_ravine', 'de_foroglio', 'awp_orange', 'awp_lego_2', 'aim_redline','dorf']
df = df[~df['map'].isin(values_to_remove)]

In [194]:
df = df.drop(columns=["map","\tMatch ID"])

In [195]:
df.head()

Unnamed: 0,win,Team_A_avg_win_percentage,Team_A_avg_KR,Team_A_avg_elo,Team_B_avg_win_percentage,Team_B_avg_KR,Team_B_avg_elo
0,team a,58.864865,0.726308,1720.4,47.289216,0.792615,1536
1,team a,52.341629,0.764807,1989.6,54.823232,0.762641,2073
2,team b,20.20202,0.77303,753.8,66.095238,0.722705,158
3,team b,48.414652,0.733409,2565.0,62.631108,0.838246,2219
4,team b,43.571196,0.709982,1484.4,55.513072,0.768946,1352


In [196]:
df.shape

(1452, 7)

In [197]:
df.win = df.win.map({"team a":0, "team b":1})

## Разбиваем датасет на признаки и метки классов

In [198]:
X, y = df.drop(columns=["win"]), df.win

In [199]:
columns = X.columns.to_list()

## Разделение данных обучающую и тестовую выборку

### Без MinMaxScaler и без StandartScaler

In [200]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### С MinMaxScaler-ом

In [201]:
X_mms_scaled = min_max_scaler.fit_transform(X)

In [202]:
X_mms_train, X_mms_test, y_mms_train, y_mms_test = train_test_split(
    pd.DataFrame(X_mms_scaled, columns=columns), y, test_size=0.33, random_state=42)

### С StandartScaler-ом

In [203]:
X_standart_scaled = standart_scaler.fit_transform(X)

In [204]:
X_ss_train, X_ss_test, y_ss_train, y_ss_test = train_test_split(
    pd.DataFrame(X_standart_scaled, columns=columns), y, test_size=0.33, random_state=42)

### Что лучше?

In [205]:
for method in ["None", "Standart", "MinMax"]:
    model = LogisticRegression(max_iter=1000, random_state=42)
    f1 = 0.0
    if method == "None":
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
    elif method == "Standart":
        model.fit(X_ss_train, y_ss_train)
        y_pred = model.predict(X_ss_test)
        f1 = f1_score(y_ss_test, y_pred)
    elif method == "MinMax":
        model.fit(X_mms_train, y_mms_train)
        y_pred = model.predict(X_mms_test)
        f1 = f1_score(y_mms_test, y_pred)
    print(f1)

0.8052930056710775
0.8052930056710775
0.799249530956848


Видим, что лучше использовать StandartScaler, либо вообще не преобразовывать данные. Но лучше все же использовать StandartScaler, что я в дальнейшем и буду делать.

## Обучение моделей и выбор лучших

Настало время обучения. Проверять буду следующие модели: "мою эврестическую модель", логистическую регрессию, XGBoost и простую нейросеть.

Для всех моделей я буду искать гипер-параметры. Их можно искать различными способами: GridSearch, RandomSearch, Optuna, Hyperopt (Я простирую их все).

### Эврестическая модель

Данные имеют следующий вид:

In [19]:
df.head(3)

Unnamed: 0,win,Team_A_avg_win_percentage,Team_A_avg_KR,Team_A_avg_elo,Team_B_avg_win_percentage,Team_B_avg_KR,Team_B_avg_elo
0,0,58.864865,0.726308,1720.4,47.289216,0.792615,1536
1,0,52.341629,0.764807,1989.6,54.823232,0.762641,2073
2,1,20.20202,0.77303,753.8,66.095238,0.722705,158


Зачем обучать модель, если можно взглянуть на данные и выявить победителя из тех, у кого больше среднее эло и больше кд? Зачем здесь ML?

In [166]:
class HeuristicsModel:
    def __init__(self, coincidence: int = 2) -> None:
        """
        Эвристическая модель
        coincidence - это количество совпадений из следующих условий:
            1. Team_A_avg_win_percentage > Team_B_avg_win_percentage
            2. Team_A_avg_KR > Team_B_avg_KR
            3. Team_A_avg_elo > Team_B_avg_elo
        """
        if coincidence in [0, 1, 2, 3]:
            self.coincidence = coincidence
        else:
            raise InvalidParameterError("coincidence должен быть в диапазоне [0, 3] - от нуля до трех включительно!")

    def predict(self, data: np.ndarray | pd.DataFrame) -> np.ndarray:
        pred = np.empty(data.shape[0])
        index = 0
        df = data.copy()
        if isinstance(data, pd.DataFrame):
            df = data.to_numpy()
        for object_ in df:
            data_coincidence = 0
            if object_[0] > object_[3]:
                data_coincidence += 1
            if object_[1] > object_[4]:
                data_coincidence += 1
            if object_[2] > object_[5]:
                data_coincidence += 1
            if data_coincidence >= self.coincidence:
                pred[index] = 1
            else:
                pred[index] = 0
            index += 1
        return pred

In [167]:
for c in [0, 1, 2, 3]:
    hm = HeuristicsModel(coincidence=c)
    hm_pred = hm.predict(X_ss_test)
    f1_hm_pred = f1_score(y_ss_test, hm_pred)
    print(f"f1 для coincidence равной {c}: ",f1_hm_pred)

f1 для coincidence равной 0:  0.6938775510204082
f1 для coincidence равной 1:  0.5522620904836193
f1 для coincidence равной 2:  0.30353430353430355
f1 для coincidence равной 3:  0.0729483282674772


### Логистическая регрессия

LogisticRegression - одна из простых и хороших моделей.

In [20]:
logistic_regression = LogisticRegression(
    solver='lbfgs',
    tol=1e-6,
    max_iter=1000
)

In [21]:
log_regression_param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'class_weight': [None, 'balanced']
}

#### GridSearchCV

In [22]:
grid_search_log_regression = GridSearchCV(
    estimator=logistic_regression, 
    param_grid=log_regression_param_grid,
    scoring="f1",
    cv=3,
    n_jobs=-1,
    verbose=1
)

In [23]:
grid_search_log_regression.fit(X_ss_train, y_ss_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


In [24]:
grid_search_params = grid_search_log_regression.best_params_
print(grid_search_params)
print(grid_search_log_regression.best_score_)

{'C': 0.1, 'class_weight': None, 'penalty': 'l1', 'solver': 'saga'}
0.8106130502121037


#### RandomizedSearchCV

In [25]:
randomized_searcg_log_regression = RandomizedSearchCV(
    estimator=logistic_regression, 
    param_distributions=log_regression_param_grid,
    scoring="f1",
    cv=3,
    n_jobs=-1,
    verbose=1
)

In [26]:
randomized_searcg_log_regression.fit(X_ss_train, y_ss_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [27]:
randomized_search_params = randomized_searcg_log_regression.best_params_
print(randomized_search_params)
print(randomized_searcg_log_regression.best_score_)

{'solver': 'saga', 'penalty': 'l1', 'class_weight': None, 'C': 0.1}
0.8106130502121037


#### Hyperopt

In [28]:
space = {
    'C': hp.loguniform('C', np.log(0.001), np.log(100)),
    'penalty': hp.choice('penalty', ['l1', 'l2']),
    'solver': hp.choice('solver', ['liblinear', 'saga']),
    'class_weight': hp.choice('class_weight', [None, 'balanced']),
}

In [29]:
def objective(params):
    try:
        model = LogisticRegression(
            C=params['C'],
            penalty=params['penalty'],
            solver=params['solver'],
            class_weight=params['class_weight'],
            random_state=42,
            max_iter=3000
        )
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        
        return {'loss': -f1, 'status': STATUS_OK}
        
    except Exception as e:
        print(f"Error: {e}")
        return {'loss': 1.0, 'status': STATUS_OK}

In [30]:
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

100%|█████████████████████████████████████████████| 100/100 [00:10<00:00,  9.78trial/s, best loss: -0.8127340823970037]


In [31]:
best

{'C': 0.0027740713874680476, 'class_weight': 0, 'penalty': 0, 'solver': 0}

In [32]:
hyperopt_params = {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'saga'}

#### Optuna

In [35]:
import optuna

def objective_optuna(trial):
    params = {
        'C': trial.suggest_float('C', 0.001, 100, log=True),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
        'class_weight': trial.suggest_categorical("class_weight", [None, 'balanced']),
        'max_iter': 1000,
        'random_state': 42
    }
    
    model = LogisticRegression(**params)
    model.fit(X_ss_train, y_ss_train)
    
    y_pred = model.predict(X_ss_test)
    f1 = f1_score(y_ss_test, y_pred)
    
    return f1

study = optuna.create_study(direction='maximize')
study.optimize(objective_optuna, n_trials=50, show_progress_bar=True)

print("Best parameters:", study.best_params)
print("Best F1-score:", study.best_value)

[I 2025-11-03 19:19:59,595] A new study created in memory with name: no-name-3312fd21-7a75-42e6-9707-01dbe2614d9f


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-03 19:19:59,636] Trial 0 finished with value: 0.7928994082840237 and parameters: {'C': 0.21605123311770782, 'penalty': 'l1', 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.7928994082840237.
[I 2025-11-03 19:19:59,655] Trial 1 finished with value: 0.7913385826771654 and parameters: {'C': 11.76980824772972, 'penalty': 'l2', 'solver': 'saga', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.7928994082840237.
[I 2025-11-03 19:19:59,676] Trial 2 finished with value: 0.79296875 and parameters: {'C': 0.009374575013063567, 'penalty': 'l1', 'solver': 'saga', 'class_weight': 'balanced'}. Best is trial 2 with value: 0.79296875.
[I 2025-11-03 19:19:59,699] Trial 3 finished with value: 0.7913385826771654 and parameters: {'C': 48.70609946389046, 'penalty': 'l1', 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 2 with value: 0.79296875.
[I 2025-11-03 19:19:59,728] Trial 4 finished with value: 0.7913385826771654 and parameters: {

In [34]:
optuna_logregr = study.best_params
print(optuna_logregr)

{'C': 0.007294894989290226, 'penalty': 'l1', 'solver': 'saga', 'class_weight': None}


### Random Forest

In [38]:
def objective_optuna_rf(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 30),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 15),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 1000),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0.0, 0.1),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'random_state': 42
    }
    
    model = RandomForestClassifier(**params)
    model.fit(X_ss_train, y_ss_train)
    
    y_pred = model.predict(X_ss_test)
    f1 = f1_score(y_ss_test, y_pred)
    
    return f1

study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_optuna_rf, n_trials=50, show_progress_bar=True)

print("Best parameters:", study_rf.best_params)
print("Best F1-score:", study_rf.best_value)

[I 2025-11-03 19:24:33,542] A new study created in memory with name: no-name-b8d40074-5c27-47ad-88c1-94cd821b9e96


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-03 19:24:37,183] Trial 0 finished with value: 0.8014705882352942 and parameters: {'n_estimators': 437, 'max_depth': 20, 'min_samples_split': 21, 'min_samples_leaf': 1, 'max_features': None, 'max_leaf_nodes': 668, 'min_impurity_decrease': 0.012968750399185791, 'bootstrap': False, 'criterion': 'entropy'}. Best is trial 0 with value: 0.8014705882352942.
[I 2025-11-03 19:24:38,557] Trial 1 finished with value: 0.7986577181208053 and parameters: {'n_estimators': 555, 'max_depth': 12, 'min_samples_split': 19, 'min_samples_leaf': 3, 'max_features': 'log2', 'max_leaf_nodes': 626, 'min_impurity_decrease': 0.04628860109064634, 'bootstrap': False, 'criterion': 'entropy'}. Best is trial 0 with value: 0.8014705882352942.
[I 2025-11-03 19:24:40,374] Trial 2 finished with value: 0.7934426229508197 and parameters: {'n_estimators': 879, 'max_depth': 20, 'min_samples_split': 23, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'max_leaf_nodes': 34, 'min_impurity_decrease': 0.056291774338624714

In [54]:
optuna_rf = study_rf.best_params
optuna_rf_score = study_rf.best_value
print("Best random forest params: ",optuna_rf)
print("Best random forest f1-score",optuna_rf_score)

Best random forest params:  {'n_estimators': 769, 'max_depth': 5, 'min_samples_split': 13, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_leaf_nodes': 842, 'min_impurity_decrease': 0.011588676266373742, 'bootstrap': True, 'criterion': 'gini'}
Best random forest f1-score 0.8185185185185185


### Xgboost

In [48]:
def objective_optuna_xgboost(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        'random_state': 42
    }
    
    model = xgb.XGBClassifier(**params)
    model.fit(X_ss_train, y_ss_train)
    
    y_pred = model.predict(X_ss_test)
    f1 = f1_score(y_ss_test, y_pred)
    
    return f1

study_xgboost = optuna.create_study(direction='maximize')
study_xgboost.optimize(objective_optuna_xgboost, n_trials=50, show_progress_bar=True)

print("Best parameters:", study_xgboost.best_params)
print("Best F1-score:", study_xgboost.best_value)

[I 2025-11-03 19:47:10,972] A new study created in memory with name: no-name-fa3d0f8a-5e6d-4276-acb6-369540dcf844


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-03 19:47:11,752] Trial 0 finished with value: 0.7868217054263565 and parameters: {'n_estimators': 962, 'max_depth': 3, 'learning_rate': 0.20171161152284275, 'subsample': 0.9185157186154527, 'colsample_bytree': 0.910495941029367, 'reg_alpha': 0.9083873758700546, 'reg_lambda': 0.7789507213985059}. Best is trial 0 with value: 0.7868217054263565.
[I 2025-11-03 19:47:12,133] Trial 1 finished with value: 0.7749510763209393 and parameters: {'n_estimators': 197, 'max_depth': 9, 'learning_rate': 0.037741022582054445, 'subsample': 0.910138858488377, 'colsample_bytree': 0.8512535188117203, 'reg_alpha': 0.5044358214938897, 'reg_lambda': 0.06396572390801536}. Best is trial 0 with value: 0.7868217054263565.
[I 2025-11-03 19:47:12,571] Trial 2 finished with value: 0.7728155339805826 and parameters: {'n_estimators': 383, 'max_depth': 7, 'learning_rate': 0.1195683098559626, 'subsample': 0.9998546196894922, 'colsample_bytree': 0.6801390765022889, 'reg_alpha': 0.6331785700468525, 'reg_lambda':

In [53]:
print("Best xgboost parameters:", study_xgboost.best_params)
print("Best xgboost F1-score:", study_xgboost.best_value)

Best xgboost parameters: {'n_estimators': 101, 'max_depth': 7, 'learning_rate': 0.010466429197724884, 'subsample': 0.7517994792724202, 'colsample_bytree': 0.6403432379467009, 'reg_alpha': 0.654880575682208, 'reg_lambda': 0.40048317634836383}
Best xgboost F1-score: 0.7962264150943397


### CatBoost

In [50]:
def objective_optuna_catboost(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'random_strength': trial.suggest_float('random_strength', 0.1, 2),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_state': 42,
        'verbose': False
    }
    
    model = CatBoostClassifier(**params)
    model.fit(X_ss_train, y_ss_train)
    
    y_pred = model.predict(X_ss_test)
    f1 = f1_score(y_ss_test, y_pred)
    
    return f1

study_catboost = optuna.create_study(direction='maximize')
study_catboost.optimize(objective_optuna_catboost, n_trials=50, show_progress_bar=True)

print("Best parameters:", study_catboost.best_params)
print("Best F1-score:", study_catboost.best_value)

[I 2025-11-03 19:53:51,528] A new study created in memory with name: no-name-57f622c8-d2f8-43ec-a813-f95a706d93f1


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-03 19:53:52,943] Trial 0 finished with value: 0.8 and parameters: {'iterations': 458, 'depth': 8, 'learning_rate': 0.06403048056219847, 'l2_leaf_reg': 8.95818667067491, 'random_strength': 1.3606232889452015, 'bagging_temperature': 0.26629269029161984, 'border_count': 84}. Best is trial 0 with value: 0.8.
[I 2025-11-03 19:53:54,144] Trial 1 finished with value: 0.7689320388349514 and parameters: {'iterations': 791, 'depth': 4, 'learning_rate': 0.09249041166553533, 'l2_leaf_reg': 1.640995674491994, 'random_strength': 0.17766086786352106, 'bagging_temperature': 0.959228774990283, 'border_count': 157}. Best is trial 0 with value: 0.8.
[I 2025-11-03 19:54:01,134] Trial 2 finished with value: 0.8062015503875969 and parameters: {'iterations': 975, 'depth': 10, 'learning_rate': 0.22156435955115417, 'l2_leaf_reg': 7.799036224811638, 'random_strength': 1.5602292862128597, 'bagging_temperature': 0.8203929005911199, 'border_count': 94}. Best is trial 2 with value: 0.8062015503875969.
[I

In [52]:
print("Best catboost parameters:", study_catboost.best_params)
print("Best catboost F1-score:", study_catboost.best_value)

Best catboost parameters: {'iterations': 108, 'depth': 10, 'learning_rate': 0.019573748732898265, 'l2_leaf_reg': 5.473367185401033, 'random_strength': 1.1506042155847689, 'bagging_temperature': 0.780195513041266, 'border_count': 181}
Best catboost F1-score: 0.8134328358208955


## Обучение моделей, выбор одной лучшей и версионирование результатов (ML Flow)

In [177]:
import os
os.environ['USER'] = 'Dima Golik'

In [180]:
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("CS2_Prediction_test")

2025/11/03 23:31:56 INFO mlflow.tracking.fluent: Experiment with name 'CS2_Prediction_test' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Desktop/Desktop/ML_Flow_Server/artefacts/207445062717580949', creation_time=1762201916097, experiment_id='207445062717580949', last_update_time=1762201916097, lifecycle_stage='active', name='CS2_Prediction_test', tags={}>

In [220]:
def experiment(run_name, model_class, params, train_features, train_target, test_features, test_target):
    model = None
    with mlflow.start_run(run_name=run_name):
        # Логируем параметры модели
        for name_param, param in params.items():
            mlflow.log_param(name_param, param)
        # Обучение модели
        model = model_class(**params)
        model.fit(train_features, train_target)
        # Предсказание 
        pred = model.predict(test_features)
        # Получаем метрики
        accuracy = accuracy_score(test_target, pred)
        conf_matrix = confusion_matrix(test_target, pred)
        recall = recall_score(test_target, pred)
        precision = precision_score(test_target, pred)
        # Логируем метрики
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("tn", conf_matrix[0][0])
        mlflow.log_metric("fp", conf_matrix[0][1])
        mlflow.log_metric("fn", conf_matrix[1][0])
        mlflow.log_metric("tp", conf_matrix[1][1])
        # Сохранение модели в MLflow
        mlflow.sklearn.log_model(model, "model")
    mlflow.end_run()
    return model

In [237]:
def check_thresholds(run_name, model, test_features, test_target, necessary_precision, description):
    with mlflow.start_run(run_name=run_name):
        y_scores = model.predict_proba(test_features)[:, 1]  # Берем вероятности для класса 1
        #  Считаем precision и recall для всех возможных порогов
        precisions, recalls, thresholds = precision_recall_curve(test_target, y_scores)
        # Находим ПЕРВЫЙ порог, где precision превышает нашу цель
        index = np.argmax(precisions > necessary_precision)
        target_threshold = thresholds[index]
        # Логируем все результаты в ML Flow
        for name_param, param in model.get_params().items():
            mlflow.log_param(name_param, param)
        mlflow.set_tag("model name", model.__class__.__name__)
        mlflow.set_tag("description", description)

        mlflow.log_metric("threshold", target_threshold)
        mlflow.log_metric("New precision", precisions[index])
        mlflow.log_metric("New recall", recalls[index])
        
        mlflow.sklearn.log_model(model, "model")
        
        print("=" * 50)
        print(f"Модель: {model.__class__.__name__}")
        print(f"ЦЕЛЕВОЙ Precision: > {target_precision}")
        print(f"НАЙДЕННЫЙ ПОРОГ: {target_threshold:.3f}")
        print(f"Precision при этом пороге: {precisions[index]:.3f}")
        print(f"Recall при этом пороге: {recalls[index]:.3f}")
        print("=" * 50)
    mlflow.end_run()
    return target_threshold

### Logistic regression

In [250]:
lr_params = {'C': 0.007294894989290226, 'penalty': 'l1', 'solver': 'saga', 'class_weight': None, 'random_state': 42}

In [251]:
run_lr = 'LogisticRegression_run'

In [252]:
description1 = "Находим порог при котором у нас precision будет > 0.9"

In [253]:
lr_model = experiment(
    run_name=run_lr, 
    model_class=LogisticRegression, 
    params=lr_params, 
    train_features=X_ss_train,
    train_target=y_ss_train,
    test_features=X_ss_test,
    test_target=y_ss_test
)



🏃 View run LogisticRegression_run at: http://localhost:5000/#/experiments/207445062717580949/runs/8ba9959e24bb4f7680a121ef121b49fc
🧪 View experiment at: http://localhost:5000/#/experiments/207445062717580949


In [254]:
lr_model.get_params()

{'C': 0.007294894989290226,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l1',
 'random_state': 42,
 'solver': 'saga',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [255]:
run_lr_thrsld = 'LogisticRegression_thresholds_run'

In [256]:
threshold = check_thresholds(
    run_name=run_lr_thrsld,
    model=lr_model,
    test_features=X_ss_test,
    test_target=y_ss_test,
    necessary_precision=0.9,
    description=description1
)



Модель: LogisticRegression
ЦЕЛЕВОЙ Precision: > 0.9
НАЙДЕННЫЙ ПОРОГ: 0.598
Precision при этом пороге: 0.901
Recall при этом пороге: 0.502
🏃 View run LogisticRegression_thresholds_run at: http://localhost:5000/#/experiments/207445062717580949/runs/619c1cfd962d42e095ae27d4deca6ef0
🧪 View experiment at: http://localhost:5000/#/experiments/207445062717580949


In [263]:
threshold = check_thresholds(
    run_name=run_lr_thrsld,
    model=lr_model,
    test_features=X_ss_test,
    test_target=y_ss_test,
    necessary_precision=0.87,
    description=description1
)



Модель: LogisticRegression
ЦЕЛЕВОЙ Precision: > 0.9
НАЙДЕННЫЙ ПОРОГ: 0.572
Precision при этом пороге: 0.871
Recall при этом пороге: 0.608
🏃 View run LogisticRegression_thresholds_run at: http://localhost:5000/#/experiments/207445062717580949/runs/ad96576879c047d986dbbda7c72cb69b
🧪 View experiment at: http://localhost:5000/#/experiments/207445062717580949


### RandomForest

In [257]:
rf_params = {'random_state': 42, 'n_estimators': 769, 'max_depth': 5, 'min_samples_split': 13, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_leaf_nodes': 842, 'min_impurity_decrease': 0.011588676266373742, 'bootstrap': True, 'criterion': 'gini'}

In [258]:
description_rf = "Ищем порог для RandomForest при котором precision > 0.9"

In [259]:
run_rf = 'RandomForest_run'

In [260]:
rf_model = experiment(
    run_name=run_rf, 
    model_class=RandomForestClassifier, 
    params=rf_params, 
    train_features=X_ss_train,
    train_target=y_ss_train,
    test_features=X_ss_test,
    test_target=y_ss_test
)



🏃 View run RandomForest_run at: http://localhost:5000/#/experiments/207445062717580949/runs/406bc15ea11741708f9cb82afb08557f
🧪 View experiment at: http://localhost:5000/#/experiments/207445062717580949


In [261]:
run_rf_thrsld = 'RandomForest_threshold_run'

In [262]:
rf_threshold = check_thresholds(
    run_name=run_rf_thrsld,
    model=rf_model,
    test_features=X_ss_test,
    test_target=y_ss_test,
    necessary_precision=0.9,
    description=description_rf
)



Модель: RandomForestClassifier
ЦЕЛЕВОЙ Precision: > 0.9
НАЙДЕННЫЙ ПОРОГ: 0.706
Precision при этом пороге: 0.904
Recall при этом пороге: 0.443
🏃 View run RandomForest_threshold_run at: http://localhost:5000/#/experiments/207445062717580949/runs/acd20ada989d4ea59fbe1bcb14a624ba
🧪 View experiment at: http://localhost:5000/#/experiments/207445062717580949


### Сравнение