In [124]:
import os
import sys
sys.path.append('..')

In [1]:
import pandas as pd
import numpy as np
import joblib
import json

In [2]:
# Load the preprocessed train/test splits.
X_train = pd.read_csv('../data/processed/X_train.csv')
X_test = pd.read_csv('../data/processed/X_test.csv')
# squeeze('columns') turns a single-column frame into a Series; unlike a bare
# squeeze() it never collapses a one-row file all the way down to a scalar.
# NOTE(review): assumes the y_*.csv files hold exactly one (target) column.
y_train = pd.read_csv('../data/processed/y_train.csv').squeeze('columns')
y_test = pd.read_csv('../data/processed/y_test.csv').squeeze('columns')

# Логистическая регрессия

In [111]:
def train_logreg(X_train, y_train, **params):
    """Fit a scikit-learn LogisticRegression and return the fitted estimator.

    Args:
        X_train: Training features, passed unchanged to ``fit``.
        y_train: Training targets, passed unchanged to ``fit``.
        **params: Keyword arguments for ``LogisticRegression``. They are
            layered on top of the built-in defaults, so any key given here
            overrides the default with the same name.

    Returns:
        The ``LogisticRegression`` instance after ``fit`` has been called.
    """
    from sklearn.linear_model import LogisticRegression

    # Defaults first, caller overrides second: later keys win.
    settings = {
        **{
            'class_weight': 'balanced',
            'random_state': 42,
            'max_iter': 1000,
            'solver': 'lbfgs',
        },
        **params,
    }

    classifier = LogisticRegression(**settings)
    classifier.fit(X_train, y_train)
    return classifier

In [112]:
def predict_with_model(model, X, threshold=0.5):
    """Score ``X`` with ``model`` and binarise the scores at ``threshold``.

    Args:
        model: Object exposing ``predict_proba(X)`` whose result supports
            ``[:, 1]`` indexing (a 2-D array with at least two columns).
        X: Samples to score; forwarded unchanged to ``predict_proba``.
        threshold: Cut-off applied to column 1 of the probability matrix.
            A score greater than or equal to it is labelled 1, otherwise 0.

    Returns:
        dict with three keys:
            'predictions'   - list of 0/1 labels,
            'probabilities' - list of the column-1 scores,
            'threshold'     - the threshold that was applied.
    """
    proba_matrix = model.predict_proba(X)
    # Column 1 is presumably the positive-class probability (classes_[1] for
    # scikit-learn binary classifiers).
    positive_scores = proba_matrix[:, 1]
    hard_labels = (positive_scores >= threshold).astype(int)

    return dict(
        predictions=hard_labels.tolist(),
        probabilities=positive_scores.tolist(),
        threshold=threshold,
    )

In [113]:
def evaluate_model(model, predicts, y_test):
    """Compute classification metrics for already-made predictions.

    Args:
        model: The estimator that produced the predictions; only its class
            name is used (stored under the 'model' key).
        predicts: dict with 'predictions' (hard labels) and 'probabilities'
            (scores), e.g. the output of ``predict_with_model``.
        y_test: True labels to compare against.

    Returns:
        dict with keys 'model', 'accuracy', 'precision', 'recall', 'f1' and
        'roc_auc'; every metric value is converted to a plain ``float``.

    Side effects:
        Prints how many samples were predicted as 1 and how many are truly 1.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

    y_pred = predicts['predictions']
    y_score = predicts['probabilities']

    metrics = {'model': type(model).__name__}
    metrics['accuracy'] = float(accuracy_score(y_test, y_pred))
    # These three share the same call shape; zero_division=0 makes an
    # undefined score come back as 0.
    for key, scorer in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    ):
        metrics[key] = float(scorer(y_test, y_pred, zero_division=0))
    # ROC AUC is computed from the scores, not the thresholded labels.
    metrics['roc_auc'] = float(roc_auc_score(y_test, y_score))

    print(f"\nПредсказано 1: {sum(y_pred)} из {len(y_pred)}")
    print(f"Истинных 1:    {sum(y_test)} из {len(y_test)}")

    return metrics

In [114]:
# Fit the baseline model with train_logreg's built-in default hyperparameters.
baseline = train_logreg(X_train=X_train, y_train=y_train)

In [121]:
# Snapshot the fitted estimator's hyperparameters so they can be saved with the model.
baseline_params = baseline.get_params(deep=True)

In [115]:
# Score the hold-out set; 0.7 is a stricter cut-off than the function's 0.5 default.
predicts = predict_with_model(model=baseline, X=X_test, threshold=0.7)

In [116]:
# Keep the returned dict: the save cell further down serialises `metrics` to
# JSON, and without this binding it raises NameError on a fresh kernel.
metrics = evaluate_model(baseline, predicts, y_test)
metrics


Предсказано 1: 1041 из 8238
Истинных 1:    928 из 8238


{'model': 'LogisticRegression',
 'accuracy': 0.880189366351056,
 'precision': 0.47166186359269935,
 'recall': 0.5290948275862069,
 'f1': 0.4987303199593702,
 'roc_auc': 0.8007282920184914}

In [123]:
# Save results

# Write artifacts relative to the project root instead of a machine-specific
# absolute Windows path, so the notebook is portable.
# NOTE(review): assumes the notebook runs from a direct subfolder of the project
# root, as the '../data/processed' reads in the loading cell already do.
baseline_dir = os.path.join('..', 'models', 'baseline')
os.makedirs(baseline_dir, exist_ok=True)  # first run: the folder may not exist yet

joblib.dump(baseline, os.path.join(baseline_dir, 'baseline_logreg.pkl'))

with open(os.path.join(baseline_dir, 'baseline_logreg_metrics.json'), 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=2)

with open(os.path.join(baseline_dir, 'baseline_logreg_params.json'), 'w', encoding='utf-8') as f:
    json.dump(baseline_params, f, indent=2)