In [2]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

In [1]:

def load_and_prepare_data(train_df_dir, seeds_number):
    """Load per-seed SEIR CSV files and assemble a single training frame.

    For every seed ``i`` in ``range(seeds_number)`` the file
    ``seir_seed_{i}.csv`` inside ``train_df_dir`` is read, rows with a missing
    ``Beta`` are dropped, and three columns are added:

    * ``prev_I`` -- column ``I`` shifted by ``-2`` rows, gaps filled with 0;
    * ``seed``   -- the seed index ``i``;
    * ``day``    -- 0-based row position inside the filtered seed frame.

    The per-seed frames are then concatenated, rows with ``Beta <= 0`` are
    removed, and ``log_Beta = log(Beta)`` is added.

    Args:
        train_df_dir: Directory that holds the ``seir_seed_{i}.csv`` files.
            A trailing path separator is optional.
        seeds_number: How many seed files to read, numbered from 0.

    Returns:
        pandas.DataFrame with the CSV columns plus ``prev_I``, ``seed``,
        ``day`` and ``log_Beta``. The index is not reset after the final
        ``Beta > 0`` filter.

    Raises:
        ValueError: If ``seeds_number`` is not positive.
    """
    if seeds_number <= 0:
        # pd.concat([]) would fail anyway with a far less helpful message.
        raise ValueError(f"seeds_number must be positive, got {seeds_number}")

    seed_frames = []
    for i in range(seeds_number):
        seed_df = pd.read_csv(os.path.join(train_df_dir, f'seir_seed_{i}.csv'))
        # .copy() detaches the filtered rows so that the column assignments
        # below do not trigger pandas' SettingWithCopyWarning.
        seed_df = seed_df[pd.notna(seed_df['Beta'])].copy()
        # NOTE(review): shift(-2) takes I from two rows *later*, so despite
        # its name this column holds a look-ahead value. Kept unchanged
        # because it defines the feature the model is fitted on -- confirm
        # whether shift(2) was intended.
        seed_df['prev_I'] = seed_df['I'].shift(-2).fillna(0)
        seed_df['seed'] = i
        # Counted after the NaN-Beta rows were removed, i.e. a row position
        # in the filtered frame rather than in the raw file.
        seed_df['day'] = np.arange(len(seed_df))
        seed_frames.append(seed_df)

    train_df = pd.concat(seed_frames, ignore_index=True)
    # The logarithm is only defined for strictly positive Beta.
    train_df = train_df[train_df['Beta'] > 0].copy()
    train_df['log_Beta'] = np.log(train_df['Beta'])
    return train_df

def train_model(train_df, model_file, random_state=42):
    """Fit a degree-3 polynomial SGD regression for ``log_Beta`` and save it.

    The features ``day``, ``S``, ``E``, ``I``, ``R`` and ``prev_I`` are
    standardized, expanded to polynomial terms up to degree 3 (no bias
    column) and fed to an L2-regularized ``SGDRegressor`` (``alpha=0.1``,
    ``max_iter=5000``).

    Args:
        train_df: DataFrame containing the six feature columns and the
            ``log_Beta`` target column.
        model_file: Destination passed to ``joblib.dump`` for the fitted
            pipeline.
        random_state: Seed forwarded to ``SGDRegressor`` so that repeated
            runs produce the same model. Pass ``None`` for an unseeded run.

    Returns:
        The fitted scikit-learn pipeline (also written to ``model_file``).
    """
    feature_columns = ['day', 'S', 'E', 'I', 'R', 'prev_I']
    X = train_df[feature_columns].values
    y = train_df['log_Beta'].values

    model = make_pipeline(
        StandardScaler(),
        PolynomialFeatures(include_bias=False, degree=3),
        SGDRegressor(max_iter=5000, penalty='l2', alpha=0.1,
                     warm_start=False, random_state=random_state))
    model.fit(X, y)

    # Persist the fitted pipeline (scaler + polynomial expansion + regressor).
    joblib.dump(model, model_file)
    return model

# Run configuration: output path of the fitted model, number of seed files
# to read, and the directory they are read from.
model_file = 'regression_day_SEIR_prev_I_for_seir.joblib'
seeds_number = 1200
train_df_dir = 'data/train/'

# Build the training frame, then fit the model and write it to model_file.
train_df = load_and_prepare_data(train_df_dir=train_df_dir, seeds_number=seeds_number)
train_model(train_df=train_df, model_file=model_file)


NameError: name 'pd' is not defined

In [3]:

def load_and_prepare_data(train_df_dir, seeds_number):
    """Load per-seed SEIR CSV files and assemble a single training frame.

    For every seed ``i`` in ``range(seeds_number)`` the file
    ``seir_seed_{i}.csv`` inside ``train_df_dir`` is read, rows with a missing
    ``Beta`` are dropped, and three columns are added:

    * ``prev_I`` -- column ``I`` shifted by ``-2`` rows, gaps filled with 0;
    * ``seed``   -- the seed index ``i``;
    * ``day``    -- 0-based row position inside the filtered seed frame.

    The per-seed frames are then concatenated, rows with ``Beta <= 0`` are
    removed, and ``log_Beta = log(Beta)`` is added.

    Args:
        train_df_dir: Directory that holds the ``seir_seed_{i}.csv`` files.
            A trailing path separator is optional.
        seeds_number: How many seed files to read, numbered from 0.

    Returns:
        pandas.DataFrame with the CSV columns plus ``prev_I``, ``seed``,
        ``day`` and ``log_Beta``. The index is not reset after the final
        ``Beta > 0`` filter.

    Raises:
        ValueError: If ``seeds_number`` is not positive.
    """
    if seeds_number <= 0:
        # pd.concat([]) would fail anyway with a far less helpful message.
        raise ValueError(f"seeds_number must be positive, got {seeds_number}")

    seed_frames = []
    for i in range(seeds_number):
        seed_df = pd.read_csv(os.path.join(train_df_dir, f'seir_seed_{i}.csv'))
        # .copy() detaches the filtered rows so that the column assignments
        # below do not trigger pandas' SettingWithCopyWarning.
        seed_df = seed_df[pd.notna(seed_df['Beta'])].copy()
        # NOTE(review): shift(-2) takes I from two rows *later*, so despite
        # its name this column holds a look-ahead value. Kept unchanged
        # because it defines the feature the model is fitted on -- confirm
        # whether shift(2) was intended.
        seed_df['prev_I'] = seed_df['I'].shift(-2).fillna(0)
        seed_df['seed'] = i
        # Counted after the NaN-Beta rows were removed, i.e. a row position
        # in the filtered frame rather than in the raw file.
        seed_df['day'] = np.arange(len(seed_df))
        seed_frames.append(seed_df)

    train_df = pd.concat(seed_frames, ignore_index=True)
    # The logarithm is only defined for strictly positive Beta.
    train_df = train_df[train_df['Beta'] > 0].copy()
    train_df['log_Beta'] = np.log(train_df['Beta'])
    return train_df

def train_model(train_df, model_file, random_state=42):
    """Grid-search an SGD regression for ``log_Beta`` and save the best one.

    The pipeline standardizes the features ``day``, ``S``, ``E``, ``I``,
    ``R`` and ``prev_I``, expands them to polynomial terms up to degree 3
    (no bias column) and fits an ``SGDRegressor``. ``GridSearchCV`` tries
    every combination of ``alpha`` in ``[0.1, 1]`` and ``penalty`` in
    ``['l2', 'l1', None]`` with ``max_iter=5000``, using 5-fold
    cross-validation scored by negative mean squared error.

    The best parameters and their score are printed, and the refitted best
    pipeline is written to ``model_file``.

    Args:
        train_df: DataFrame containing the six feature columns and the
            ``log_Beta`` target column.
        model_file: Destination passed to ``joblib.dump`` for the best
            pipeline.
        random_state: Seed forwarded to ``SGDRegressor`` so that repeated
            runs compare the candidates on equal footing and reproduce the
            same scores. Pass ``None`` for an unseeded run.

    Returns:
        The best fitted scikit-learn pipeline (``best_estimator_``).
    """
    feature_columns = ['day', 'S', 'E', 'I', 'R', 'prev_I']
    X = train_df[feature_columns].values
    y = train_df['log_Beta'].values

    # Grid-search parameters, addressed through the pipeline step name.
    param_grid = {
        'sgdregressor__alpha': [0.1, 1],   # regularization strength
        'sgdregressor__max_iter': [5000],  # maximum number of epochs
        'sgdregressor__penalty': ['l2', 'l1', None],  # regularization type
    }

    model = make_pipeline(
        StandardScaler(),
        PolynomialFeatures(include_bias=False, degree=3),
        SGDRegressor(warm_start=False, random_state=random_state)
    )

    # 5-fold cross-validation, scored by (negated) mean squared error.
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X, y)

    # Report the winning parameter combination and its cross-validated score.
    print(f"Лучшие параметры: {grid_search.best_params_}")
    print(f"Лучший результат (негативная среднеквадратичная ошибка): {grid_search.best_score_}")

    # Persist the best pipeline found by the search.
    best_model = grid_search.best_estimator_
    joblib.dump(best_model, model_file)
    return best_model

# Run configuration: output path of the fitted model, number of seed files
# to read, and the directory they are read from.
model_file = 'regression_day_SEIR_prev_I_for_seir.joblib'
seeds_number = 1200
train_df_dir = 'data/train/'

# Build the training frame, then fit the model and write it to model_file.
train_df = load_and_prepare_data(train_df_dir=train_df_dir, seeds_number=seeds_number)
train_model(train_df=train_df, model_file=model_file)


Лучшие параметры: {'sgdregressor__alpha': 0.1, 'sgdregressor__max_iter': 5000, 'sgdregressor__penalty': 'l2'}
Лучший результат (негативная среднеквадратичная ошибка): -0.22562586163866385
