In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
import joblib

In [2]:
def load_and_prepare_data(train_dir, val_dir, seeds_number_train, seeds_number_val):
    """Read the per-seed SEIR CSV files and build one frame for fitting.

    Files named ``seir_seed_<i>.csv`` are read for ``i`` in
    ``range(seeds_number_train)`` from ``train_dir`` and for ``i`` in
    ``range(seeds_number_val)`` from ``val_dir``. Each file gets a ``day``
    column holding the 0-based row position inside that file. The frames are
    then concatenated (training files first, validation files second), rows
    whose ``Beta`` is not strictly positive are dropped, and a ``log_Beta``
    column with the natural log of ``Beta`` is added.

    Parameters
    ----------
    train_dir, val_dir : str or path-like
        Directories holding the CSV files. A trailing path separator is
        optional.
    seeds_number_train, seeds_number_val : int
        How many consecutively numbered seed files to read from each
        directory.

    Returns
    -------
    pandas.DataFrame
        The combined, filtered frame including the ``day`` and ``log_Beta``
        columns. The index is the position in the concatenated frame before
        filtering, so it can contain gaps.

    Raises
    ------
    ValueError
        If both seed counts are zero, so there is nothing to load.
    """
    # Local import keeps this function self-contained within the notebook.
    import os

    frames = []
    sources = ((train_dir, seeds_number_train), (val_dir, seeds_number_val))
    for directory, seeds_number in sources:
        for i in range(seeds_number):
            # os.path.join works with or without a trailing separator on
            # `directory`; plain string concatenation would silently build a
            # wrong file name when the separator is missing.
            csv_path = os.path.join(directory, f'seir_seed_{i}.csv')
            seed_df = pd.read_csv(csv_path)
            seed_df['day'] = np.arange(len(seed_df))
            frames.append(seed_df)

    if not frames:
        raise ValueError(
            'No seed files requested: seeds_number_train and '
            'seeds_number_val are both 0.'
        )

    train_df = pd.concat(frames, ignore_index=True)
    # The log is only defined for strictly positive values; `> 0` also drops NaN.
    train_df = train_df[train_df['Beta'] > 0].copy()
    train_df['log_Beta'] = np.log(train_df['Beta'])
    return train_df

def train_model(train_df, model_file, random_state=None):
    """Fit a cubic polynomial regression of ``log_Beta`` on ``day`` and save it.

    The estimator is a scikit-learn pipeline: ``StandardScaler`` ->
    ``PolynomialFeatures(degree=3, include_bias=False)`` -> ``SGDRegressor``
    with no penalty. The fitted pipeline is serialized to ``model_file`` with
    ``joblib.dump``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain the columns ``day`` (the single feature) and
        ``log_Beta`` (the target).
    model_file : str or path-like
        Destination file for the serialized pipeline.
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to ``SGDRegressor``. Pass an integer to make the fit
        reproducible; the default ``None`` leaves the regressor unseeded,
        as it was before this parameter existed.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline, i.e. the same object that was written to
        ``model_file``.
    """
    # Double brackets keep X two-dimensional (n_samples, 1), as scikit-learn expects.
    X = train_df[['day']].values
    y = train_df['log_Beta'].values

    model = make_pipeline(
        StandardScaler(),
        PolynomialFeatures(include_bias=False, degree=3),
        # NOTE(review): with penalty=None there is no regularization term for
        # alpha to scale, so alpha=1 presumably has no effect under the
        # default learning-rate schedule - confirm before tuning it.
        SGDRegressor(
            alpha=1,
            max_iter=5000,
            penalty=None,
            warm_start=False,
            random_state=random_state,
        ),
    )

    model.fit(X, y)

    joblib.dump(model, model_file)
    return model

# --- Run configuration -------------------------------------------------------
# Output file for the serialized, fitted pipeline.
model_file = 'regression_day_for_seir.joblib'

# Input directories and the number of seir_seed_<i>.csv files read from each.
train_dir = 'train/'
seeds_number_train = 960
val_dir = 'val/'
seeds_number_val = 240

# --- Load both splits into a single frame, then fit and save the model -------
train_df = load_and_prepare_data(
    train_dir=train_dir,
    val_dir=val_dir,
    seeds_number_train=seeds_number_train,
    seeds_number_val=seeds_number_val,
)
train_model(train_df=train_df, model_file=model_file)