In [19]:
import os
import pickle

import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from constants import INPUT_DATA_COLUMNS, OUTPUT_DATA_COLUMNS, DATA_COLUMNS_TO_DELETE, DATA_PATH


def load_data(input_columns, output_data_columns):
    """Read the selected columns of the 'Dane' worksheet into a DataFrame.

    The workbook location is taken from the module-level ``DATA_PATH``. One
    row at the top of the sheet is skipped and at most 520 rows are parsed.

    Args:
        input_columns: first group of column selectors to load.
        output_data_columns: second group of column selectors; joined to
            ``input_columns`` with ``+`` before being passed as ``usecols``.

    Returns:
        The DataFrame produced by ``pandas.read_excel``.
    """
    wanted_columns = input_columns + output_data_columns
    return pd.read_excel(
        io=DATA_PATH,
        sheet_name='Dane',
        skiprows=1,
        nrows=520,
        usecols=wanted_columns,
    )



In [20]:
def process_data(data, target_column, augmentation, feature_selection):
    """Build train/test splits of features and one target column.

    Rows with a missing value in ``target_column`` are discarded, the columns
    listed in the module-level ``INPUT_DATA_COLUMNS`` are selected as features
    and renamed according to a fixed mapping, and the result is split 80/20
    with a fixed ``random_state`` of 7.

    Args:
        data: DataFrame containing ``INPUT_DATA_COLUMNS`` and ``target_column``.
        target_column: label of the column to predict.
        augmentation: when truthy, the columns in ``DATA_COLUMNS_TO_DELETE``
            are dropped from the features (no real augmentation is done yet).
        feature_selection: currently a placeholder with no effect.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # Source column label -> name used for the model features.
    renamed_labels = {
        'C': 'c',
        'Si': 'si',
        'Mn': 'mn',
        'Mg': 'mg',
        'Cu': 'cu',
        'Ni': 'ni',
        'Mo': 'mo',
        'aust_temp': 'austTemp',
        'aust_czas': 'austTime',
        'ausf_temp': 'ausfTemp',
        'ausf_czas': 'ausfTime',
        'grubość [mm]': 'thickness',
    }

    # Only rows with a known target value are usable.
    labelled_rows = data.dropna(subset=[target_column])
    features = labelled_rows[INPUT_DATA_COLUMNS].rename(columns=renamed_labels)
    target = labelled_rows[target_column]

    # TODO: augmentation is not implemented; the flag only removes columns.
    # NOTE(review): the drop runs after the rename, so DATA_COLUMNS_TO_DELETE
    # presumably holds the renamed labels - confirm against constants.
    if augmentation:
        features = features.drop(columns=DATA_COLUMNS_TO_DELETE)

    # TODO: feature_selection is not implemented yet.
    if feature_selection:
        pass

    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.2, random_state=7
    )
    return features_train, features_test, target_train, target_test



In [21]:
def train_model(target_feature, random_state=None):
    """Fit, evaluate and pickle a gradient-boosting pipeline for one target.

    NOTE: the data is not passed in. This function reads ``X_train``,
    ``X_test``, ``y_train`` and ``y_test`` from the notebook's global
    namespace, so those names must be assigned before it is called.

    Args:
        target_feature: name used for the output file ``models/<name>.pkl``.
        random_state: optional seed forwarded to ``GradientBoostingRegressor``.
            The default ``None`` leaves the estimator unseeded, so repeated
            runs may give slightly different scores because
            ``max_features='sqrt'`` samples features at each split.

    Side effects:
        Prints RMSE and R^2 computed on the test split and writes the fitted
        pipeline to ``models/<target_feature>.pkl`` (creating the directory
        if it does not exist yet).
    """
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', GradientBoostingRegressor(
            learning_rate=0.1,
            max_depth=5,
            max_features='sqrt',
            min_samples_leaf=1,
            min_samples_split=5,
            n_estimators=200,
            random_state=random_state)),
    ])

    # Train the model.
    pipe.fit(X_train, y_train)

    # Predict on the held-out test data.
    y_pred = pipe.predict(X_test)

    # Compute RMSE and R^2. RMSE is taken as the square root of the MSE
    # because the `squared=False` flag of mean_squared_error was deprecated
    # in scikit-learn 1.4 and removed in 1.6.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    r2 = r2_score(y_test, y_pred)

    print(f'RMSE: {rmse}')
    print(f'R^2: {r2}')

    # Save the fitted pipeline to a .pkl file; make sure the target directory
    # exists so a fresh checkout does not fail with FileNotFoundError.
    results_file_name = f"models/{target_feature}.pkl"
    os.makedirs(os.path.dirname(results_file_name), exist_ok=True)
    with open(results_file_name, 'wb') as file:
        pickle.dump(pipe, file)

In [22]:
# Target columns for which a separate model is trained and saved.
PHYSICAL_PARAMETERS = ['Rm', 'Rp02', 'A5', 'HB', 'K']

# Read the workbook once: the load does not depend on the target, and
# process_data only derives new frames from `data` without modifying it.
data = load_data(INPUT_DATA_COLUMNS, OUTPUT_DATA_COLUMNS)

for parameter in PHYSICAL_PARAMETERS:
    # train_model reads these four names from the global namespace, so they
    # must be (re)assigned under exactly these names before each call.
    X_train, X_test, y_train, y_test = process_data(
        data, parameter, augmentation=True, feature_selection=False
    )
    train_model(parameter)

RMSE: 93.25693806366635
R^2: 0.8419045517689184
RMSE: 103.8236026628904
R^2: 0.7470286736956242
RMSE: 1.6062289935395262
R^2: 0.7593678443093528
RMSE: 23.927898459215125
R^2: 0.8639863043967934
RMSE: 17.87902147504623
R^2: 0.8116852436793097


Note: you may need to restart the kernel to use updated packages.
