In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import requests
import os
import random
import warnings
# Suppress warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    import shap

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score, mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# Global matplotlib settings:
# - render text with the 'Malgun Gothic' font family (presumably so the Korean
#   column names used as labels display correctly — confirm the font is installed);
# - draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

In [2]:
# Load the raw dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs on a machine where this exact file exists.
df = pd.read_csv(
    'C:/Users/User/Downloads/Kyushu Datasheets/data_with_press_and_weather.csv',
    low_memory=False,
)

# When the frame contains any missing value, keep only the rows that have at
# least (number_of_columns // 2 + 1) non-null entries. With an even number of
# columns this also drops rows that are exactly half empty.
missing_count = df.isna().sum().sum()
if missing_count == 0:
    print("No missing values found.")
else:
    threshold = df.shape[1] // 2
    df = df.dropna(thresh=threshold + 1)
    print(f"Rows with more than half NaN dropped. Remaining rows: {len(df)}")

Rows with more than half NaN dropped. Remaining rows: 2811


In [3]:
# Coerce '강번' to numeric (unparseable entries become NaN), then keep only rows
# whose '강번' is present and within [9000, 500000000] and whose '수율' is
# within [60, 100]. Both range checks are inclusive.
df = df.assign(**{'강번': pd.to_numeric(df['강번'], errors='coerce')})
df = (
    df.dropna(subset=['강번'])
      .loc[lambda frame: frame['강번'].between(9000, 500000000)]
      .loc[lambda frame: frame['수율'].between(60, 100)]
)

# Snapshot of the filtered rows. It is written before the date column is
# parsed, so '날짜' is saved exactly as it was read from the source file.
df.to_csv('For_my_use_1.csv', index=False)

# Parse the date column; values that cannot be parsed become NaT.
df = df.assign(**{'날짜': pd.to_datetime(df['날짜'], errors='coerce')})

In [4]:
# Target variable
# Name of the column to predict; it is read as df[target] when y is built and
# is also used for plot labels by evaluate_model.
target = 'CC_P'

# The triple-quoted string below is a bare expression statement, i.e. a
# disabled code block with no runtime effect. If re-enabled it would drop the
# rows whose 'Precipitation (mm)' value is greater than 10.
'''
# Outlier removal
if 'Precipitation (mm)' in df.columns:
    initial_count_oxygen = len(df)
    df = df[df['Precipitation (mm)'] <= 10]
    removed_count_oxygen = initial_count_oxygen - len(df)
    print(f"Removed {removed_count_oxygen} rows where {'Precipitation (mm)'} > 10.")
else:
    print(f"Warning: '{'Precipitation (mm)'}' column not found.")
'''
# chin_variables = [col for col in df.columns if '친' in col and '친단' not in col]
# df['chin_total'] = df[chin_variables].sum(axis=1)

# Column names selected into the feature matrix X (together with the process
# columns listed where X is built). Commented-out entries are excluded.
# NOTE(review): the '# for GPR' tag presumably marks columns excluded for a
# Gaussian-process-regression experiment — confirm with the author.
# The names mix ASCII spaces and full-width (U+3000) spaces and parentheses;
# they must stay byte-exact to match the DataFrame's column labels.
scraps_col = [
    '스크랩_S01_친',
    '스크랩_S01_자',
    # '스크랩_S01_손', # for GPR
    'A부스러기　친',
    'A부스러기 자',
    'A부스러기 손',
    # '친단　친', # for GPR
    '친단　자',
    '친단　손',
    '와셔 친',
    # '와셔 자', # for GPR
    # '와셔 손', # for GPR
    'B부스러기　친',
    'B부스러기 자',
    'B부스러기 손',
    '선다라이 친',
    # '선다라이 자', # for GPR
    # '선다라이 손', # for GPR
    # '류선 친', # for GPR
    '류선 자',
    # '류선 손', # for GPR
    '시타마와리(자동차 부품으로 예상됨)　친',
    # '시타마와리(자동차 부품으로 예상됨)　자', # for GPR
    # '시타마와리(자동차 부품으로 예상됨)　손', # for GPR
    '엔진 친',
    # '엔진 자',
    # '엔진 손',
    '슈레더 친',
    '슈레더 자',
    '슈레더 손',
    'C프레스 친',
    'C프레스 자',
    # 'C프레스 손', # for GPR
    # '다라이가루 친', # for GPR
    '다라이가루 자',
    '다라이가루 손',
    '강류 친',
    # '강류 자', # for GPR
    # '강류 손', # for GPR
    '타이어 친',
    # '타이어　자', # for GPR
    # '타이어　손', # for GPR
    '주물　친',
    # '주물　자', # for GPR
    # '주물　손', # for GPR
    # '알루미더스트　친', # for GPR
    # '알루미더스트　자', # for GPR
    # '알루미더스트　손', # for GPR
    # 'AB프레스　친', # for GPR
    # 'AB프레스　자', # for GPR
    # 'AB프레스　손', # for GPR
    # 'Mn강　친', # for GPR
    # 'Mn강　자', # for GPR
    # 'Mn강　손', # for GPR
    # '캔버서　친', # for GPR
    # '캔버서　자', # for GPR
    # '캔버서　손', # for GPR
    # '페인트캔　친', # for GPR
    # '페인트캔　자', # for GPR
    # '페인트캔　손', # for GPR
    'V프레스　친',
    # 'V프레스　자', # for GPR
    # 'V프레스　손', # for GPR
    # '캔프레스　친', # for GPR
    '캔프레스　자',
    # '캔프레스　손', # for GPR
    '빌렛부스러기　친',
    # '빌렛부스러기 자', # for GPR
    # '빌렛부스러기 손', # for GPR
    # 'SC（레들）바탕쇠　친', # for GPR
    'SC（레들）바탕쇠　자',
    # 'SC（레들）바탕쇠　손', # for GPR
    # 'EF바탕쇠　친', # for GPR
    'EF바탕쇠　자',
    # 'EF바탕쇠　손', # for GPR
    # 'CC바탕쇠　친', # for GPR
    # 'CC바탕쇠　자', # for GPR
    # 'CC바탕쇠　손', # for GPR
    '정비부스러기　친',
    # '정비부스러기　자', # for GPR
    # '정비부스러기　손', # for GPR
]

def _columns_with_suffix(columns, suffix):
    """Return the string column labels whose name, ignoring surrounding whitespace, ends with `suffix`."""
    return [
        col for col in columns
        if isinstance(col, str) and col.strip().endswith(suffix)
    ]


# Group the columns by their trailing category marker ('친', '자' or '손').
# Matching on the suffix instead of on a substring matters because:
#   * '자' also occurs inside "자동차" in the '시타마와리(자동차 ...)' column names,
#     so a substring test put their '친' and '손' columns into the '자' group;
#   * excluding every name containing '친단' (needed by the substring test to
#     avoid '친단　자' / '친단　손') also dropped '친단　친' from the '친' group.
chin_variables = _columns_with_suffix(df.columns, '친')
ja_variables = _columns_with_suffix(df.columns, '자')
son_variables = _columns_with_suffix(df.columns, '손')

# All category-tagged columns, in 친 -> 자 -> 손 order.
scraps = chin_variables + ja_variables + son_variables

# Feature matrix: the hand-picked columns in scraps_col plus the process
# columns listed here. The target vector is the single column named by `target`.
X = df[scraps_col + ['MD_C', 'MD_Mn', 'MD_Cr', 'MD_Si', 'MD_S', '장입량t',
                     'TAP-TAP', '온도', 'CaO', '사용전력량', 'O2사용량_합계'
                     ]]
y = df[target]

In [5]:
# Feature pipeline
# For every additive column (e.g. 'C-inj(佐山)') the per-grade mean and standard
# deviation are computed from the statistics source (the training data).
# Each row then receives the value recorded for the immediately preceding heat
# of the same grade ('강종명'), provided that value passes the validity check.
# When there is no preceding heat, or its value fails the check, the per-grade
# mean from the statistics source is used instead.
# When no mean is available for the grade either, the value is left as NaN.
#
# Validity check: the value must lie within mean +/- n_std standard deviations.

def feature_pipeline(df, additives_col, etc_col, train_df_for_stats=None, time_col='time', n_std=3, drop_na=True):
    """
    Replace each additive column with a lagged, validity-checked value.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to transform. Must contain '강종명', `time_col`, every column in
        `additives_col` and every column in `etc_col`. It is not modified.
    additives_col : list of str
        Columns to overwrite with the previous same-grade value or the grade mean.
    etc_col : list of str
        Columns removed from the first returned frame only.
    train_df_for_stats : pandas.DataFrame, optional
        Source of the per-grade mean/std ("inference mode"). When omitted the
        statistics are computed from `df` itself ("training mode").
    time_col : str
        Column used to order the rows before looking up the previous heat.
    n_std : float
        Half-width of the accepted band, in per-grade standard deviations.
    drop_na : bool
        When True, rows that still hold NaN in any additive column are dropped.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The processed frame without `etc_col`, and the same processed frame
        with `etc_col` kept. Both are sorted by `time_col`.
    """
    work = df.copy()

    # Choose where the per-grade statistics come from.
    if train_df_for_stats is None:
        stats_source = work
        print("Running in TRAINING mode: Using stats from the input df itself.")
    else:
        stats_source = train_df_for_stats
        print("Running in INFERENCE mode: Using stats from provided train_df.")

    # Per-grade mean and std for every additive column (two-level columns).
    grade_stats = stats_source.groupby('강종명')[additives_col].agg(['mean', 'std'])

    # Chronological order is required so that shift(1) yields the previous heat.
    work = work.sort_values(by=time_col)

    for col in additives_col:
        # Value of the preceding row within the same grade (NaN for the first one).
        previous = work.groupby('강종명')[col].shift(1)

        # Broadcast the grade-level statistics onto the rows.
        grade_mean = work['강종명'].map(grade_stats[col]['mean'])
        grade_std = work['강종명'].map(grade_stats[col]['std'])

        lower = grade_mean - n_std * grade_std
        upper = grade_mean + n_std * grade_std

        # A previous value is usable only when it exists and lies inside the band.
        # Comparisons against NaN bounds are False, so such rows fall back too.
        inside_band = (previous >= lower) & (previous <= upper)
        usable = previous.notna() & inside_band

        fallback_count = (~usable).sum()
        if fallback_count > 0:
            print(f"  - Using mean for '{col}' {fallback_count} times.")

        # Start from the grade mean and overlay the usable previous values.
        work[col] = grade_mean.mask(usable, previous)

    if drop_na:
        rows_before = len(work)
        work = work.dropna(subset=additives_col)
        print(f'  - NaN instances dropped: {rows_before - len(work)}')

    return work.drop(columns=etc_col), work

In [6]:
# Train/test split by date.
# Per the author's note, the training data covers January-September 2024 and the
# test data covers October-December 2024; the code itself only applies the
# cutoff date below.
# NOTE(review): `selected_df` is not defined in any earlier visible cell, and
# the recorded output of this cell is
# "NameError: name 'selected_df' is not defined". It must be created (with a
# 'time' column) before this cell can run on a fresh kernel.
test_start_date = '2024-10-01'

# Rows strictly before the cutoff form the training set ...
train_df = selected_df.loc[selected_df['time'] < test_start_date]

# ... and rows on or after the cutoff form the test set.
test_df = selected_df.loc[selected_df['time'] >= test_start_date]



NameError: name 'selected_df' is not defined

In [None]:
# train_df.to_csv('temp_train_df_before.csv', encoding='utf-8-sig')
# test_df.to_csv('temp_test_df_before.csv', encoding='utf-8-sig')

# NOTE(review): `additives_col` and `etc_col` are not defined in any visible
# cell; they must exist before this cell runs.

# --- Scenario 1: training ---
# Only the training frame is passed, so feature_pipeline derives the per-grade
# statistics (mean, standard deviation) from the training data itself.
print("--- PROCESSING TRAINING DATA ---")
processed_train_df, processed_train_df_with_etc_col = feature_pipeline(
    train_df, additives_col, etc_col
)

# --- Scenario 2: inference ---
# The test frame is transformed, but the statistics come from the training
# frame. Per the author's note, the frame passed as `df` should hold the data
# recorded after the end of the training period (up to the inference time).
print("--- PROCESSING TEST DATA ---")
processed_test_df, processed_test_df_with_etc_col = feature_pipeline(
    test_df, additives_col, etc_col, train_df_for_stats=train_df
)

# processed_train_df_with_etc_col.to_csv('temp_train_df_after.csv', encoding='utf-8-sig')
# processed_test_df_with_etc_col.to_csv('temp_test_df_after.csv', encoding='utf-8-sig')

In [None]:
# Separate features from the target: drop the time and target columns to get
# the feature matrices, and select the target column(s) for the label sets.
# NOTE(review): `time_col` and `target_col` are not defined in any visible cell;
# `time_col + target_col` presumably concatenates two lists of column names —
# confirm.
X_train, X_test = (
    frame.drop(columns=time_col + target_col)
    for frame in (processed_train_df, processed_test_df)
)

y_train, y_test = (
    frame[target_col]
    for frame in (processed_train_df, processed_test_df)
)

In [None]:
# --- Model training and evaluation functions ---

def evaluate_model(name, model, X_train, y_train, X_test, y_test, df):
    """
    Print train/test regression metrics for a fitted model and plot actual vs. predicted values.

    Parameters
    ----------
    name : str
        Label used in the printed headings and in the plot title.
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_train, X_test : pandas.DataFrame
        Feature frames. Their index labels are used to look up '강번' in `df`.
    y_train, y_test : array-like
        True target values (Series, single-column DataFrame or ndarray).
        They are flattened to 1-D before scoring.
    df : pandas.DataFrame
        Frame holding a '강번' column and containing every index label of
        `X_train` and `X_test`.

    Returns
    -------
    None
        Kept so that cells ending in a call to this function display nothing
        besides the printed scores and the figures.

    Notes
    -----
    The axis label and title read the module-level variable ``target``.
    A later cell in this notebook redefines ``evaluate_model``; whichever
    definition was executed last is the one in effect.
    """
    print(f"\n--- {name} Model Evaluation ---")

    # Flatten everything to 1-D so that column-shaped predictions or
    # single-column targets are scored and plotted element by element.
    train_pred = np.ravel(model.predict(X_train))
    test_pred = np.ravel(model.predict(X_test))
    y_train_1d = np.ravel(np.asarray(y_train))
    y_test_1d = np.ravel(np.asarray(y_test))

    # --- Calculate Metrics ---
    scores = {}
    for split, actual, predicted in (("Train", y_train_1d, train_pred),
                                     ("Test", y_test_1d, test_pred)):
        mse = mean_squared_error(actual, predicted)
        scores[split] = {
            "mse": mse,
            "rmse": np.sqrt(mse),
            "mae": mean_absolute_error(actual, predicted),
            # scikit-learn's MAPE clamps the denominator away from zero, so a
            # target equal to 0 no longer yields inf/NaN. It returns a
            # fraction, hence the factor 100 to report a percentage.
            "mape": mean_absolute_percentage_error(actual, predicted) * 100,
            "r2": r2_score(actual, predicted),
        }

    # --- Print Metrics ---
    for split, lead in (("Train", ""), ("Test", "\n")):
        split_scores = scores[split]
        print(f"{lead}{name} {split} Scores:")
        print(f"  MSE:  {split_scores['mse']:.2f}")
        print(f"  RMSE: {split_scores['rmse']:.2f}")
        print(f"  MAE:  {split_scores['mae']:.2f}")
        print(f"  MAPE: {split_scores['mape']:.2f}%")
        print(f"  R²:   {split_scores['r2']:.3f}")

    r2_test = scores["Test"]["r2"]
    rmse_test = scores["Test"]["rmse"]

    # Plot
    train_df = df[['강번']].loc[X_train.index].copy()
    train_df['Actual'] = y_train_1d
    train_df['Predicted'] = train_pred
    train_df['Set'] = 'Train'

    test_df = df[['강번']].loc[X_test.index].copy()
    test_df['Actual'] = y_test_1d
    test_df['Predicted'] = test_pred
    test_df['Set'] = 'Test'

    # NOTE(review): rows are ordered by '강번' and the axis is labelled
    # "chronological"; this assumes '강번' increases with time — confirm.
    combined_df = pd.concat([train_df, test_df]).sort_values('강번').reset_index(drop=True)
    combined_df['Time Index'] = range(len(combined_df))

    plt.figure(figsize=(16, 4))
    plt.plot(combined_df['Time Index'], combined_df['Actual'], label='Actual', linewidth=2)
    plt.plot(combined_df['Time Index'], combined_df['Predicted'], label='Predicted', linestyle='--', linewidth=2)
    test_start = combined_df[combined_df['Set'] == 'Test']['Time Index'].min()
    # min() of an empty selection is NaN; only draw the marker when it exists.
    if pd.notnull(test_start):
        plt.axvline(x=test_start, color='red', linestyle=':', label='Test Start')
    plt.xlabel('Time Index (chronological)')
    plt.ylabel(target)
    plt.title(f'Actual vs Predicted {target} ({name}) - R² = {r2_test:.3f}, RMSE = {rmse_test:.1f}')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Test-only plot
    test_df = test_df.sort_values('강번').reset_index(drop=True)
    test_df['Time Index'] = range(len(test_df))

    plt.figure(figsize=(18, 3))
    plt.plot(test_df['Time Index'], test_df['Actual'], label='Actual', linewidth=2)
    plt.plot(test_df['Time Index'], test_df['Predicted'], label='Predicted', linestyle='--', linewidth=2)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# --- Linear Regression ---
# Ordinary least-squares baseline with default settings; fit() returns the
# estimator itself, so construction and fitting are chained.
lin_model = LinearRegression().fit(X_train, y_train)
evaluate_model("Linear Regression", lin_model, X_train, y_train, X_test, y_test, df)

In [None]:
# --- XGBoost ---
# Gradient-boosted trees with squared-error objective: 1000 trees of depth 4
# at learning rate 0.005, seeded for repeatability.
xgb_model = XGBRegressor(
    n_estimators=1000,
    max_depth=4,
    learning_rate=0.005,
    objective='reg:squarederror',
    random_state=42,
).fit(X_train, y_train)
evaluate_model("XGBoost", xgb_model, X_train, y_train, X_test, y_test, df)

In [None]:
# --- Random Forest ---
# 100 unrestricted-depth trees, seeded for repeatability, trained using all
# available CPU cores (n_jobs=-1).
rf_model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
    max_depth=None,
    n_jobs=-1,
).fit(X_train, y_train)
evaluate_model("Random Forest", rf_model, X_train, y_train, X_test, y_test, df)

In [None]:

# === Added on 2025-08-08 04:07:56: safer evaluator for SVM/ANN (works with 1D/2D preds) ===
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(name, model, X_train, y_train, X_test, y_test, df):
    """
    Score a fitted model on the train and test sets, print the scores and plot the fit.

    Predictions and targets may be 1-D or 2-D; both are flattened before use.
    MAPE replaces targets that are exactly zero with machine epsilon in the
    denominator so the division never fails.

    Parameters
    ----------
    name : str
        Label used in the printed headings and the plot title.
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_train, X_test : pandas.DataFrame
        Feature frames; their index labels are looked up in `df`.
    y_train, y_test : array-like
        True target values.
    df : pandas.DataFrame
        Frame with a '강번' column whose index contains the labels of
        `X_train` and `X_test`.

    Returns
    -------
    pandas.DataFrame
        Test rows sorted by '강번' with the columns '강번', 'Actual',
        'Predicted', 'Set' and 'Time Index'.

    Notes
    -----
    The axis label and plot title read the module-level variable ``target``.
    """
    print(f"\n--- {name} Model Evaluation ---")

    # Predict first (train, then test), then flatten predictions and targets.
    raw_train_pred = model.predict(X_train)
    raw_test_pred = model.predict(X_test)
    train_pred = np.ravel(raw_train_pred)
    test_pred = np.ravel(raw_test_pred)
    ytr = np.ravel(np.array(y_train))
    yte = np.ravel(np.array(y_test))

    def _scores(actual, predicted):
        # Same five metrics for either split, computed in a fixed order.
        mse = mean_squared_error(actual, predicted)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(actual, predicted)
        # Safe MAPE: exact zeros in the target are swapped for epsilon.
        denominator = np.where(actual == 0, np.finfo(float).eps, actual)
        mape = np.mean(np.abs((actual - predicted) / denominator)) * 100
        r2 = r2_score(actual, predicted)
        return {'mse': mse, 'rmse': rmse, 'mae': mae, 'mape': mape, 'r2': r2}

    train_scores = _scores(ytr, train_pred)
    test_scores = _scores(yte, test_pred)
    r2_test = test_scores['r2']
    rmse_test = test_scores['rmse']

    # The test heading is preceded by a blank line, the train heading is not.
    for lead, split, split_scores in (("", "Train", train_scores), ("\n", "Test", test_scores)):
        print(f"{lead}{name} {split} Scores:")
        print(f"  MSE:  {split_scores['mse']:.2f}")
        print(f"  RMSE: {split_scores['rmse']:.2f}")
        print(f"  MAE:  {split_scores['mae']:.2f}")
        print(f"  MAPE: {split_scores['mape']:.2f}%")
        print(f"  R²:   {split_scores['r2']:.3f}")

    def _labelled_frame(index, actual, predicted, split):
        # Relies on `df` holding a '강번' column and on `index` being labels of `df`.
        frame = df[['강번']].loc[index].copy()
        frame['Actual'] = actual
        frame['Predicted'] = predicted
        frame['Set'] = split
        return frame

    train_df = _labelled_frame(X_train.index, ytr, train_pred, 'Train')
    test_df = _labelled_frame(X_test.index, yte, test_pred, 'Test')

    # Combined view: both splits ordered by '강번' on a shared running index.
    combined_df = pd.concat([train_df, test_df])
    combined_df = combined_df.sort_values('강번').reset_index(drop=True)
    combined_df['Time Index'] = range(len(combined_df))

    plt.figure(figsize=(16, 4))
    plt.plot(combined_df['Time Index'], combined_df['Actual'], label='Actual', linewidth=2)
    plt.plot(combined_df['Time Index'], combined_df['Predicted'], label='Predicted', linestyle='--', linewidth=2)
    is_test_row = combined_df['Set'] == 'Test'
    test_start = combined_df.loc[is_test_row, 'Time Index'].min()
    # Mark the first test row, unless there are no test rows (min() is NaN).
    if not pd.isnull(test_start):
        plt.axvline(x=test_start, color='red', linestyle=':', label='Test Start')
    plt.xlabel('Time Index (chronological)')
    plt.ylabel(target)
    plt.title(f'Actual vs Predicted {target} ({name}) - R² = {r2_test:.3f}, RMSE = {rmse_test:.1f}')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Test-only view, re-indexed from zero.
    test_df = test_df.sort_values('강번').reset_index(drop=True)
    test_df['Time Index'] = range(len(test_df))

    plt.figure(figsize=(18, 3))
    plt.plot(test_df['Time Index'], test_df['Actual'], label='Actual', linewidth=2)
    plt.plot(test_df['Time Index'], test_df['Predicted'], label='Predicted', linestyle='--', linewidth=2)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    return test_df


In [None]:

# === Added on 2025-08-08 04:07:56: SVM (SVR) and ANN (MLPRegressor) implementations ===
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

# SVR pipeline: standardize the features, then fit an RBF-kernel support
# vector regressor. Pipeline.fit() returns the pipeline, so it is chained.
# NOTE(review): the target is not scaled while epsilon is fixed at 0.1; check
# that this tube width is sensible for the target's scale.
svr_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svr', SVR(C=10.0, kernel='rbf', gamma='scale', epsilon=0.1)),
]).fit(X_train, y_train)

svr_test_df = evaluate_model("SVR (RBF)", svr_pipeline, X_train, y_train, X_test, y_test, df)

# ANN pipeline: standardize the features, then fit a two-hidden-layer
# (128, 64) ReLU perceptron trained with Adam, with early stopping enabled and
# a fixed seed.
ann_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('mlp', MLPRegressor(
        hidden_layer_sizes=(128, 64),
        activation='relu',
        solver='adam',
        learning_rate='adaptive',
        max_iter=1000,
        early_stopping=True,
        random_state=42,
    )),
]).fit(X_train, y_train)

ann_test_df = evaluate_model("ANN (MLPRegressor)", ann_pipeline, X_train, y_train, X_test, y_test, df)
