In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# --- PART 1: DATA PREPARATION (Assuming you ran the previous steps) ---
# We assume 'kaggle_pairs' and 'df_local' are already loaded from your previous cells.
# kaggle_pairs = process_kaggle_data(df)
# df_local = pd.read_csv("localdataset.csv")

# --- PART 2: BASE MODEL (KAGGLE) - TRAINING & EVALUATION ---

print("--- 1. Training Base Model (Random Forest) ---")

# 1. Split Kaggle Data into Train (80%) and Test (20%)
# This allows us to check accuracy on data the model hasn't seen.
X_k = kaggle_pairs[['Strength_7']]
y_k = kaggle_pairs['Strength_28']

X_train_k, X_test_k, y_train_k, y_test_k = train_test_split(X_k, y_k, test_size=0.2, random_state=42)

# 2. Train Random Forest Base Model
base_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
base_model.fit(X_train_k, y_train_k)

# 3. Check Accuracy (On Test Data)
preds_k = base_model.predict(X_test_k)
mae_k = mean_absolute_error(y_test_k, preds_k)
r2_k = r2_score(y_test_k, preds_k)

print(f"Base Model Accuracy (Lab Conditions):")
print(f"   > MAE: {mae_k:.2f} MPa (Average Error)")
print(f"   > R2 Score: {r2_k:.2f} (1.0 is perfect)")

# OPTIONAL: Retrain on ALL data for the final pipeline to get maximum performance
base_model.fit(X_k, y_k) 


# --- PART 3: CORRECTION MODEL (LOCAL) - TRAINING & EVALUATION ---

print("\n--- 2. Training Correction Model (Temperature) ---")

# 1. Create Base Predictions (Lab Standard) for Local Data
# NOTE: Using DataFrame to avoid the warning you saw earlier
input_base_local = df_local[['Strength_7']] # Use double brackets to keep it a DataFrame
df_local['Base_Pred_Lab'] = base_model.predict(input_base_local)

# 2. Calculate Residuals (Actual - Base)
df_local['Residual'] = df_local['Strength_28'] - df_local['Base_Pred_Lab']

# 3. Split Local Data for Evaluation
X_l = df_local[['avgTemp']]
y_l = df_local['Residual']

X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(X_l, y_l, test_size=0.2, random_state=42)

# 4. Train Correction Model
correction_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
correction_model.fit(X_train_l, y_train_l)

# 5. Evaluate the Correction
preds_l_resid = correction_model.predict(X_test_l)
mae_l = mean_absolute_error(y_test_l, preds_l_resid)

print(f"Correction Model Accuracy (Predicting the Error):")
print(f"   > MAE: {mae_l:.2f} MPa")


# --- PART 4: FINAL SYSTEM ACCURACY ---

print("\n--- 3. Final System Accuracy (Combined) ---")

# Let's see how well the Combined System predicts the Test Set
# Final = Base_Model(Strength) + Correction_Model(Temp)

# Get predictions for the Local Test Set
base_preds_test = base_model.predict(df_local.loc[X_test_l.index, ['Strength_7']])
corr_preds_test = correction_model.predict(X_test_l)

final_predictions = base_preds_test + corr_preds_test
actual_values = df_local.loc[X_test_l.index, 'Strength_28']

final_mae = mean_absolute_error(actual_values, final_predictions)
final_r2 = r2_score(actual_values, final_predictions)

print(f"FINAL COMBINED MODEL ACCURACY:")
print(f"   > Mean Absolute Error: {final_mae:.2f} MPa")
print(f"   > R2 Score: {final_r2:.2f}")

if final_r2 > 0.8:
    print("\n✅ Result: The model is performing very well!")
elif final_r2 > 0.5:
    print("\n⚠️ Result: The model is okay, but could be improved.")
else:
    print("\n❌ Result: The model is struggling. Check your data quality.")

--- 1. Training Base Model (Random Forest) ---


NameError: name 'kaggle_pairs' is not defined