In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Column names of the mix ingredients; these are the helper model's inputs.
recipe_cols = [
    'Cement',
    'Blast Furnace Slag',
    'Fly Ash',
    'Water',
    'Superplasticizer',
    'Coarse Aggregate',
    'Fine Aggregate',
]

# Read the raw table, then trim stray whitespace from the header names so
# the lookups by column name further down do not miss on padded headers.
df = pd.read_csv("Data/zainfaisal_pakistan_concrete_data.csv")
df.columns = df.columns.str.strip()

# --- STEP 1: TRAIN THE HELPER MODEL ---
# The helper is a random forest that maps the ingredient columns to the
# 'Strength' value recorded on rows whose 'Age' is 7.
print("--- Step 1: Training Helper Model (Ingredients -> 7-Day Strength) ---")

# Only rows with Age == 7 can serve as training targets for the helper.
df_7 = df[df['Age'] == 7].copy()

if df_7.empty:
    # Stop here: the following steps call helper_model.predict(), so carrying
    # on without a fitted model would only fail later with a NameError that
    # hides the real cause.
    raise ValueError(
        "CRITICAL ERROR: No 7-day data found in file to train the helper!"
    )

X_helper = df_7[recipe_cols]
y_helper = df_7['Strength']

# oob_score=True makes the forest keep out-of-bag predictions, i.e. each
# row is predicted only by trees whose bootstrap sample did not contain it.
helper_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    oob_score=True,
)
helper_model.fit(X_helper, y_helper)

# In-sample MAE: measured on the very rows the forest was fitted on, so it
# is an optimistic figure and not an estimate of error on unseen recipes.
helper_acc = mean_absolute_error(y_helper, helper_model.predict(X_helper))

# Out-of-bag MAE: a held-out style estimate that needs no separate split.
helper_oob_mae = mean_absolute_error(y_helper, helper_model.oob_prediction_)

print(f"Helper Model Trained on {len(df_7)} rows.")
print(f"Helper Accuracy (MAE): {helper_acc:.2f} MPa (Internal check)")
print(f"Helper Out-of-Bag MAE: {helper_oob_mae:.2f} MPa (Held-out estimate)")


# --- STEP 2: PREPARE THE TARGET DATA (28-DAY ROWS) ---
print("\n--- Step 2: Imputing Missing 7-Day Data ---")

# The 28-day rows are the ones the downstream model is meant to predict.
is_28_day = df['Age'] == 28
df_28 = df[is_28_day].copy()

# Plan: give EVERY 28-day row a synthetic 7-day strength from the helper
# model, and attach the measured 7-day strength next to it wherever a 7-day
# row with exactly the same ingredient values exists.

# A. Synthetic 7-day strength, predicted from the recipe alone.
synthetic_strength_7 = helper_model.predict(df_28[recipe_cols])
df_28['Predicted_Strength_7'] = synthetic_strength_7

# B. Measured 7-day strength per distinct recipe. Averaging collapses
#    repeated 7-day rows of one recipe into a single lookup row, so the
#    left merge below cannot multiply the 28-day rows.
df_7_grouped = (
    df_7.groupby(recipe_cols)['Strength']
    .mean()
    .reset_index(name='Real_Strength_7')
)

# Left merge keyed on the ingredient columns: every 28-day row is kept, and
# Real_Strength_7 is NaN where no 7-day row shares the recipe.
df_augmented = df_28.merge(df_7_grouped, how='left', on=recipe_cols)

# --- STEP 3: CREATE FINAL DATASET ---

# Rows with a measured 7-day value keep it; the rest fall back to the
# helper model's prediction.
real_strength_7 = df_augmented['Real_Strength_7']
has_real_match = real_strength_7.notna()

df_augmented['Strength_7'] = real_strength_7.fillna(
    df_augmented['Predicted_Strength_7']
)

# Label each row by where its 7-day value came from.
df_augmented['Type'] = np.where(has_real_match, 'Real Pair', 'Synthetic Pair')

# Keep only the (7-day, 28-day, origin) columns; on these rows 'Strength'
# is the 28-day measurement, so rename it accordingly.
final_dataset = df_augmented[['Strength_7', 'Strength', 'Type']].rename(
    columns={'Strength': 'Strength_28'}
)

real_pair_count = int(has_real_match.sum())
synthetic_pair_count = len(df_augmented) - real_pair_count

print("\n--- DATASET EXPANSION RESULTS ---")
print(f"Original Pairs Found: {real_pair_count}")
print(f"New Synthetic Pairs:  {synthetic_pair_count}")
print(f"TOTAL TRAINING DATA:  {len(final_dataset)} pairs")

# --- STEP 4: TRAIN YOUR MAIN MODEL ON THIS NEW DATA ---
# Next step (Strategy 1): fit the main model on 'final_dataset', with
#   X = final_dataset[['Strength_7']]
#   y = final_dataset['Strength_28']

--- Step 1: Training Helper Model (Ingredients -> 7-Day Strength) ---
Helper Model Trained on 126 rows.
Helper Accuracy (MAE): 1.71 MPa (Internal check)

--- Step 2: Imputing Missing 7-Day Data ---

--- DATASET EXPANSION RESULTS ---
Original Pairs Found: 123
New Synthetic Pairs:  302
TOTAL TRAINING DATA:  425 pairs
