In [None]:
#Step 1

import pandas as pd
import numpy as np

# Load the three competition files from the working directory: the labelled
# training set, the unlabelled test set and the submission template.
print("=== Reading Data ===")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("submission.csv")

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Submission shape: {submission.shape}")

print("\n=== Train Columns ===")
print(train.columns.tolist())

print("\n=== Basic Info ===")
print("Train info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
train.info()

# Summary statistics of the regression target.
print("\n=== Target Variables ===")
print("Precipitation (regression target):")
print(f"  Mean: {train['precipitation_mm'].mean():.4f}")
print(f"  Std: {train['precipitation_mm'].std():.4f}")
print(f"  Min: {train['precipitation_mm'].min():.4f}")
print(f"  Max: {train['precipitation_mm'].max():.4f}")

# Label counts of the classification target. The "Class imbalance" figure is
# the column mean, i.e. the positive share if the labels are 0/1
# (presumably they are; later cells cast this column to int).
print("\nElectricity Shutdown (classification target):")
print(train['electricity_shutdown'].value_counts())
print(f"  Class imbalance: {train['electricity_shutdown'].mean():.4f}")

# Parse the timestamp column (this replaces train['time'] with datetimes) so
# that min/max below are chronological rather than lexicographic.
print("\n=== Time Series Info ===")
train['time'] = pd.to_datetime(train['time'])
print(f"Time range: {train['time'].min()} to {train['time'].max()}")
# The label says "hours" but the value is the row count; one row per hour is
# an assumption about the data -- TODO confirm.
print(f"Total hours: {len(train)}")

# Show up to ten columns with missing values, most affected first.
print("\n=== Missing Values ===")
missing = train.isnull().sum()
print(missing[missing > 0].sort_values(ascending=False).head(10))

In [None]:
#Step 2
import numpy as np
from sklearn.metrics import f1_score, mean_squared_error

def custom_metric(y_true_reg, y_pred_reg, y_true_clf, y_pred_clf_proba, threshold=0.5):
    """
    Calculate the custom metric: RMSE * (1 - F1)

    All inputs may be lists, NumPy arrays or pandas objects. They are converted
    to plain arrays first, so values are always paired by position (two pandas
    Series with different indexes are NOT aligned by label).

    Args:
        y_true_reg: True precipitation values
        y_pred_reg: Predicted precipitation values
        y_true_clf: True electricity shutdown (0/1)
        y_pred_clf_proba: Predicted probabilities for electricity shutdown
        threshold: Threshold to convert probabilities to binary predictions
            (a sample is positive when its probability is >= threshold)

    Returns:
        tuple: (final_score, f1, rmse) where final_score = rmse * (1 - f1).
        Lower final_score is better.
    """
    # Strip any pandas index so the subtraction below is purely positional
    y_true_reg = np.asarray(y_true_reg, dtype=float)
    y_pred_reg = np.asarray(y_pred_reg, dtype=float)
    y_true_clf = np.asarray(y_true_clf)

    # Convert probabilities to binary predictions
    y_pred_bin = (np.asarray(y_pred_clf_proba) >= threshold).astype(int)

    # F1 score for classification; zero_division=0 yields 0 instead of a
    # warning when F1 is undefined
    f1 = f1_score(y_true_clf, y_pred_bin, zero_division=0)

    # RMSE for regression
    rmse = np.sqrt(np.mean((y_true_reg - y_pred_reg) ** 2))

    # Final metric: a perfect F1 drives the score to 0 regardless of RMSE
    final_score = rmse * (1 - f1)

    return final_score, f1, rmse

# Walk through the metric on a tiny hand-made example
print("=== Custom Metric Example ===")
print("Let's say we have these predictions:")

# Four toy samples: regression truth/prediction, then class label/probability
y_true_reg, y_pred_reg = np.array([1.0, 2.0, 0.5, 3.0]), np.array([1.1, 1.9, 0.6, 2.8])
y_true_clf, y_pred_proba = np.array([0, 1, 0, 1]), np.array([0.1, 0.8, 0.2, 0.9])

print(f"True precipitation: {y_true_reg}")
print(f"Predicted precipitation: {y_pred_reg}")
print(f"True electricity shutdown: {y_true_clf}")
print(f"Predicted probabilities: {y_pred_proba}")

# Score the same predictions under three different cut-offs
for threshold in (0.3, 0.5, 0.7):
    # Same binarisation rule custom_metric applies internally, shown for display
    binary_preds = (y_pred_proba >= threshold).astype(int)
    score, f1, rmse = custom_metric(
        y_true_reg, y_pred_reg, y_true_clf, y_pred_proba, threshold
    )
    print(f"\nThreshold {threshold}:")
    print(f"  Binary predictions: {binary_preds}")
    print(f"  F1 Score: {f1:.3f}")
    print(f"  RMSE: {rmse:.3f}")
    print(f"  Final Score (RMSE * (1-F1)): {score:.3f}")

# Take-aways, printed one per line
print("\n=== Key Insights ===")
for insight in (
    "1. LOWER final score is better",
    "2. We want to minimize RMSE (regression error)",
    "3. We want to maximize F1 (classification accuracy)",
    "4. Poor F1 score heavily penalizes the final score",
    "5. Threshold selection is crucial for imbalanced classification",
):
    print(insight)


In [None]:
#Step 3
import pandas as pd
import numpy as np

def add_time_features(df, time_col='time'):
    """Return a copy of ``df`` with calendar features derived from ``time_col``.

    Args:
        df: Input frame. It is not modified; all work happens on a copy.
        time_col: Name of the timestamp column. In the returned copy it is
            converted with ``pd.to_datetime``.

    Returns:
        pd.DataFrame: the copy with the added columns ``year``, ``month``,
        ``day``, ``hour``, ``dayofweek`` (Monday=0, Sunday=6), ``dayofyear``
        and ``is_weekend`` (1 for Saturday/Sunday, else 0).

    Raises:
        KeyError: if ``time_col`` is not a column of ``df``.
    """
    df = df.copy()
    df[time_col] = pd.to_datetime(df[time_col])
    stamps = df[time_col].dt

    # Extract time components
    df['year'] = stamps.year
    df['month'] = stamps.month
    df['day'] = stamps.day
    df['hour'] = stamps.hour
    df['dayofweek'] = stamps.dayofweek  # Monday=0, Sunday=6
    df['dayofyear'] = stamps.dayofyear
    df['is_weekend'] = (stamps.dayofweek >= 5).astype(int)

    # Columns already present in the input (for example an ``is_day`` flag)
    # are carried over by the copy, so they need no re-assignment here and the
    # function also works on frames that do not have them.
    return df

def create_lag_features(df, numeric_cols, lags=[1, 3, 6, 12, 24]):
    """Return a copy of ``df`` with row-shifted (lagged) versions of columns.

    For every column ``c`` in ``numeric_cols`` and every ``k`` in ``lags`` a
    column ``f'{c}_lag_{k}'`` holding ``df[c].shift(k)`` is appended, so the
    first ``k`` rows of that column are NaN. Lags are counted in rows; the
    caller is responsible for the rows being in the intended (time) order.

    All new columns are attached with a single ``pd.concat``. Inserting them
    one at a time fragments the frame and triggers pandas'
    ``PerformanceWarning`` once many columns are involved.

    Args:
        df: Input frame (not modified).
        numeric_cols: Names of the columns to lag.
        lags: Row offsets to shift by. The default list is never mutated.

    Returns:
        pd.DataFrame: new frame with the original columns followed by the lag
        columns. An input column that already carries a lag column's name is
        replaced by the freshly computed one (at the end of the frame).
    """
    lagged = [
        df[col].shift(lag).rename(f'{col}_lag_{lag}')
        for col in numeric_cols
        for lag in lags
    ]
    new_names = {series.name for series in lagged}

    # drop() returns a new frame, so the caller's df is left untouched
    base = df.drop(columns=[name for name in df.columns if name in new_names])
    return pd.concat([base, *lagged], axis=1)

def create_rolling_features(df, numeric_cols, windows=[3, 6, 12, 24]):
    """Return a copy of ``df`` with trailing rolling mean/std columns.

    For every column ``c`` in ``numeric_cols`` and every ``w`` in ``windows``
    two columns are appended: ``f'{c}_rolling_mean_{w}'`` and
    ``f'{c}_rolling_std_{w}'``. Both are computed over the current row and the
    ``w - 1`` rows before it, with ``min_periods=1`` so shorter windows at the
    start are still evaluated. The rolling std uses pandas' default sample
    standard deviation, so it is NaN wherever the window holds one value.

    All new columns are attached with a single ``pd.concat`` to avoid the
    fragmentation (and ``PerformanceWarning``) caused by inserting many
    columns one by one.

    Args:
        df: Input frame (not modified).
        numeric_cols: Names of the columns to aggregate.
        windows: Window lengths in rows. The default list is never mutated.

    Returns:
        pd.DataFrame: new frame with the original columns followed by the
        rolling columns. An input column that already carries one of the new
        names is replaced by the freshly computed one (at the end of the frame).
    """
    rolled = []
    for col in numeric_cols:
        for window in windows:
            # Build the rolling object once and reuse it for both statistics
            roller = df[col].rolling(window=window, min_periods=1)
            rolled.append(roller.mean().rename(f'{col}_rolling_mean_{window}'))
            rolled.append(roller.std().rename(f'{col}_rolling_std_{window}'))
    new_names = {series.name for series in rolled}

    # drop() returns a new frame, so the caller's df is left untouched
    base = df.drop(columns=[name for name in df.columns if name in new_names])
    return pd.concat([base, *rolled], axis=1)

# Load data
# Re-read the raw CSVs so this cell starts from unmodified frames.
print("=== Loading Data ===")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print(f"Train shape before: {train.shape}")
print(f"Test shape before: {test.shape}")

# Add time features
# add_time_features returns a new frame, so the names are rebound here.
print("\n=== Adding Time Features ===")
train = add_time_features(train)
test = add_time_features(test)

print("New time-based columns added:")
# Names of the columns add_time_features creates; also used below to keep
# them out of the lag/rolling inputs.
time_cols = ['year', 'month', 'day', 'hour', 'dayofweek', 'dayofyear', 'is_weekend']
print(time_cols)

# Identify numeric columns for lag/rolling features
numeric_cols = train.select_dtypes(include=[np.number]).columns.tolist()
# Remove target columns and time-based columns
exclude_cols = ['ID', 'precipitation_mm', 'electricity_shutdown'] + time_cols
feature_cols = [col for col in numeric_cols if col not in exclude_cols]

print(f"\nNumeric feature columns for lag/rolling: {len(feature_cols)}")
print(feature_cols[:10], "...")  # Show first 10

# Create lag features
# Lags are row shifts computed separately per frame, so the first rows of
# both train and test contain NaN in the lag columns.
# NOTE(review): shift/rolling follow row order -- assumes both files are
# already sorted chronologically; TODO confirm.
print("\n=== Creating Lag Features ===")
train = create_lag_features(train, feature_cols, lags=[1, 3, 6])
test = create_lag_features(test, feature_cols, lags=[1, 3, 6])

# Create rolling features
# feature_cols was fixed before the lag step, so the lag columns themselves
# are not rolled.
print("\n=== Creating Rolling Features ===")
train = create_rolling_features(train, feature_cols, windows=[3, 6])
test = create_rolling_features(test, feature_cols, windows=[3, 6])

print(f"Train shape after feature engineering: {train.shape}")
print(f"Test shape after feature engineering: {test.shape}")

# Show some examples of new features
# Requires a 'temp_2m_C' column in train; raises KeyError otherwise.
print("\n=== Example of New Features ===")
example_cols = ['temp_2m_C', 'temp_2m_C_lag_1', 'temp_2m_C_rolling_mean_3']
print(train[example_cols].head())

# Static recap text; the counts quoted here are not computed from the data.
print("\n=== Feature Engineering Summary ===")
print("1. Time features: year, month, day, hour, dayofweek, dayofyear, is_weekend")
print("2. Lag features: 1, 3, 6 hour lags for all numeric features")
print("3. Rolling features: 3, 6 hour rolling mean and std for all numeric features")
print("4. Total features created: ~200+ new features")


In [None]:
#Step 4
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, mean_squared_error

def custom_metric(y_true_reg, y_pred_reg, y_true_clf, y_pred_clf_proba, threshold=0.5):
    """Calculate custom metric: RMSE * (1 - F1)

    Inputs may be lists, NumPy arrays or pandas objects; they are converted to
    plain arrays first, so values are paired by position and never aligned by
    a pandas index.

    Args:
        y_true_reg: True regression targets.
        y_pred_reg: Predicted regression targets.
        y_true_clf: True binary labels (0/1).
        y_pred_clf_proba: Scores for the positive class.
        threshold: A sample is predicted positive when its score >= threshold.

    Returns:
        tuple: (rmse * (1 - f1), f1, rmse). Lower first element is better.
    """
    # Strip any pandas index so the subtraction below is purely positional
    y_true_reg = np.asarray(y_true_reg, dtype=float)
    y_pred_reg = np.asarray(y_pred_reg, dtype=float)
    y_true_clf = np.asarray(y_true_clf)
    y_pred_bin = (np.asarray(y_pred_clf_proba) >= threshold).astype(int)
    # zero_division=0 yields F1 = 0 instead of a warning when F1 is undefined
    f1 = f1_score(y_true_clf, y_pred_bin, zero_division=0)
    rmse = np.sqrt(np.mean((y_true_reg - y_pred_reg) ** 2))
    return rmse * (1 - f1), f1, rmse

def find_best_threshold(y_true, y_proba):
    """Pick the cut-off in [0, 1] (step 0.01) that maximises F1.

    Returns a ``(threshold, f1)`` pair. When several cut-offs reach the same
    best F1 the lowest one is returned.
    """
    grid = np.linspace(0.0, 1.0, 101)
    # One F1 value per candidate cut-off, in ascending cut-off order
    scores = [
        f1_score(y_true, (y_proba >= cut).astype(int), zero_division=0)
        for cut in grid
    ]
    # argmax returns the first maximum, i.e. the lowest winning cut-off
    top = int(np.argmax(scores))
    return grid[top], scores[top]

# Load and prepare data (simplified version)
# Baseline: raw columns plus three calendar features, no lag/rolling features.
print("=== Loading and Preparing Data ===")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Convert time to datetime
train['time'] = pd.to_datetime(train['time'])
test['time'] = pd.to_datetime(test['time'])

# Add basic time features
# The loop variable refers to the same objects as train/test, so the column
# assignments modify both frames in place.
for df in [train, test]:
    df['hour'] = df['time'].dt.hour
    df['dayofweek'] = df['time'].dt.dayofweek
    df['month'] = df['time'].dt.month

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Prepare features and targets
# Every train column except the identifier, the raw timestamp and the two
# targets is used as a feature.
feature_cols = [col for col in train.columns if col not in ['ID', 'time', 'precipitation_mm', 'electricity_shutdown']]
print(f"Number of features: {len(feature_cols)}")

X_train = train[feature_cols]
y_train_reg = train['precipitation_mm']
y_train_clf = train['electricity_shutdown'].astype(int)
X_test = test[feature_cols]

print(f"X_train shape: {X_train.shape}")
print(f"y_train_reg shape: {y_train_reg.shape}")
print(f"y_train_clf shape: {y_train_clf.shape}")

# Split train data for validation
# shuffle=False keeps row order: the first 80% of rows train, the last 20%
# validate.
X_train_split, X_val, y_reg_split, y_reg_val, y_clf_split, y_clf_val = train_test_split(
    X_train, y_train_reg, y_train_clf, test_size=0.2, random_state=42, shuffle=False
)

print(f"Training split: {X_train_split.shape}")
print(f"Validation split: {X_val.shape}")

# Create preprocessing pipeline
# Median imputation followed by standardisation, fitted inside full_pipeline.
print("\n=== Creating Preprocessing Pipeline ===")
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Create multi-output model
# MultiOutputRegressor fits one copy of base_model per target column, so the
# 0/1 shutdown label is modelled as a regression target as well.
print("\n=== Creating Multi-Output Model ===")
base_model = HistGradientBoostingRegressor(
    max_iter=100,
    learning_rate=0.1,
    random_state=42
)

model = MultiOutputRegressor(base_model)

# Create full pipeline
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Train the model
print("\n=== Training Model ===")
# Combine targets for multi-output
# Column 0 = precipitation, column 1 = shutdown label.
y_train_combined = np.column_stack([y_reg_split, y_clf_split])
full_pipeline.fit(X_train_split, y_train_combined)

# Make predictions on validation set
print("\n=== Making Predictions ===")
val_preds = full_pipeline.predict(X_val)
val_reg_pred = val_preds[:, 0]
# The regressor output for the label is unbounded, so it is clipped before
# being used as a probability-like score.
val_clf_proba = np.clip(val_preds[:, 1], 0.0, 1.0)  # Clip to [0, 1]

# Find best threshold
best_threshold, best_f1 = find_best_threshold(y_clf_val, val_clf_proba)
print(f"Best threshold: {best_threshold:.3f}")
print(f"Best F1 score: {best_f1:.3f}")

# Calculate custom metric
# NOTE(review): the threshold was selected on this same validation split, so
# the F1 and score reported here are optimistic.
final_score, f1, rmse = custom_metric(y_reg_val, val_reg_pred, y_clf_val, val_clf_proba, best_threshold)
print(f"\n=== Validation Results ===")
print(f"RMSE: {rmse:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Custom Metric (RMSE * (1-F1)): {final_score:.4f}")

# Make predictions on test set
# Uses the model fitted on the 80% split; it is not refitted on all rows.
print("\n=== Making Test Predictions ===")
test_preds = full_pipeline.predict(X_test)
test_reg_pred = test_preds[:, 0]
test_clf_proba = np.clip(test_preds[:, 1], 0.0, 1.0)
test_clf_pred = (test_clf_proba >= best_threshold).astype(int)

print(f"Test predictions shape: {test_preds.shape}")
print(f"Precipitation range: {test_reg_pred.min():.3f} to {test_reg_pred.max():.3f}")
print(f"Electricity shutdown predictions: {np.bincount(test_clf_pred)}")

# Static recap of the steps above.
print("\n=== Summary ===")
print("1. Created preprocessing pipeline with imputation and scaling")
print("2. Used HistGradientBoostingRegressor with MultiOutputRegressor")
print("3. Trained on 80% of data, validated on 20%")
print("4. Found optimal threshold for binary classification")
print("5. Calculated custom metric on validation set")
print("6. Made predictions on test set")


In [None]:
#Step 5
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

def custom_metric(y_true_reg, y_pred_reg, y_true_clf, y_pred_clf_proba, threshold=0.5):
    """Calculate custom metric: RMSE * (1 - F1)

    Inputs may be lists, NumPy arrays or pandas objects; they are converted to
    plain arrays first, so values are paired by position and never aligned by
    a pandas index.

    Args:
        y_true_reg: True regression targets.
        y_pred_reg: Predicted regression targets.
        y_true_clf: True binary labels (0/1).
        y_pred_clf_proba: Scores for the positive class.
        threshold: A sample is predicted positive when its score >= threshold.

    Returns:
        tuple: (rmse * (1 - f1), f1, rmse). Lower first element is better.
    """
    # Strip any pandas index so the subtraction below is purely positional
    y_true_reg = np.asarray(y_true_reg, dtype=float)
    y_pred_reg = np.asarray(y_pred_reg, dtype=float)
    y_true_clf = np.asarray(y_true_clf)
    y_pred_bin = (np.asarray(y_pred_clf_proba) >= threshold).astype(int)
    # zero_division=0 yields F1 = 0 instead of a warning when F1 is undefined
    f1 = f1_score(y_true_clf, y_pred_bin, zero_division=0)
    rmse = np.sqrt(np.mean((y_true_reg - y_pred_reg) ** 2))
    return rmse * (1 - f1), f1, rmse

def find_best_threshold(y_true, y_proba):
    """Pick the cut-off in [0, 1] (step 0.01) that maximises F1.

    Returns a ``(threshold, f1)`` pair. When several cut-offs reach the same
    best F1 the lowest one is returned.
    """
    grid = np.linspace(0.0, 1.0, 101)
    # One F1 value per candidate cut-off, in ascending cut-off order
    scores = [
        f1_score(y_true, (y_proba >= cut).astype(int), zero_division=0)
        for cut in grid
    ]
    # argmax returns the first maximum, i.e. the lowest winning cut-off
    top = int(np.argmax(scores))
    return grid[top], scores[top]

def add_time_features(df):
    """Derive calendar, cyclical and flag features from the ``time`` column.

    Works on a copy of ``df``. The copy's ``time`` column is parsed with
    ``pd.to_datetime`` and these columns are added, in this order: hour,
    dayofweek, month, day, year, a sin/cos pair for hour (period 24),
    dayofweek (period 7) and month (period 12), is_weekend (dayofweek >= 5)
    and is_night (hour >= 22 or hour <= 6).
    """
    out = df.copy()
    out['time'] = pd.to_datetime(out['time'])
    stamp = out['time'].dt

    # Plain calendar components, read straight off the datetime accessor
    for part in ('hour', 'dayofweek', 'month', 'day', 'year'):
        out[part] = getattr(stamp, part)

    # Map each periodic component onto the unit circle so that the end of a
    # cycle sits next to its start
    for part, period in (('hour', 24), ('dayofweek', 7), ('month', 12)):
        angle = 2 * np.pi * out[part] / period
        out[f'{part}_sin'] = np.sin(angle)
        out[f'{part}_cos'] = np.cos(angle)

    # 0/1 indicator flags
    out['is_weekend'] = (out['dayofweek'] >= 5).astype(int)
    late_evening = out['hour'] >= 22
    early_morning = out['hour'] <= 6
    out['is_night'] = (late_evening | early_morning).astype(int)

    return out

def create_lag_features(df, cols, lags=[1, 3, 6]):
    """Append ``<col>_lag_<k>`` columns holding each column shifted down k rows.

    Operates on a copy of ``df``; the first k rows of every lag column are NaN.
    """
    out = df.copy()
    # Every (column, offset) combination, column-major like a nested loop
    pairs = [(name, step) for name in cols for step in lags]
    for name, step in pairs:
        out[f'{name}_lag_{step}'] = out[name].shift(step)
    return out

# Load and prepare data
print("=== Loading and Preparing Data ===")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Add time features
train = add_time_features(train)
test = add_time_features(test)

# Create lag features for important weather variables
# (lags are computed per frame, so the first rows of each frame hold NaN)
important_cols = ['temp_2m_C', 'rel_humidity_2m_pct', 'pressure_msl_hPa', 'wind_speed_10m_kph']
train = create_lag_features(train, important_cols)
test = create_lag_features(test, important_cols)

print(f"Train shape after feature engineering: {train.shape}")
print(f"Test shape after feature engineering: {test.shape}")

# Prepare features and targets: every train column except the identifier, the
# raw timestamp and the two targets
feature_cols = [col for col in train.columns if col not in ['ID', 'time', 'precipitation_mm', 'electricity_shutdown']]
print(f"Number of features: {len(feature_cols)}")

X_train = train[feature_cols]
y_train_reg = train['precipitation_mm']
y_train_clf = train['electricity_shutdown'].astype(int)
X_test = test[feature_cols]

# Time-aware train/validation split: first 80% of rows train, last 20% validate
print("\n=== Time-Aware Train/Validation Split ===")
split_idx = int(0.8 * len(X_train))
X_train_split = X_train.iloc[:split_idx]
X_val = X_train.iloc[split_idx:]
y_reg_split = y_train_reg.iloc[:split_idx]
y_reg_val = y_train_reg.iloc[split_idx:]
y_clf_split = y_train_clf.iloc[:split_idx]
y_clf_val = y_train_clf.iloc[split_idx:]

print(f"Training split: {X_train_split.shape}")
print(f"Validation split: {X_val.shape}")

# Create preprocessing pipeline
# Pipeline does not copy its steps and fits them in place, so one shared
# preprocessor object would be refitted by every later .fit() call, leaving
# earlier pipelines with a scaler that no longer matches their model. This
# object is therefore only a template: each pipeline below receives its own
# unfitted clone.
from sklearn.base import clone

print("\n=== Creating Preprocessing Pipeline ===")
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Try different hyperparameters
print("\n=== Hyperparameter Tuning ===")
best_score = float('inf')
best_params = None
best_threshold = 0.5
best_pipeline = None

param_combinations = [
    {'max_iter': 100, 'learning_rate': 0.1, 'max_depth': None},
    {'max_iter': 200, 'learning_rate': 0.05, 'max_depth': None},
    {'max_iter': 300, 'learning_rate': 0.03, 'max_depth': None},
    {'max_iter': 200, 'learning_rate': 0.1, 'max_depth': 8},
    {'max_iter': 300, 'learning_rate': 0.05, 'max_depth': 10},
]

# Tuning targets are identical for every candidate, so stack them once:
# column 0 = precipitation, column 1 = shutdown label
y_train_combined = np.column_stack([y_reg_split, y_clf_split])

for i, params in enumerate(param_combinations):
    print(f"Trying combination {i+1}/{len(param_combinations)}: {params}")

    # Create model with current parameters; MultiOutputRegressor fits one
    # regressor per target column
    base_model = HistGradientBoostingRegressor(
        random_state=42,
        **params
    )

    model = MultiOutputRegressor(base_model)

    # Create pipeline with its own preprocessing copy
    pipeline = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('model', model)
    ])

    # Train and predict
    pipeline.fit(X_train_split, y_train_combined)

    val_preds = pipeline.predict(X_val)
    val_reg_pred = val_preds[:, 0]
    # Regressor output for the label is unbounded; clip it to a [0, 1] score
    val_clf_proba = np.clip(val_preds[:, 1], 0.0, 1.0)

    # Find best threshold
    threshold, f1 = find_best_threshold(y_clf_val, val_clf_proba)

    # Calculate custom metric
    score, f1, rmse = custom_metric(y_reg_val, val_reg_pred, y_clf_val, val_clf_proba, threshold)

    print(f"  RMSE: {rmse:.4f}, F1: {f1:.4f}, Score: {score:.4f}, Threshold: {threshold:.3f}")

    if score < best_score:
        best_score = score
        best_params = params
        best_threshold = threshold
        best_pipeline = pipeline

# A NaN/inf score never satisfies `score < best_score`; fail with a clear
# message instead of a TypeError on `**best_params` below
if best_params is None:
    raise RuntimeError("No hyperparameter combination produced a valid score")

print(f"\nBest parameters: {best_params}")
print(f"Best threshold: {best_threshold:.3f}")
print(f"Best score: {best_score:.4f}")

# Train final model on full training data (best hyperparameters, fresh
# preprocessing copy so best_pipeline stays intact)
print("\n=== Training Final Model on Full Data ===")
final_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', MultiOutputRegressor(HistGradientBoostingRegressor(
        random_state=42,
        **best_params
    )))
])

y_train_combined = np.column_stack([y_train_reg, y_train_clf])
final_pipeline.fit(X_train, y_train_combined)

# Make predictions on test set; the threshold tuned on the validation split
# is reused for the refitted model
print("\n=== Making Test Predictions ===")
test_preds = final_pipeline.predict(X_test)
test_reg_pred = test_preds[:, 0]
test_clf_proba = np.clip(test_preds[:, 1], 0.0, 1.0)
test_clf_pred = (test_clf_proba >= best_threshold).astype(int)

print(f"Test predictions shape: {test_preds.shape}")
print(f"Precipitation range: {test_reg_pred.min():.3f} to {test_reg_pred.max():.3f}")
print(f"Electricity shutdown predictions: {np.bincount(test_clf_pred)}")

# Save predictions
# NOTE(review): assumes submission.csv rows are in the same order as
# test.csv -- TODO confirm.
print("\n=== Saving Predictions ===")
submission = pd.read_csv("submission.csv")
submission['precipitation_mm'] = test_reg_pred
submission['electricity_shutdown'] = test_clf_pred
submission.to_csv("my_submission.csv", index=False)
print("Saved predictions to my_submission.csv")

print("\n=== Summary ===")
print("1. Added comprehensive time features including cyclical encoding")
print("2. Created lag features for important weather variables")
print("3. Used time-aware train/validation split")
print("4. Performed hyperparameter tuning with custom metric")
print("5. Found optimal threshold for binary classification")
print("6. Trained final model on full data")
print("7. Generated and saved predictions")


In [None]:
#Step 6
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

def custom_metric(y_true_reg, y_pred_reg, y_true_clf, y_pred_clf_proba, threshold=0.5):
    """Calculate custom metric: RMSE * (1 - F1)

    Inputs may be lists, NumPy arrays or pandas objects; they are converted to
    plain arrays first, so values are paired by position and never aligned by
    a pandas index.

    Args:
        y_true_reg: True regression targets.
        y_pred_reg: Predicted regression targets.
        y_true_clf: True binary labels (0/1).
        y_pred_clf_proba: Scores for the positive class.
        threshold: A sample is predicted positive when its score >= threshold.

    Returns:
        tuple: (rmse * (1 - f1), f1, rmse). Lower first element is better.
    """
    # Strip any pandas index so the subtraction below is purely positional
    y_true_reg = np.asarray(y_true_reg, dtype=float)
    y_pred_reg = np.asarray(y_pred_reg, dtype=float)
    y_true_clf = np.asarray(y_true_clf)
    y_pred_bin = (np.asarray(y_pred_clf_proba) >= threshold).astype(int)
    # zero_division=0 yields F1 = 0 instead of a warning when F1 is undefined
    f1 = f1_score(y_true_clf, y_pred_bin, zero_division=0)
    rmse = np.sqrt(np.mean((y_true_reg - y_pred_reg) ** 2))
    return rmse * (1 - f1), f1, rmse

def find_best_threshold(y_true, y_proba):
    """Pick the cut-off in [0, 1] (step 0.01) that maximises F1.

    Returns a ``(threshold, f1)`` pair. When several cut-offs reach the same
    best F1 the lowest one is returned.
    """
    grid = np.linspace(0.0, 1.0, 101)
    # One F1 value per candidate cut-off, in ascending cut-off order
    scores = [
        f1_score(y_true, (y_proba >= cut).astype(int), zero_division=0)
        for cut in grid
    ]
    # argmax returns the first maximum, i.e. the lowest winning cut-off
    top = int(np.argmax(scores))
    return grid[top], scores[top]

def add_time_features(df):
    """Derive calendar, cyclical and flag features from the ``time`` column.

    Works on a copy of ``df``. The copy's ``time`` column is parsed with
    ``pd.to_datetime`` and these columns are added, in this order: hour,
    dayofweek, month, day, year, a sin/cos pair for hour (period 24),
    dayofweek (period 7) and month (period 12), is_weekend (dayofweek >= 5)
    and is_night (hour >= 22 or hour <= 6).
    """
    out = df.copy()
    out['time'] = pd.to_datetime(out['time'])
    stamp = out['time'].dt

    # Plain calendar components, read straight off the datetime accessor
    for part in ('hour', 'dayofweek', 'month', 'day', 'year'):
        out[part] = getattr(stamp, part)

    # Map each periodic component onto the unit circle so that the end of a
    # cycle sits next to its start
    for part, period in (('hour', 24), ('dayofweek', 7), ('month', 12)):
        angle = 2 * np.pi * out[part] / period
        out[f'{part}_sin'] = np.sin(angle)
        out[f'{part}_cos'] = np.cos(angle)

    # 0/1 indicator flags
    out['is_weekend'] = (out['dayofweek'] >= 5).astype(int)
    late_evening = out['hour'] >= 22
    early_morning = out['hour'] <= 6
    out['is_night'] = (late_evening | early_morning).astype(int)

    return out

def create_lag_features(df, cols, lags=[1, 3, 6]):
    """Append ``<col>_lag_<k>`` columns holding each column shifted down k rows.

    Operates on a copy of ``df``; the first k rows of every lag column are NaN.
    """
    out = df.copy()
    # Every (column, offset) combination, column-major like a nested loop
    pairs = [(name, step) for name in cols for step in lags]
    for name, step in pairs:
        out[f'{name}_lag_{step}'] = out[name].shift(step)
    return out

def create_rolling_features(df, cols, windows=[3, 6]):
    """Append trailing rolling mean and std columns for each column/window.

    Operates on a copy of ``df``. Windows are counted in rows and evaluated
    with ``min_periods=1``, so the leading rows use however many values exist.
    """
    out = df.copy()
    for name in cols:
        for span in windows:
            # One rolling object serves both statistics
            roller = out[name].rolling(window=span, min_periods=1)
            out[f'{name}_rolling_mean_{span}'] = roller.mean()
            out[f'{name}_rolling_std_{span}'] = roller.std()
    return out

# Load and prepare data
print("=== Loading and Preparing Data ===")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Add time features
train = add_time_features(train)
test = add_time_features(test)

# Create lag and rolling features for important weather variables
# (computed per frame, so the first rows of each frame hold NaN lags)
important_cols = ['temp_2m_C', 'rel_humidity_2m_pct', 'pressure_msl_hPa', 'wind_speed_10m_kph']
train = create_lag_features(train, important_cols)
test = create_lag_features(test, important_cols)

train = create_rolling_features(train, important_cols)
test = create_rolling_features(test, important_cols)

# Add lag features for precipitation only in train set.
# These columns have no counterpart in test, so the common-column filter
# below removes them again; they never reach the model.
train = create_lag_features(train, ['precipitation_mm'])
train = create_rolling_features(train, ['precipitation_mm'])

print(f"Train shape after feature engineering: {train.shape}")
print(f"Test shape after feature engineering: {test.shape}")

# Prepare features and targets
# Only use features that exist in both train and test sets
train_cols = set(train.columns)
test_cols = set(test.columns)
common_cols = train_cols.intersection(test_cols)
exclude_cols = {'ID', 'time', 'precipitation_mm', 'electricity_shutdown'}
# Walk train.columns (an ordered index) rather than the set: set iteration
# order for strings changes between interpreter runs, which would shuffle the
# feature order and make results non-reproducible despite random_state.
feature_cols = [col for col in train.columns if col in common_cols and col not in exclude_cols]
print(f"Number of features: {len(feature_cols)}")

X_train = train[feature_cols]
y_train_reg = train['precipitation_mm']
y_train_clf = train['electricity_shutdown'].astype(int)
X_test = test[feature_cols]

# Time-aware train/validation split: first 80% of rows train, last 20% validate
print("\n=== Time-Aware Train/Validation Split ===")
split_idx = int(0.8 * len(X_train))
X_train_split = X_train.iloc[:split_idx]
X_val = X_train.iloc[split_idx:]
y_reg_split = y_train_reg.iloc[:split_idx]
y_reg_val = y_train_reg.iloc[split_idx:]
y_clf_split = y_train_clf.iloc[:split_idx]
y_clf_val = y_train_clf.iloc[split_idx:]

print(f"Training split: {X_train_split.shape}")
print(f"Validation split: {X_val.shape}")
print(f"Class distribution in training: {np.bincount(y_clf_split)}")
print(f"Class distribution in validation: {np.bincount(y_clf_val)}")

# Create preprocessing pipeline
# Pipeline does not copy its steps and fits them in place, so one shared
# preprocessor object would be refitted by every later .fit() call, leaving
# earlier pipelines with a scaler that no longer matches their model. This
# object is therefore only a template: each pipeline below receives its own
# unfitted clone.
from sklearn.base import clone

print("\n=== Creating Preprocessing Pipeline ===")
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Try different hyperparameters with focus on class imbalance
print("\n=== Hyperparameter Tuning ===")
best_score = float('inf')
best_params = None
best_threshold = 0.5
best_pipeline = None

param_combinations = [
    {'max_iter': 500, 'learning_rate': 0.05, 'max_depth': None, 'min_samples_leaf': 20},
    {'max_iter': 300, 'learning_rate': 0.1, 'max_depth': 8, 'min_samples_leaf': 10},
    {'max_iter': 400, 'learning_rate': 0.03, 'max_depth': None, 'min_samples_leaf': 30},
    {'max_iter': 600, 'learning_rate': 0.02, 'max_depth': 10, 'min_samples_leaf': 25},
    {'max_iter': 200, 'learning_rate': 0.15, 'max_depth': 6, 'min_samples_leaf': 15},
]

# Tuning targets are identical for every candidate, so stack them once:
# column 0 = precipitation, column 1 = shutdown label
y_train_combined = np.column_stack([y_reg_split, y_clf_split])

for i, params in enumerate(param_combinations):
    print(f"Trying combination {i+1}/{len(param_combinations)}: {params}")

    # Create model with current parameters; MultiOutputRegressor fits one
    # regressor per target column
    base_model = HistGradientBoostingRegressor(
        random_state=42,
        **params
    )

    model = MultiOutputRegressor(base_model)

    # Create pipeline with its own preprocessing copy
    pipeline = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('model', model)
    ])

    # Train and predict
    pipeline.fit(X_train_split, y_train_combined)

    val_preds = pipeline.predict(X_val)
    val_reg_pred = val_preds[:, 0]
    # Regressor output for the label is unbounded; clip it to a [0, 1] score
    val_clf_proba = np.clip(val_preds[:, 1], 0.0, 1.0)

    # Find best threshold
    threshold, f1 = find_best_threshold(y_clf_val, val_clf_proba)

    # Calculate custom metric
    score, f1, rmse = custom_metric(y_reg_val, val_reg_pred, y_clf_val, val_clf_proba, threshold)

    print(f"  RMSE: {rmse:.4f}, F1: {f1:.4f}, Score: {score:.4f}, Threshold: {threshold:.3f}")

    if score < best_score:
        best_score = score
        best_params = params
        best_threshold = threshold
        best_pipeline = pipeline

# A NaN/inf score never satisfies `score < best_score`; fail with a clear
# message instead of a TypeError on `**best_params` below
if best_params is None:
    raise RuntimeError("No hyperparameter combination produced a valid score")

print(f"\nBest parameters: {best_params}")
print(f"Best threshold: {best_threshold:.3f}")
print(f"Best score: {best_score:.4f}")

# Train final model on full training data (best hyperparameters, fresh
# preprocessing copy so best_pipeline stays intact)
print("\n=== Training Final Model on Full Data ===")
final_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', MultiOutputRegressor(HistGradientBoostingRegressor(
        random_state=42,
        **best_params
    )))
])

y_train_combined = np.column_stack([y_train_reg, y_train_clf])
final_pipeline.fit(X_train, y_train_combined)

# Make predictions on test set; the threshold tuned on the validation split
# is reused for the refitted model
print("\n=== Making Test Predictions ===")
test_preds = final_pipeline.predict(X_test)
test_reg_pred = test_preds[:, 0]
test_clf_proba = np.clip(test_preds[:, 1], 0.0, 1.0)
test_clf_pred = (test_clf_proba >= best_threshold).astype(int)

print(f"Test predictions shape: {test_preds.shape}")
print(f"Precipitation range: {test_reg_pred.min():.3f} to {test_reg_pred.max():.3f}")
print(f"Electricity shutdown predictions: {np.bincount(test_clf_pred)}")
print(f"Electricity shutdown probabilities range: {test_clf_proba.min():.3f} to {test_clf_proba.max():.3f}")

# Save predictions
# NOTE(review): assumes submission.csv rows are in the same order as
# test.csv -- TODO confirm.
print("\n=== Saving Predictions ===")
submission = pd.read_csv("submission.csv")
submission['precipitation_mm'] = test_reg_pred
submission['electricity_shutdown'] = test_clf_pred
submission.to_csv("my_final_submission.csv", index=False)
print("Saved predictions to my_final_submission.csv")

# Show some statistics: predicted positive rate next to the training rate
print("\n=== Prediction Statistics ===")
print(f"Number of predicted electricity shutdowns: {test_clf_pred.sum()}")
print(f"Percentage of predicted shutdowns: {test_clf_pred.mean()*100:.2f}%")
print(f"Original training shutdown percentage: {y_train_clf.mean()*100:.2f}%")

print("\n=== Summary ===")
print("1. Added comprehensive time features including cyclical encoding")
print("2. Created lag and rolling features for important weather variables")
print("3. Used time-aware train/validation split")
print("4. Performed hyperparameter tuning with focus on class imbalance")
print("5. Found optimal threshold for binary classification")
print("6. Trained final model on full data")
print("7. Generated and saved predictions")
print("8. Handled class imbalance through threshold optimization")


In [None]:
#Step 7
"""
Complete Multi-Task Learning Pipeline for Kaggle Competition

This script demonstrates the complete pipeline for predicting:
1. Precipitation (regression)
2. Electricity shutdown (binary classification)

Custom metric: RMSE * (1 - F1)
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

def custom_metric(y_true_reg, y_pred_reg, y_true_clf, y_pred_clf_proba, threshold=0.5):
    """Calculate custom metric: RMSE * (1 - F1)

    Inputs may be lists, NumPy arrays or pandas objects; they are converted to
    plain arrays first, so values are paired by position and never aligned by
    a pandas index.

    Args:
        y_true_reg: True regression targets.
        y_pred_reg: Predicted regression targets.
        y_true_clf: True binary labels (0/1).
        y_pred_clf_proba: Scores for the positive class.
        threshold: A sample is predicted positive when its score >= threshold.

    Returns:
        tuple: (rmse * (1 - f1), f1, rmse). Lower first element is better.
    """
    # Strip any pandas index so the subtraction below is purely positional
    y_true_reg = np.asarray(y_true_reg, dtype=float)
    y_pred_reg = np.asarray(y_pred_reg, dtype=float)
    y_true_clf = np.asarray(y_true_clf)
    y_pred_bin = (np.asarray(y_pred_clf_proba) >= threshold).astype(int)
    # zero_division=0 yields F1 = 0 instead of a warning when F1 is undefined
    f1 = f1_score(y_true_clf, y_pred_bin, zero_division=0)
    rmse = np.sqrt(np.mean((y_true_reg - y_pred_reg) ** 2))
    return rmse * (1 - f1), f1, rmse

def find_best_threshold(y_true, y_proba):
    """Pick the cut-off in [0, 1] (step 0.01) that maximises F1.

    Returns a ``(threshold, f1)`` pair. When several cut-offs reach the same
    best F1 the lowest one is returned.
    """
    grid = np.linspace(0.0, 1.0, 101)
    # One F1 value per candidate cut-off, in ascending cut-off order
    scores = [
        f1_score(y_true, (y_proba >= cut).astype(int), zero_division=0)
        for cut in grid
    ]
    # argmax returns the first maximum, i.e. the lowest winning cut-off
    top = int(np.argmax(scores))
    return grid[top], scores[top]

def add_time_features(df):
    """Derive calendar, cyclical and flag features from the ``time`` column.

    Works on a copy of ``df``. The copy's ``time`` column is parsed with
    ``pd.to_datetime`` and these columns are added, in this order: hour,
    dayofweek, month, day, year, a sin/cos pair for hour (period 24),
    dayofweek (period 7) and month (period 12), is_weekend (dayofweek >= 5)
    and is_night (hour >= 22 or hour <= 6).
    """
    out = df.copy()
    out['time'] = pd.to_datetime(out['time'])
    stamp = out['time'].dt

    # Plain calendar components, read straight off the datetime accessor
    for part in ('hour', 'dayofweek', 'month', 'day', 'year'):
        out[part] = getattr(stamp, part)

    # Map each periodic component onto the unit circle so that the end of a
    # cycle sits next to its start
    for part, period in (('hour', 24), ('dayofweek', 7), ('month', 12)):
        angle = 2 * np.pi * out[part] / period
        out[f'{part}_sin'] = np.sin(angle)
        out[f'{part}_cos'] = np.cos(angle)

    # 0/1 indicator flags
    out['is_weekend'] = (out['dayofweek'] >= 5).astype(int)
    late_evening = out['hour'] >= 22
    early_morning = out['hour'] <= 6
    out['is_night'] = (late_evening | early_morning).astype(int)

    return out

def create_lag_features(df, cols, lags=[1, 3, 6]):
    """Append ``<col>_lag_<k>`` columns holding each column shifted down k rows.

    Operates on a copy of ``df``; the first k rows of every lag column are NaN.
    """
    out = df.copy()
    # Every (column, offset) combination, column-major like a nested loop
    pairs = [(name, step) for name in cols for step in lags]
    for name, step in pairs:
        out[f'{name}_lag_{step}'] = out[name].shift(step)
    return out

def create_rolling_features(df, cols, windows=[3, 6]):
    """Append trailing rolling mean and std columns for each column/window.

    Operates on a copy of ``df``. Windows are counted in rows and evaluated
    with ``min_periods=1``, so the leading rows use however many values exist.
    """
    out = df.copy()
    for name in cols:
        for span in windows:
            # One rolling object serves both statistics
            roller = out[name].rolling(window=span, min_periods=1)
            out[f'{name}_rolling_mean_{span}'] = roller.mean()
            out[f'{name}_rolling_std_{span}'] = roller.std()
    return out

def _build_pipeline(params):
    """Assemble a fresh, unfitted preprocessing + multi-output boosting pipeline.

    New imputer/scaler instances are created on every call so that no two
    pipelines share (and refit) the same transformer objects.

    Args:
        params: Keyword arguments forwarded to HistGradientBoostingRegressor.

    Returns:
        Pipeline: median imputation -> standard scaling -> MultiOutputRegressor
        wrapping a HistGradientBoostingRegressor with ``random_state=42``.
    """
    preprocessor = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    regressor = HistGradientBoostingRegressor(random_state=42, **params)
    return Pipeline([
        ('preprocessor', preprocessor),
        ('model', MultiOutputRegressor(regressor))
    ])


def main():
    """Run the full pipeline: load, featurize, tune, fit, predict and save.

    Reads ``train.csv``, ``test.csv`` and ``submission.csv`` from the working
    directory, engineers time/lag/rolling features, selects hyperparameters
    and a classification threshold on the last 20% of the training rows,
    refits on all training rows, and writes ``final_submission.csv``.

    Raises:
        RuntimeError: If no hyperparameter combination yields a score below
            infinity on the validation split.
    """
    print("=== Complete Multi-Task Learning Pipeline ===")

    # Step 1: Load Data
    print("\n1. Loading Data...")
    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")
    submission = pd.read_csv("submission.csv")

    print(f"   Train shape: {train.shape}")
    print(f"   Test shape: {test.shape}")

    # Step 2: Feature Engineering
    print("\n2. Feature Engineering...")

    # Add time features
    train = add_time_features(train)
    test = add_time_features(test)

    # Create lag and rolling features for important weather variables
    important_cols = ['temp_2m_C', 'rel_humidity_2m_pct', 'pressure_msl_hPa', 'wind_speed_10m_kph']
    train = create_lag_features(train, important_cols)
    test = create_lag_features(test, important_cols)

    train = create_rolling_features(train, important_cols)
    test = create_rolling_features(test, important_cols)

    # Add lag features for precipitation only in train set.
    # NOTE: these columns are never created on test, so the common-column
    # selection below drops them from the model inputs.
    train = create_lag_features(train, ['precipitation_mm'])
    train = create_rolling_features(train, ['precipitation_mm'])

    print(f"   Train features: {train.shape[1]}")
    print(f"   Test features: {test.shape[1]}")

    # Step 3: Prepare Features and Targets
    print("\n3. Preparing Features and Targets...")

    # Only use features that exist in both train and test sets. Iterate over
    # train.columns (not a set) so the feature order is identical on every
    # run; set iteration order of strings varies between interpreter sessions.
    test_cols = set(test.columns)
    exclude_cols = {'ID', 'time', 'precipitation_mm', 'electricity_shutdown'}
    feature_cols = [
        col for col in train.columns
        if col in test_cols and col not in exclude_cols
    ]

    X_train = train[feature_cols]
    y_train_reg = train['precipitation_mm']
    y_train_clf = train['electricity_shutdown'].astype(int)
    X_test = test[feature_cols]

    print(f"   Number of features: {len(feature_cols)}")
    print(f"   Class distribution: {np.bincount(y_train_clf)}")

    # Step 4: Time-Aware Train/Validation Split
    # The first 80% of rows are used for fitting, the last 20% for validation
    # (no shuffling, so row order is preserved).
    print("\n4. Time-Aware Train/Validation Split...")
    split_idx = int(0.8 * len(X_train))
    X_train_split = X_train.iloc[:split_idx]
    X_val = X_train.iloc[split_idx:]
    y_reg_split = y_train_reg.iloc[:split_idx]
    y_reg_val = y_train_reg.iloc[split_idx:]
    y_clf_split = y_train_clf.iloc[:split_idx]
    y_clf_val = y_train_clf.iloc[split_idx:]

    print(f"   Training split: {X_train_split.shape}")
    print(f"   Validation split: {X_val.shape}")

    # Step 5: Hyperparameter Tuning
    print("\n5. Hyperparameter Tuning...")

    best_score = float('inf')
    best_params = None
    best_threshold = 0.5

    param_combinations = [
        {'max_iter': 500, 'learning_rate': 0.05, 'max_depth': None, 'min_samples_leaf': 20},
        {'max_iter': 300, 'learning_rate': 0.1, 'max_depth': 8, 'min_samples_leaf': 10},
        {'max_iter': 400, 'learning_rate': 0.03, 'max_depth': None, 'min_samples_leaf': 30},
        {'max_iter': 600, 'learning_rate': 0.02, 'max_depth': 10, 'min_samples_leaf': 25},
        {'max_iter': 200, 'learning_rate': 0.15, 'max_depth': 6, 'min_samples_leaf': 15},
    ]

    # Both targets are stacked column-wise (regression first, then the 0/1
    # label); this does not depend on the parameters, so build it once.
    y_split_combined = np.column_stack([y_reg_split, y_clf_split])

    for i, params in enumerate(param_combinations):
        print(f"   Trying combination {i+1}/{len(param_combinations)}...")

        pipeline = _build_pipeline(params)
        pipeline.fit(X_train_split, y_split_combined)

        val_preds = pipeline.predict(X_val)
        val_reg_pred = val_preds[:, 0]
        # The label is fitted as a regression target, so clip the output
        # into [0, 1] before treating it as a probability-like score.
        val_clf_proba = np.clip(val_preds[:, 1], 0.0, 1.0)

        threshold, _ = find_best_threshold(y_clf_val, val_clf_proba)
        score, f1, rmse = custom_metric(y_reg_val, val_reg_pred, y_clf_val, val_clf_proba, threshold)

        print(f"     RMSE: {rmse:.4f}, F1: {f1:.4f}, Score: {score:.4f}, Threshold: {threshold:.3f}")

        # Lower is better for RMSE * (1 - F1)
        if score < best_score:
            best_score = score
            best_params = params
            best_threshold = threshold

    if best_params is None:
        # Every candidate scored inf/NaN; fail here with a clear message
        # rather than later when unpacking ``**None``.
        raise RuntimeError(
            "Hyperparameter tuning failed: no parameter combination produced a finite validation score"
        )

    print(f"\n   Best parameters: {best_params}")
    print(f"   Best threshold: {best_threshold:.3f}")
    print(f"   Best score: {best_score:.4f}")

    # Step 6: Train Final Model
    print("\n6. Training Final Model...")

    final_pipeline = _build_pipeline(best_params)

    y_train_combined = np.column_stack([y_train_reg, y_train_clf])
    final_pipeline.fit(X_train, y_train_combined)

    # Step 7: Make Predictions
    print("\n7. Making Predictions...")

    test_preds = final_pipeline.predict(X_test)
    test_reg_pred = test_preds[:, 0]
    test_clf_proba = np.clip(test_preds[:, 1], 0.0, 1.0)
    # Reuse the threshold chosen on the validation split
    test_clf_pred = (test_clf_proba >= best_threshold).astype(int)

    print(f"   Test predictions shape: {test_preds.shape}")
    print(f"   Precipitation range: {test_reg_pred.min():.3f} to {test_reg_pred.max():.3f}")
    print(f"   Electricity shutdown predictions: {np.bincount(test_clf_pred)}")

    # Step 8: Save Results
    print("\n8. Saving Results...")

    # Predictions are written positionally: row k of test -> row k of submission
    submission['precipitation_mm'] = test_reg_pred
    submission['electricity_shutdown'] = test_clf_pred
    submission.to_csv("final_submission.csv", index=False)

    print("   Saved predictions to final_submission.csv")

    # Step 9: Summary Statistics
    print("\n9. Summary Statistics...")
    print(f"   Number of predicted electricity shutdowns: {test_clf_pred.sum()}")
    print(f"   Percentage of predicted shutdowns: {test_clf_pred.mean()*100:.2f}%")
    print(f"   Original training shutdown percentage: {y_train_clf.mean()*100:.2f}%")

    print("\n=== Pipeline Complete! ===")
    print("Key Learnings:")
    print("1. Time series data requires time-aware splitting")
    print("2. Cyclical encoding of time features improves performance")
    print("3. Lag and rolling features capture temporal patterns")
    print("4. Class imbalance requires threshold optimization")
    print("5. Multi-output models can handle both tasks simultaneously")
    print("6. Custom metrics guide hyperparameter selection")

# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
