# In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# --- Import libraries for modeling ---
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import AUC

# --- Load datasets ---
df_train = pd.read_csv('/kaggle/input/playground-series-s5e3/train.csv')
df_test  = pd.read_csv('/kaggle/input/playground-series-s5e3/test.csv')
df_sub = pd.read_csv('/kaggle/input/playground-series-s5e3/sample_submission.csv')

# Drop identifier column (it carries no predictive signal)
df_train = df_train.drop(columns=['id'])
df_test = df_test.drop(columns=['id'])

# Fill missing values in test data (if any).
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column selection is chained assignment and may update a temporary
# copy instead of df_test itself.
if df_test['winddirection'].isnull().any():
    df_test['winddirection'] = df_test['winddirection'].fillna(df_test['winddirection'].median())

# --- Merge External Data (if applicable) ---
df_ext = pd.read_csv('/kaggle/input/rainfall-prediction-using-machine-learning/Rainfall.csv')
# Several external headers carry stray whitespace ('pressure ', 'humidity ',
# 'cloud ', '         winddirection'). Strip all of them so that every external
# column lines up with its competition counterpart on concat; renaming only a
# hand-picked subset left the padded wind-direction column unaligned, so its
# values were thrown away and replaced by the median.
df_ext.columns = df_ext.columns.str.strip()
df_train = pd.concat([df_train, df_ext], axis=0, ignore_index=True)
# The external target is stored as 'yes'/'no'; map it onto the 1/0 encoding
df_train = df_train.replace({'yes': 1, 'no': 0})

# --- Feature Engineering ---
def create_weather_features(df):
    """Append derived weather columns to ``df`` in place and return it.

    Reads the columns ``maxtemp``, ``mintemp``, ``temparature``, ``dewpoint``,
    ``humidity``, ``cloud``, ``sunshine``, ``windspeed``, ``pressure`` and
    ``day``. Adds 21 new columns (in a fixed order) and finally converts
    ``day`` with ``pd.to_datetime(..., errors='coerce')``.

    Note: ``Pressure_Change`` is a row-to-row difference, so it depends on
    the row order of ``df``.
    """
    temp, humidity = df["temparature"], df["humidity"]
    cloud, sunshine = df["cloud"], df["sunshine"]
    wind, dew = df["windspeed"], df["dewpoint"]
    t_max, t_min = df["maxtemp"], df["mintemp"]

    # Wind-chill formula is only applied below 10 degrees; otherwise the raw
    # temperature is kept.
    wind_term = wind ** 0.16
    chill = 13.12 + 0.6215 * temp - 11.37 * wind_term + 0.3965 * temp * wind_term
    rho_air = 1.225

    # Insertion order of this dict defines the order of the new columns.
    derived = {
        "temp_Range": t_max - t_min,
        "Heat_Index": temp + 0.5 * (temp - 10) * (humidity / 100),
        "csr": cloud / (sunshine + 1e-5),
        "Dew_Dep": temp - dew,
        "Wind_Chill": np.where(temp < 10, chill, temp),
        "hsi": humidity * sunshine,
        "Cloud_Sun_Ratio": cloud / (sunshine + 1),
        "Pressure_Change": df["pressure"].diff().fillna(0),
        "wi": (0.4 * humidity) + (0.3 * cloud) - (0.3 * sunshine),
        "Wind_Power": 0.5 * rho_air * wind ** 3,
        "cloud + humidity": cloud + humidity,
        "cloud + humidity + sunshine": cloud + humidity + sunshine,
        "sp": sunshine / (sunshine + cloud + 1e-5),
        "cloud * sunshine": cloud * sunshine,
        "humidity * sunshine": humidity * sunshine,
        "rd": 100 - humidity,
        "HTI": temp + 0.2 * humidity,
        "ACI": (t_max + t_min) / 2 - (humidity / 2) + dew,
        "CSI": (sunshine - cloud) / (sunshine + cloud + 1),
        "hci": humidity * cloud,
        "WCI": temp - 0.5 * wind,
    }
    for column_name, values in derived.items():
        df[column_name] = values

    # 'day' is converted to datetime here; the caller drops it afterwards.
    df["day"] = pd.to_datetime(df["day"], errors="coerce")
    return df

# Derive the engineered columns for both splits (train first, then test)
df_train, df_test = (create_weather_features(frame) for frame in (df_train, df_test))

# Fill missing values using the median (only numeric columns).
# Each frame is imputed with its own medians.
df_train = df_train.fillna(df_train.median(numeric_only=True))
df_test = df_test.fillna(df_test.median(numeric_only=True))

# Drop unneeded columns ('day' and any extra misnamed columns);
# errors='ignore' makes each drop a no-op when the column is absent.
df_train = df_train.drop(columns=['day', '         winddirection'], errors='ignore')
df_test = df_test.drop(columns=['day'], errors='ignore')

# Separate target and features
y = df_train['rainfall']
df_train = df_train.drop(columns=['rainfall'])
X = df_train.copy()

# --- Deep Learning Model (Conv1D) ---
# Standardise the features; the scaler is fitted on the training matrix only
# and then re-used for the test matrix.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(df_test)

# Stratified 80/20 hold-out; the validation part drives both callbacks below
X_train_dl, X_val_dl, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Add a trailing channel axis for Conv1D: (samples, features) -> (samples, features, 1)
X_train_dl = X_train_dl[:, :, np.newaxis]
X_val_dl = X_val_dl[:, :, np.newaxis]
X_test_dl = X_test_scaled[:, :, np.newaxis]

# Build the Conv1D model layer by layer:
# two conv/pool/dropout stages followed by a small dense head.
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu',
                 input_shape=(X_train_dl.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))
model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # single probability output

optimizer = Adam(learning_rate=0.0001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=[AUC(name='auc')],
)

# Both callbacks watch the validation loss: stop after 20 stagnant epochs
# (restoring the best weights), cut the learning rate x0.1 after 10.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=20, restore_best_weights=True, verbose=1
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=10, min_lr=1e-5, verbose=1
)

history = model.fit(
    X_train_dl,
    y_train,
    validation_data=(X_val_dl, y_val),
    epochs=200,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)

# Get predictions from the deep learning model on test set (flattened to 1-D)
test_preds_dl = model.predict(X_test_dl).flatten()

# --- LightGBM Model using StratifiedKFold ---
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Arrays to store out-of-fold predictions and test predictions
lgbm_oof_preds = np.zeros(len(X))
lgbm_test_preds = np.zeros(len(df_test))

lgb_params = {
    'max_depth': 11, 
    'num_leaves': 152, 
    'learning_rate': 0.1284552398987031,
    'feature_fraction': 0.699205159877181965, 
    'bagging_fraction': 0.964565149953761379, 
    'bagging_freq': 3, 
    'min_child_samples': 93, 
    'lambda_l1': 1.4313376954414664e-09, 
    'lambda_l2': 5.9890062038048195, 
    'min_gain_to_split': 0.5655566115584013
}

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train_lgb = X.iloc[train_idx]
    X_val_lgb = X.iloc[val_idx]
    y_train_lgb = y.iloc[train_idx]
    y_val_lgb = y.iloc[val_idx]

    # Logging is silenced through the estimator's `verbosity` parameter.
    # The `verbose` keyword of fit() was removed in LightGBM 4.0, where
    # passing it raises a TypeError.
    lgbm_model = LGBMClassifier(**lgb_params, verbosity=-1)
    lgbm_model.fit(X_train_lgb, y_train_lgb,
                   eval_set=[(X_val_lgb, y_val_lgb)],
                   eval_metric='auc')

    # Out-of-fold probabilities for the held-out rows; test probabilities
    # are averaged over the folds.
    lgbm_oof_preds[val_idx] = lgbm_model.predict_proba(X_val_lgb)[:, 1]
    lgbm_test_preds += lgbm_model.predict_proba(df_test)[:, 1] / n_splits

overall_auc_lgbm = roc_auc_score(y, lgbm_oof_preds)
print("Overall AUC (LightGBM):", overall_auc_lgbm)

# --- CatBoost Model using StratifiedKFold ---
# First, label encode any categorical features (if they exist).
# The encoder is fitted on train and test values together and applied to
# both frames, so the test matrix uses the same integer codes as the
# training matrix and never contains a label unseen at fit time.
categorical_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    le.fit(pd.concat([X[col], df_test[col]], ignore_index=True))
    X[col] = le.transform(X[col])
    df_test[col] = le.transform(df_test[col])
    label_encoders[col] = le

cat_features = [X.columns.get_loc(col) for col in categorical_columns]

acc_list = []
f1_list = []
roc_auc_list = []
skf_cat = StratifiedKFold(n_splits=5, shuffle=True, random_state=41)

# Test probabilities are averaged over all folds (as done for LightGBM)
# instead of being taken from whichever model was trained last.
cat_test_preds = np.zeros(len(df_test))

for fold, (train_idx, val_idx) in enumerate(skf_cat.split(X, y), 1):
    X_train_cat = X.iloc[train_idx]
    X_val_cat = X.iloc[val_idx]
    y_train_cat = y.iloc[train_idx]
    y_val_cat = y.iloc[val_idx]

    # Early stopping and log frequency are configured once, on the estimator
    cat_model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.1,
        depth=6,
        cat_features=cat_features,
        verbose=200,
        early_stopping_rounds=10
    )

    cat_model.fit(X_train_cat, y_train_cat, eval_set=(X_val_cat, y_val_cat))

    y_pred_cat = cat_model.predict(X_val_cat)
    y_pred_proba_cat = cat_model.predict_proba(X_val_cat)[:, 1]
    acc = accuracy_score(y_val_cat, y_pred_cat)
    f1 = f1_score(y_val_cat, y_pred_cat, average="weighted")
    roc_auc = roc_auc_score(y_val_cat, y_pred_proba_cat)
    acc_list.append(acc)
    f1_list.append(f1)
    roc_auc_list.append(roc_auc)
    print(f"CatBoost Fold {fold} - Accuracy: {acc:.4f}, F1-Score: {f1:.4f}, ROC-AUC: {roc_auc:.4f}")

    cat_test_preds += cat_model.predict_proba(df_test)[:, 1] / skf_cat.n_splits

print("CatBoost CV - Mean Accuracy:", np.mean(acc_list))
print("CatBoost CV - Mean F1-Score:", np.mean(f1_list))
print("CatBoost CV - Mean ROC-AUC:", np.mean(roc_auc_list))

# --- Final Ensemble and Submission ---
# Combine predictions from LightGBM, CatBoost, and the Deep Learning model.
# (Weights here are arbitrary and can be tuned)
# The weights keep the 1 : 1 : 2 ratio of the earlier
# `(lgbm + cat) * 0.5 + dl` blend but are normalised to sum to 1. With a
# weight sum of 2 the blend could reach 2.0, and the clip below then mapped
# every value above 1 to exactly 1, destroying the ranking among the most
# confident predictions.
ensemble_preds = 0.25 * lgbm_test_preds + 0.25 * cat_test_preds + 0.5 * test_preds_dl

# Ensure final predictions are in the range [0, 1]
# (a safeguard only: a convex blend of probabilities already lies in that range)
ensemble_preds = np.clip(ensemble_preds, 0, 1)

df_sub['rainfall'] = ensemble_preds
df_sub.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")

# In [None]:
# ============================================================
# 1. IMPORTS & SETUP
# ============================================================

import numpy as np
import pandas as pd
import gc
import optuna
import time

from copy import deepcopy
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Suppress warnings
import warnings
warnings.simplefilter("ignore")

# ============================================================
# 2. DATA LOADING & INITIAL PROCESSING
# ============================================================

# Read competition data and an external "original" dataset; the misspelled
# 'temparature' header is corrected right after each read.
train = pd.read_csv('/kaggle/input/playground-series-s5e3/train.csv')
train = train.rename(columns={'temparature': 'temperature'})
test = pd.read_csv('/kaggle/input/playground-series-s5e3/test.csv')
test = test.rename(columns={'temparature': 'temperature'})
original = pd.read_csv("/kaggle/input/rainfall-prediction-using-machine-learning/Rainfall.csv")
original = original.rename(columns={'temparature': 'temperature'})
submission = pd.read_csv("/kaggle/input/playground-series-s5e3/sample_submission.csv")

# Drop identifier columns (no-op when 'id' is absent)
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

# External target: 'yes'/'no' strings -> 1/0; then strip whitespace from
# the external column names.
original['rainfall'] = original['rainfall'].map({'yes': 1, 'no': 0})
original.columns = original.columns.str.strip()

# Mark the data source: 0 = competition rows, 1 = external rows
train["original"] = 0
test["original"] = 0
original["original"] = 1

# Append the external rows to the training data with a fresh 0..n-1 index
train = pd.concat([train, original], axis=0, ignore_index=True)
target = 'rainfall'

# ============================================================
# 3. PREPROCESSING FUNCTIONS
# ============================================================

def fix_day_sequence_with_month_year(df, day_column='day', cycle_length=365):
    """
    Corrects the 'day' column sequence and creates 'month' and 'year' columns.

    Rows are processed in their existing order. The first row is always
    assigned day 1, month 1, year 1, whatever value it holds. For each later
    row the recorded value is kept when it equals the expected next day, or
    when it is within 5 of it and does not exceed ``cycle_length``; otherwise
    it is replaced by the expected next day (previous day + 1, wrapping to 1
    after ``cycle_length``). The year counter is incremented every time the
    resulting day equals 1. The month is looked up in a fixed non-leap-year
    calendar; a day above 365 keeps the month of the previous row.

    Parameters:
        df (pd.DataFrame): Input frame; it is not modified.
        day_column (str): Name of the column holding the day values.
        cycle_length (int): Number of days after which the sequence wraps to 1.

    Returns:
        pd.DataFrame: Copy of ``df`` with the corrected day column plus
        'month' and 'year'. An empty frame yields an empty result with the
        same columns added.
    """
    df_fixed = df.copy()
    days = df_fixed[day_column].values
    # Output arrays inherit the dtype of the day column
    fixed_days = np.zeros_like(days)
    months = np.zeros_like(days)
    years = np.ones_like(days)  # start with year 1

    # Last day-of-year of each month in a non-leap year: 31, 59, 90, ..., 365
    month_ends = []
    running_total = 0
    for days_in_month in (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31):
        running_total += days_in_month
        month_ends.append(running_total)

    current_day = 1
    current_month = 1
    current_year = 1

    # Seed the first row only when there is one, so an empty frame passes
    # through instead of failing on the index access.
    if len(days) > 0:
        fixed_days[0] = current_day
        months[0] = current_month
        years[0] = current_year

    for i in range(1, len(days)):
        expected_next = current_day + 1 if current_day < cycle_length else 1
        current_value = days[i]
        # A wrap-around (last day of the cycle followed by a recorded 1) needs
        # no separate check: expected_next is 1 in that case, so the equality
        # test already covers it.
        if (current_value == expected_next) or (
            abs(current_value - expected_next) <= 5 and current_value <= cycle_length
        ):
            current_day = current_value
        else:
            current_day = expected_next

        if current_day == 1:
            current_year += 1

        # First month whose last day is not before current_day; when none
        # matches (day > 365) the previous month is kept.
        for month, last_day in enumerate(month_ends, 1):
            if current_day <= last_day:
                current_month = month
                break

        fixed_days[i] = current_day
        months[i] = current_month
        years[i] = current_year

    df_fixed[day_column] = fixed_days
    df_fixed['month'] = months
    df_fixed['year'] = years
    return df_fixed

def handle_missing_values(train_df, test_df, target="rainfall", n_components=1):
    """
    Imputes missing numeric values using the median (fitted on train) and adds indicator columns for missingness.
    If multiple indicators exist, applies Truncated SVD to reduce dimensionality.

    Parameters:
        train_df (pd.DataFrame): Training frame; may contain ``target``.
        test_df (pd.DataFrame): Test frame.
        target (str): Name of the target column, excluded from imputation.
        n_components (int): Requested number of SVD components for the
            missingness indicators; capped at the number of indicator columns.

    Returns:
        tuple: (train_imputed, test_imputed). Both hold the imputed common
        features. When more than one indicator exists and train has at least
        one missing value, the '<col>_missing' indicators are replaced by
        'missing_svd_<i>' columns; otherwise the indicators are kept as-is.
        Train additionally gets its train-only columns and the target back.
    """
    # Use only common features (excluding the target)
    common_features = [col for col in train_df.columns if col in test_df.columns and col != target]
    train_subset = train_df[common_features]
    test_subset = test_df[common_features]

    # Medians are learned on train only and re-used for test
    imp = SimpleImputer(strategy='median')
    imp.fit(train_subset)
    train_imputed_values = imp.transform(train_subset)
    test_imputed_values = imp.transform(test_subset)

    train_imputed = pd.DataFrame(train_imputed_values, columns=common_features, index=train_df.index)
    test_imputed = pd.DataFrame(test_imputed_values, columns=common_features, index=test_df.index)

    # Bring back non-common features from train (if any); these are not imputed
    non_common = [col for col in train_df.columns if col not in common_features and col != target]
    for col in non_common:
        train_imputed[col] = train_df[col]

    # Create missing indicators (1 where the raw value was NaN)
    indicator_cols = []
    for col in common_features:
        indicator = f'{col}_missing'
        train_imputed[indicator] = train_df[col].isna().astype(int)
        test_imputed[indicator] = test_df[col].isna().astype(int)
        indicator_cols.append(indicator)

    # Optionally reduce indicator dimensions using SVD if more than one exists
    if len(indicator_cols) > 1:
        from sklearn.decomposition import TruncatedSVD
        # Fixed seed: the default randomized solver is otherwise not
        # reproducible between runs.
        svd = TruncatedSVD(n_components=min(n_components, len(indicator_cols)), random_state=42)
        indicators_train = train_imputed[indicator_cols].values
        indicators_test = test_imputed[indicator_cols].values
        # Skip the SVD when train has no missing value at all
        if np.any(indicators_train):
            svd_train = svd.fit_transform(indicators_train)
            svd_test = svd.transform(indicators_test)
            # Iterate over the components actually produced; n_components may
            # have been capped above, and indexing by the requested number
            # would run past the last column.
            for i in range(svd_train.shape[1]):
                train_imputed[f'missing_svd_{i}'] = svd_train[:, i]
                test_imputed[f'missing_svd_{i}'] = svd_test[:, i]
            train_imputed.drop(columns=indicator_cols, inplace=True)
            test_imputed.drop(columns=indicator_cols, inplace=True)

    # Add back target to training data
    if target in train_df.columns:
        train_imputed[target] = train_df[target]
    return train_imputed, test_imputed

def engineer_features(df):
    """
    Builds a copy of ``df`` extended with derived weather features.

    Expects the columns maxtemp, mintemp, temperature, dewpoint, pressure,
    humidity, cloud, sunshine, windspeed, day and month. Difference and
    rolling features follow the row order of ``df``; the extreme-value flags
    use quantiles of the frame passed in. The input frame is left untouched.
    """
    out = df.copy()
    temp, humidity, pressure = out['temperature'], out['humidity'], out['pressure']
    dewpoint, cloud = out['dewpoint'], out['cloud']
    sunshine, wind = out['sunshine'], out['windspeed']

    def outside_central_90(series):
        # 1 where the value is above the 95th or below the 5th percentile
        too_high = series > series.quantile(0.95)
        too_low = series < series.quantile(0.05)
        return (too_high | too_low).astype(int)

    # Basic spreads, ratios and indices. Denominators are floored at 0.1.
    out['temp_range'] = out['maxtemp'] - out['mintemp']
    out['dewpoint_depression'] = temp - dewpoint
    out['pressure_change'] = pressure.diff().fillna(0)
    out['humidity_dewpoint_ratio'] = humidity / dewpoint.clip(lower=0.1)
    out['cloud_sunshine_ratio'] = cloud / sunshine.clip(lower=0.1)
    out['wind_humidity_factor'] = wind * (humidity / 100)
    out['temp_humidity_index'] = (0.8 * temp) + ((humidity / 100) * (temp - 14.3)) + 46.4
    # Second difference of pressure
    out['pressure_acceleration'] = out['pressure_change'].diff().fillna(0)

    # Day-of-year encoded on the unit circle (period 365)
    angle = 2 * np.pi * out['day'] / 365
    out['day_of_year_sin'] = np.sin(angle)
    out['day_of_year_cos'] = np.cos(angle)

    # Rolling means over 3, 7 and 14 rows (partial windows allowed)
    for window in (3, 7, 14):
        for source in ('temperature', 'pressure', 'humidity', 'cloud', 'windspeed'):
            rolled = out[source].rolling(window=window, min_periods=1).mean()
            out[f'{source}_rolling_{window}d'] = rolled

    # Change relative to three rows earlier
    for prefix, series in (('temp', temp), ('pressure', pressure), ('humidity', humidity)):
        out[f'{prefix}_trend_3d'] = series.diff(3).fillna(0)

    # Extreme weather indicators
    for suffix, series in (('temp', temp), ('humidity', humidity), ('pressure', pressure)):
        out[f'extreme_{suffix}'] = outside_central_90(series)

    # Pairwise products
    out['temp_humidity_interaction'] = temp * humidity
    out['pressure_wind_interaction'] = pressure * wind
    out['cloud_sunshine_interaction'] = cloud * sunshine
    out['dewpoint_humidity_interaction'] = dewpoint * humidity

    # Rolling standard deviations; windows with fewer than 4 rows give 0
    for window in (7, 14):
        for prefix, series in (('temp', temp), ('pressure', pressure), ('humidity', humidity)):
            spread = series.rolling(window=window, min_periods=4).std()
            out[f'{prefix}_std_{window}d'] = spread.fillna(0)

    # Quarter-style season from the precomputed 'month': 1-3 -> 1, ..., 10-12 -> 4
    out['season'] = ((out['month'] - 1) // 3) + 1
    return out

# ============================================================
# 4. APPLY PREPROCESSING & FEATURE ENGINEERING
# ============================================================

# Fix day sequence (adds month and year) for train, then test
train, test = (fix_day_sequence_with_month_year(frame) for frame in (train, test))

# Impute missing values and add missing indicators
train_imputed, test_imputed = handle_missing_values(
    train, test, target=target, n_components=1
)

# (Optional: One Hot Encode categorical features if needed using OHE function)

# Create new derived features
train_fe = engineer_features(train_imputed)
test_fe = engineer_features(test_imputed)

# ============================================================
# 5. FEATURE SELECTION & SCALING
# ============================================================

# Every column except the target is a feature (de-duplicated through a set)
final_features = list({col for col in train_fe.columns if col != target})

# Standardise the features: fit on train, apply the same transform to test.
# The target column of train_scaled is left unscaled.
scaler = StandardScaler()
train_scaled, test_scaled = train_fe.copy(), test_fe.copy()
train_scaled[final_features] = scaler.fit_transform(train_fe[final_features])
test_scaled[final_features] = scaler.transform(test_fe[final_features])

# Remove duplicate (redundant) features
def post_processor(train_df, test_df, target):
    """
    Drops feature columns that exactly duplicate an earlier column.

    Two columns count as duplicates when the sum of their absolute
    differences over the training rows is zero. The later column of each
    duplicate pair is removed from copies of both frames; the inputs are
    not modified.

    Returns:
        tuple: (train_copy, test_copy) without the redundant columns.
    """
    feature_names = train_df.drop(columns=[target]).columns
    train_out, test_out = train_df.copy(), test_df.copy()

    redundant = []
    for position, left in enumerate(feature_names):
        # Only compare against columns to the right, so each pair is seen once
        for right in feature_names[position + 1:]:
            total_abs_diff = np.sum(np.abs(train_out[left] - train_out[right]))
            if total_abs_diff == 0 and right not in redundant:
                redundant.append(right)

    return train_out.drop(columns=redundant), test_out.drop(columns=redundant)

# De-duplicate columns, then split the training frame into labels and inputs
train_cop, test_cop = post_processor(train_scaled, test_scaled, target)
y_train = train_cop[target]
X_train = train_cop.drop(columns=[target])
X_test = test_cop.copy()

# (Optional: Use a feature importance method to select top features)
def get_most_important_features(X_train, y_train, n, model_input):
    """
    Ranks features by mean XGBoost importance over 5 shuffled folds.

    Parameters:
        X_train (pd.DataFrame): Feature matrix; rows are selected positionally.
        y_train (pd.Series): Labels aligned positionally with ``X_train``.
        n (int): Number of top-ranked feature names to return.
        model_input: Not used by this implementation.

    Returns:
        list: The ``n`` column names with the highest average importance,
        most important first.
    """
    # Here we use XGBoost as an example
    model = xgb.XGBClassifier(
        n_jobs=-1,
        eval_metric='logloss',
        objective='binary:logistic',
        tree_method='hist',
        verbosity=0,
        random_state=42,
    )
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)

    fold_aucs = []  # validation AUC per fold; collected but not returned
    fold_importances = []
    for fit_rows, holdout_rows in splitter.split(X_train):
        model.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows], verbose=False)
        holdout_proba = model.predict_proba(X_train.iloc[holdout_rows])[:, 1]
        fold_aucs.append(roc_auc_score(y_train.iloc[holdout_rows], holdout_proba))
        fold_importances.append(model.feature_importances_)

    mean_importance = np.mean(fold_importances, axis=0)
    # Stable sort: ties keep their original column order
    ranked = sorted(zip(X_train.columns, mean_importance),
                    key=lambda pair: pair[1], reverse=True)
    return [name for name, _ in ranked[:n]]

# For example, select top 50 features from XGBoost importance and restrict
# both matrices to them.
top_features = get_most_important_features(X_train.reset_index(drop=True), y_train, 50, 'xgb')
X_train, X_test = X_train[top_features], X_test[top_features]

# ============================================================
# 6. MODELING & ENSEMBLE
# ============================================================

# Calculate class weights (useful for imbalanced classification):
# weight = n_samples / (n_classes * class_count)
classes = np.unique(y_train)
class_to_index = dict((cls, idx) for idx, cls in enumerate(classes))
y_train_numeric = np.array([class_to_index[cls] for cls in y_train])
class_counts = np.bincount(y_train_numeric)
total_samples = len(y_train_numeric)
class_weights = total_samples / (len(classes) * class_counts)
class_weights_dict = dict(zip(classes, class_weights))
print("Class weights:", class_weights_dict)

# Define a small set of models to use in the ensemble.
# Insertion order matters: it fixes the order of the blended predictions.
models = {
    'xgb': xgb.XGBClassifier(
        n_jobs=-1,
        eval_metric='logloss',
        objective='binary:logistic',
        tree_method='hist',
        random_state=42,
    ),
    'knn': KNeighborsClassifier(n_neighbors=101, p=1),
    'svm': SVC(probability=True, random_state=42),
}

# Define an Optuna-based ensemble weight optimizer
class OptunaWeights:
    """
    Searches blending weights for several prediction arrays with Optuna.

    Each weight is sampled from [-1, 1] and the blend is the weighted average
    ``np.average(preds, axis=0, weights=ws)``; the objective maximised is the
    ROC-AUC of that blend.

    Attributes:
        n_trials: Number of Optuna trials run by ``fit``.
        metric: Stored for reference only; ``fit`` always scores with ROC-AUC.
        random_state: Seed of the TPE sampler.
        weights: Best weights found (None until ``fit`` is called).
        study: The Optuna study (None until ``fit`` is called).
        best_score: Best objective value (None until ``fit`` is called).
        calibrated_threshold: Constant 0.5; not used by the methods here.
    """

    def __init__(self, n_trials=300, metric='auc', random_state=42):
        self.n_trials = n_trials
        self.metric = metric
        self.random_state = random_state
        self.weights = None
        self.study = None
        self.best_score = None
        self.calibrated_threshold = 0.5

    def fit(self, y_true, y_preds):
        """
        Runs the weight search.

        Parameters:
            y_true: True binary labels.
            y_preds: List of prediction arrays, one per model, all of the
                same length as ``y_true``.
        """
        stacked = np.array(y_preds)
        n_models = len(y_preds)

        def objective(trial):
            # suggest_float replaces the deprecated suggest_uniform
            ws = [trial.suggest_float(f'w{i}', -1, 1) for i in range(n_models)]
            try:
                weighted = np.average(stacked, axis=0, weights=ws)
            except ZeroDivisionError:
                # Weights summing to exactly zero cannot be normalised;
                # score the trial as the worst possible AUC instead of crashing.
                return 0.0
            return roc_auc_score(y_true, weighted)

        self.study = optuna.create_study(
            direction='maximize',
            sampler=optuna.samplers.TPESampler(seed=self.random_state),
        )
        self.study.optimize(objective, n_trials=self.n_trials, show_progress_bar=False)
        self.weights = [self.study.best_trial.params[f'w{i}'] for i in range(n_models)]
        self.best_score = self.study.best_value

    def predict(self, y_preds, normalize=True):
        """
        Blends ``y_preds`` with the stored weights.

        When ``weights`` is still None the blend is the plain mean. With
        ``normalize=True`` the result is clipped to [0, 1]. Because weights
        may be negative the raw blend can leave that range, and clipping
        then maps all out-of-range values to the same bound.
        """
        y_preds_np = [np.array(pred) for pred in y_preds]
        weighted_pred = np.average(np.array(y_preds_np), axis=0, weights=self.weights)
        if normalize:
            weighted_pred = np.clip(weighted_pred, 0, 1)
        return weighted_pred

    def fit_predict(self, y_true, y_preds, normalize=True):
        """Fits the weights on ``y_preds`` and returns the blend of those same predictions."""
        self.fit(y_true, y_preds)
        return self.predict(y_preds, normalize=normalize)

# Cross-validation with ensemble weighting
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_preds, test_preds = np.zeros(len(X_train)), np.zeros(len(X_test))
ensemble_scores, weights_list = [], []

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
    X_tr, y_tr = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # Per-model positive-class probabilities for this fold, keyed by model name
    fold_val_preds, fold_test_preds = {}, {}
    for name, model in models.items():
        model.fit(X_tr, y_tr)
        y_val_pred = model.predict_proba(X_val)[:, 1]
        y_test_pred = model.predict_proba(X_test)[:, 1]
        fold_val_preds[name], fold_test_preds[name] = y_val_pred, y_test_pred

        print(f'{name} [Fold {fold}] AUC: {roc_auc_score(y_val, y_val_pred):.5f}')

    # Blend weights are tuned on this fold's validation predictions; note
    # that the fold AUC below is measured on those same rows.
    optw = OptunaWeights(n_trials=300, random_state=42, metric='auc')
    ensemble_val_pred = optw.fit_predict(y_val, list(fold_val_preds.values()))
    fold_auc = roc_auc_score(y_val, ensemble_val_pred)
    oof_preds[val_idx] = ensemble_val_pred
    ensemble_scores.append(fold_auc)
    weights_list.append(optw.weights)

    # Blend the test predictions with the fold's weights; average over folds
    ensemble_test_pred = optw.predict(list(fold_test_preds.values()))
    test_preds += ensemble_test_pred / kf.n_splits

    print(f'Ensemble [Fold {fold}] AUC: {fold_auc:.5f}')

print("Overall Ensemble CV AUC:", np.mean(ensemble_scores))
print("Optimized Ensemble Weights (per fold):", weights_list)

# ============================================================
# 7. FINAL SUBMISSION
# ============================================================

submission[target] = test_preds
# index=False keeps the file in the layout of the sample submission it was
# read from; with index=True pandas writes the positional row index as an
# extra, unnamed first column.
submission.to_csv('submission_pure.csv', index=False)

# Optionally, save out-of-fold predictions for future stacking/analysis.
# Here the index is written on purpose: it identifies the training row.
oof_df = pd.DataFrame(oof_preds, index=X_train.index, columns=[target])
oof_df.to_csv('oof_preds.csv')

# In [None]:
# =============================================================================
# 1. SETUP & IMPORTS
# =============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from prettytable import PrettyTable

# For reproducibility
np.random.seed(42)

# =============================================================================
# 2. DATA LOADING
# =============================================================================
# Load the competition training data (indexed by 'id')
train = pd.read_csv('/kaggle/input/playground-series-s5e3/train.csv', index_col='id')

# Load the external dataset ("original"), strip whitespace from its column
# names and convert the 'yes'/'no' rainfall labels to 1/0.
original = (
    pd.read_csv('/kaggle/input/rainfall-prediction-using-machine-learning/Rainfall.csv')
    .rename(columns=str.strip)
    .assign(rainfall=lambda frame: frame['rainfall'].map({'yes': 1, 'no': 0}))
)

# Load sample submission (our own submission to be blended later)
submission = pd.read_csv('/kaggle/input/playground-series-s5e3/sample_submission.csv')

# =============================================================================
# 3. EXPLORATORY DATA ANALYSIS (EDA)
# =============================================================================
# --- 3.1 Target Distribution: Pie Charts ---

def plot_rainfall_distribution(data, title, ax, colors=None, shadow=True, startangle=90):
    """
    Draw a two-slice pie chart of the 'rainfall' class shares on ``ax``.

    Parameters:
        data (pd.DataFrame): Frame with a 'rainfall' column holding 0/1.
        title (str): Title placed above the chart.
        ax: Matplotlib axes to draw on.
        colors (list): Two colors (no rain, rain); defaults to blue and red.
        shadow (bool): Passed through to ``ax.pie``.
        startangle (float): Passed through to ``ax.pie``.
    """
    counts = data['rainfall'].value_counts().sort_index()
    # A class that never occurs contributes a zero-sized slice
    slice_sizes = [counts.get(label, 0) for label in (0, 1)]

    # Blue for no rain, Red for rain unless the caller supplies colors
    palette = ['#3498db', '#e74c3c'] if colors is None else colors

    _, label_texts, percent_texts = ax.pie(
        slice_sizes,
        explode=(0, 0.1),  # pull the rain slice out slightly for emphasis
        labels=['No Rain (0)', 'Rain (1)'],
        colors=palette,
        autopct='%1.1f%%',
        shadow=shadow,
        startangle=startangle,
        wedgeprops={'edgecolor': 'w', 'linewidth': 1}
    )

    # Bold slice labels; bold white percentage labels
    for label_text in label_texts:
        label_text.set_fontsize(12)
        label_text.set_fontweight('bold')
    for percent_text in percent_texts:
        percent_text.set_fontsize(11)
        percent_text.set_fontweight('bold')
        percent_text.set_color('white')

    # Sample-count box below the pie
    info_box = dict(boxstyle="round,pad=0.3", fc='#f0f0f0', ec='gray')
    ax.text(0, -1.2, f"Total samples: {len(data)}", ha='center', fontsize=10, bbox=info_box)
    ax.axis('equal')
    ax.set_title(title, fontsize=14, fontweight='bold', pad=20)

# Create subplots for the train and external datasets: one pie per dataset,
# each with its own two-color palette.
fig, axes = plt.subplots(1, 2, figsize=(18, 8))
custom_colors = [['#3498db', '#e74c3c'], ['#2ecc71', '#9b59b6']]
pie_panels = [
    (train, "Train Dataset: Rainfall Distribution"),
    (original, "Original Dataset: Rainfall Distribution"),
]
for pie_ax, pie_colors, (pie_data, pie_title) in zip(axes, custom_colors, pie_panels):
    plot_rainfall_distribution(pie_data, pie_title, pie_ax, colors=pie_colors)
fig.suptitle('Comparison of Rainfall Distribution Across Datasets', fontsize=16, fontweight='bold', y=0.98)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

# --- 3.2 Numerical Feature Distributions ---
# Identify continuous columns (non-object and with more than 2 unique values)
cont_cols = [col for col in train.columns if train[col].dtype != 'O' and train[col].nunique() > 2]
n_rows = len(cont_cols)

# Create a subplot grid with two columns (Train and Original) per feature
fig, axs = plt.subplots(n_rows, 2, figsize=(14, 4.5 * n_rows), dpi=100)
if n_rows == 1:
    axs = np.array([axs])  # ensure 2D array

# One entry per grid column: (data, label used in the title, palette).
# Each panel receives ONE flat two-color palette (no rain, rain), the same
# pair used for that dataset's pie chart. Passing the whole `custom_colors`
# list would hand seaborn a nested list of palettes rather than a list of
# colors.
violin_panels = [
    (train, 'Train', custom_colors[0]),
    (original, 'Original', custom_colors[1]),
]

for i, col in enumerate(cont_cols):
    for j, (panel_data, panel_label, panel_palette) in enumerate(violin_panels):
        panel_ax = axs[i, j]
        # Distribution of `col` split by rainfall class, quartiles drawn inside
        sns.violinplot(x='rainfall', y=col, data=panel_data, ax=panel_ax,
                       palette=panel_palette, inner='quartile', linewidth=1.5, cut=0)
        panel_ax.set_title(f'{col.title()} Distribution by Rainfall ({panel_label})',
                           fontsize=14, fontweight='bold', pad=10)
        panel_ax.set_xlabel('Rainfall', fontsize=12, labelpad=10)
        panel_ax.set_ylabel(col.title(), fontsize=12, labelpad=10)
        panel_ax.set_xticklabels(['No Rain (0)', 'Rain (1)'])
        sns.despine(ax=panel_ax)

plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

# =============================================================================
# 4. EXTERNAL BLEND OF SUBMISSIONS
# =============================================================================
# The following function computes ensemble predictions using Arithmetic, Geometric, or Harmonic Mean.

def ensemble_mean(sub_list, cols, mean="AM"):
    """
    Blend several submission dataframes by averaging their target columns.

    The result starts as a copy of the first dataframe, so every column not
    listed in `cols` (e.g. an id column) is taken from `sub_list[0]`. The
    input dataframes are not modified.

    Parameters:
        sub_list (list of pd.DataFrame): Submission dataframes with the same target columns.
            A dataframe may appear several times to give it more weight.
        cols (list): Target column names to blend.
        mean (str): "AM" for arithmetic mean, "GM" for geometric mean, "HM" for harmonic mean.

    Returns:
        pd.DataFrame: A blended submission dataframe.

    Raises:
        ValueError: If `mean` is not one of "AM", "GM" or "HM".
        IndexError: If `sub_list` is empty.
    """
    if mean not in ("AM", "GM", "HM"):
        # Previously an unknown mode silently returned an unblended copy of
        # the first submission; fail loudly instead.
        raise ValueError(f'Unknown mean {mean!r}; expected "AM", "GM" or "HM".')

    sub_out = sub_list[0].copy()
    n_subs = len(sub_list)

    for col in cols:
        if mean == "AM":
            sub_out[col] = sum(df[col] for df in sub_list) / n_subs
        elif mean == "GM":
            # Work in log space: multiplying many values in (0, 1) directly
            # underflows to 0.0 long before the n-th root is taken.
            # log(0) = -inf is expected and maps back to a mean of 0.
            with np.errstate(divide="ignore"):
                log_total = sum(np.log(df[col]) for df in sub_list)
            sub_out[col] = np.exp(log_total / n_subs)
        else:  # "HM"
            sub_out[col] = n_subs / sum(1 / df[col] for df in sub_list)
    return sub_out

# Read external submission files.
# NOTE: only some of these frames enter `sub_list` below; the others are loaded
# but not used in this section.
sub_ext1 = pd.read_csv("/kaggle/input/ps3e5-ensemble-ancillary/sub_exxt_vyacheslavbolotin_v114_95655.csv")
sub_ext2 = pd.read_csv("/kaggle/input/ps3e5-ensemble-ancillary/sub_ext_cdeotte_v1_95628.csv")
sub_ext3 = pd.read_csv("/kaggle/input/ps3e5-ensemble-ancillary/sub_ext_act18l_v17_91522.csv")
sub_ext4 = pd.read_csv("/kaggle/input/ps3e5-ensemble-ancillary/sub_ext_itasps_v23_91499.csv")
sub_ext5 = pd.read_csv("/kaggle/input/ps3e5-ensemble-ancillary/sub_ext_act18l_v9_91043.csv")
sub_ext6 = pd.read_csv("/kaggle/input/ps3e5-ensemble-ancillary/sub_ext_act18l_v9_90935.csv")
sub_ext7 = pd.read_csv("/kaggle/input/ps3e5-ensemble-ancillary/sub_ext_vyacheslavbolotin_v28_90581.csv")
sub_ext8 = pd.read_csv("/kaggle/input/ps3e5-ensemble-ancillary/sub_ext_vyacheslavbolotin_v20_89702.csv")
sub_ext9 = pd.read_csv("/kaggle/input/ps3e5-ensemble-ancillary/sub_ext_vyacheslavbolotin_v53_89568.csv")
sub_ext10 = pd.read_csv("/kaggle/input/ps3e5-ensemble-ancillary/sub_ext_samanyuk_v6_89541.csv")

# Combine external submissions with our own submission (7 frames in total).
# Adjust the list as needed; `weights` below must have one entry per frame.
sub_list = [sub_ext1, sub_ext2, sub_ext3, sub_ext4, sub_ext7, sub_ext10, submission]

target_columns = ['rainfall']

# Rescale each submission's predictions with its own MinMaxScaler so that all
# submissions share a comparable range.
# NOTE: the frames are modified in place, so `submission` itself is rescaled too.
for sub in sub_list:
    scaler = MinMaxScaler()
    for col in target_columns:
        sub[col] = scaler.fit_transform(sub[col].values.reshape(-1, 1)).flatten()

# Per-submission weights, positional with `sub_list`. After squaring, the first
# entry (sub_ext1) gets 7**2 = 49 and every other entry, including our own
# `submission`, gets 1.
# NOTE(review): if the intent is to favour our own submission, move the 7 to
# the last position.
weights = np.square([7, 1, 1, 1, 1, 1, 1])

# Fail with a clear message instead of leaving `weighted_list` undefined.
if len(sub_list) != len(weights):
    raise ValueError(
        f"Expected one weight per submission, got {len(weights)} weights "
        f"for {len(sub_list)} submissions."
    )

# Repeat each submission `weight` times to simulate weighted averaging.
weighted_list = [sub for sub, weight in zip(sub_list, weights) for _ in range(weight)]

# Compute ensemble submissions using different means
am_submission = ensemble_mean(weighted_list, target_columns, mean="AM")
gm_submission = ensemble_mean(weighted_list, target_columns, mean="GM")
hm_submission = ensemble_mean(weighted_list, target_columns, mean="HM")

# Save the blended submissions to CSV files
am_submission.to_csv('submission_blended_am.csv', index=False)
gm_submission.to_csv('submission_blended_gm.csv', index=False)
hm_submission.to_csv('submission_blended_hm.csv', index=False)

print("Blended submissions saved (Arithmetic, Geometric, and Harmonic).")
