In [None]:
import pickle
import numpy as np
import xgboost as xgb
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import optuna

In [None]:
# Read the preprocessed train and test splits from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/datathon2025/final_preprocessed_train.csv",
        "/kaggle/input/datathon2025/final_preprocessed_test.csv",
    )
)

In [None]:
# Preview only the first rows; enough to sanity-check the columns without
# rendering the whole frame in the notebook output.
train_data.head()

In [None]:
# Remove the "Unnamed: 0" column from both frames when it is present
# (errors="ignore" makes this a no-op if the column does not exist).
train_data, test_data = (
    frame.drop(columns=["Unnamed: 0"], errors="ignore")
    for frame in (train_data, test_data)
)

In [None]:
# Settings and state for the self-training loop further down.
# Test rows are promoted to the training set only when their
# "Confidence_Percentage" is strictly greater than this value.
confidence_threshold = 60
# Zero-based pass counter; the loop prints and names output files with
# `iteration + 1`.
iteration = 0

In [None]:
def objective(trial):
    """Optuna objective: hold-out RMSE of an XGBRegressor for one trial.

    Samples one hyperparameter configuration from `trial`, fits a regressor
    on the module-level `X_train` / `y_train`, and scores it on the
    module-level `X_test` / `y_test`. Those four globals must be bound
    before `study.optimize` calls this function.

    Parameters
    ----------
    trial : optuna trial object
        Used to draw the hyperparameter values.

    Returns
    -------
    float
        Root mean squared error on the hold-out split (lower is better).
    """
    # Keep the suggest calls in this order: each one draws from the trial.
    search_space = {
        "n_estimators": trial.suggest_int("n_estimators", 800, 1500, step=50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 1),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 1),
    }

    regressor = xgb.XGBRegressor(random_state=42, **search_space)
    regressor.fit(X_train, y_train)

    holdout_mse = mean_squared_error(y_test, regressor.predict(X_test))
    return np.sqrt(holdout_mse)

In [None]:
# Self-training (pseudo-labelling) loop.
#
# Each pass:
#   1. tunes XGBoost hyperparameters on a hold-out split of the current
#      training set,
#   2. fits a final model on the whole training set,
#   3. estimates per-row prediction uncertainty for the remaining test rows
#      with a bootstrap ensemble,
#   4. moves the rows whose confidence exceeds `confidence_threshold` into the
#      training set, using the ensemble mean as their "SalePrice".
#
# The loop ends when the test set is exhausted or when no row clears the
# threshold.

# Number of bootstrap models fitted per pass for the uncertainty estimate.
n_samples = 40

while not test_data.empty:
    print(f"Iteration {iteration + 1}: Hyperparameter tuning and training model...")

    # Prepare training data
    X = train_data.drop(columns=["SalePrice"])
    y = train_data["SalePrice"]
    # objective() reads X_train / X_test / y_train / y_test as globals, so the
    # hold-out split has to be rebound here before every study.
    # NOTE: after the first pass train_data also contains pseudo-labelled rows,
    # so the hold-out RMSE is then partly measured against model predictions.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Perform hyperparameter tuning
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=200)
    best_params = study.best_params
    best_rmse = study.best_value

    print(f"Best hyperparameters found: {best_params}")
    print(f"Best RMSE obtained: {best_rmse}")

    # Train final model with best hyperparameters
    model = xgb.XGBRegressor(**best_params, random_state=42)
    model.fit(X, y)

    # Predict on test set (point predictions of the final model; the
    # pseudo-labels below come from the bootstrap ensemble mean).
    test_features = test_data.drop(columns=["SalePrice", "Confidence_Percentage"], errors="ignore")
    predictions = model.predict(test_features)

    # Bootstrap ensemble for uncertainty estimation.
    # The TRAINING rows are resampled with replacement and a fresh model is
    # fitted on every resample; each model then predicts the same, unshuffled
    # test_features. Column j of bootstrap_preds therefore always refers to
    # test row j, so the per-column mean/std are per-row statistics.
    # (Resampling the test rows instead would shuffle them and mix predictions
    # of unrelated rows in each column.)
    bootstrap_preds = []
    for seed in range(n_samples):
        # Seeded so a rerun reproduces the same resamples and models.
        X_boot, y_boot = resample(X, y, random_state=seed)
        boot_model = xgb.XGBRegressor(**best_params, random_state=seed)
        boot_model.fit(X_boot, y_boot)
        bootstrap_preds.append(boot_model.predict(test_features))
    bootstrap_preds = np.array(bootstrap_preds)  # shape: (n_samples, n_test_rows)

    # Compute mean and standard deviation per test row
    mean_predictions = bootstrap_preds.mean(axis=0)
    std_predictions = bootstrap_preds.std(axis=0)

    # Confidence is relative to the least stable row of this pass: the row with
    # the largest std gets ~0, a row with zero std gets 100. The 1e-6 guards
    # against division by zero when every std is 0.
    confidence_percentage = 100 * (1 - (std_predictions / (std_predictions.max() + 1e-6)))
    confidence_percentage = np.clip(confidence_percentage, 0, 100)

    # Store results
    results_df = test_data.copy()
    results_df["SalePrice"] = mean_predictions
    results_df["Confidence_Percentage"] = confidence_percentage

    # Select high-confidence predictions
    filtered_df = results_df[results_df["Confidence_Percentage"] > confidence_threshold].drop(columns=["Confidence_Percentage"])

    if filtered_df.empty:
        print("No more high-confidence samples. Stopping training.")
        break

    # Update training and test data. filtered_df keeps test_data's index
    # labels, so they can be used directly to drop the promoted rows.
    train_data = pd.concat([train_data, filtered_df], ignore_index=True)
    test_data = test_data.drop(filtered_df.index).reset_index(drop=True)

    # Print remaining test dataset size
    print(f"Remaining test dataset size: {len(test_data)}")

    train_data.to_csv(f"final_train_data_ssl{iteration+1}.csv", index=False)

    # Save updated model
    model_filename = f"updated_xgb_model_iteration_{iteration + 1}.pkl"
    with open(model_filename, "wb") as file:
        pickle.dump(model, file)
    print(f"Model saved: {model_filename}")

    iteration += 1

print("Semi-supervised learning process completed.")
