### Importing Required Libraries
- pandas for data manipulation
- numpy for numerical operations
- scikit-learn for bootstrap resampling (`sklearn.utils.resample`)
- XGBoost for gradient boosting


In [33]:
import pickle
import numpy as np
import xgboost as xgb
import pandas as pd
from sklearn.utils import resample

### Semi-supervised Learning with XGBoost and Bootstrap Sampling
 
This notebook implements an iterative semi-supervised learning approach to improve model performance by leveraging unlabeled test data. The process involves:
1. Loading and preprocessing training and test data
2. Training an XGBoost regression model on labeled data
3. Using bootstrap sampling to estimate prediction uncertainty



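### Loading the Pre-trained Model
Loading the previously saved XGBoost model (`best_xgb_model.pkl`) from its pickle file. Note that the training loop further down fits a fresh `XGBRegressor` and rebinds `model`, so this loaded model is not used by the loop itself.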


In [34]:
# Restore the pickled XGBoost model from the Kaggle input directory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so this must only ever be pointed at a trusted file.
# NOTE(review): the training loop later rebinds `model` to a freshly fitted
# XGBRegressor, so the object loaded here is not what the loop trains with.
model_path = "/kaggle/input/xgb-bestmodel-datathon2025/best_xgb_model.pkl"
with open(model_path, "rb") as model_file:
    model = pickle.load(model_file)

### Loading the Training and Test Datasets
Loading the preprocessed training data (labeled) and test data (unlabeled) used by the semi-supervised loop


In [35]:
# Read the preprocessed train and test splits in one pass; the tuple order
# (train first, test second) fixes which frame is bound to which name.
train_data, test_data = map(
    pd.read_csv,
    (
        "/kaggle/input/datathon2025/final_preprocessed_train.csv",
        "/kaggle/input/datathon2025/final_preprocessed_test.csv",
    ),
)

### Dropping the Stray Index Column
Removing the `Unnamed: 0` column from both datasets if it is present, so it is not treated as a model feature


In [36]:
# Remove the "Unnamed: 0" column from both frames when it exists
# (errors="ignore" makes this a no-op if the column is absent).
# Presumably this is a row index that was written out when the CSVs were
# saved -- TODO confirm against the preprocessing step.
train_data = train_data.drop(columns=["Unnamed: 0"], errors="ignore")
test_data = test_data.drop(columns=["Unnamed: 0"], errors="ignore")

### Loop Configuration
Initializing the iteration counter and the confidence threshold (in percent) that a prediction must exceed to be added to the training data


In [37]:
# State and settings for the semi-supervised loop:
#   iteration            -- number of completed self-training rounds
#   confidence_threshold -- a row is pseudo-labeled only when its
#                           Confidence_Percentage is strictly greater than this
iteration, confidence_threshold = 0, 60

### Semi-supervised Learning Loop
This code implements an iterative semi-supervised learning approach where:
1. An XGBoost model is trained on labeled data
2. The model makes predictions on unlabeled test data
3. Bootstrap sampling is used to estimate prediction uncertainty
4. High confidence predictions (above threshold) are added to training data
5. The process repeats until no predictions exceed the confidence threshold or the test set is empty


## Prepare training data


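### Running the Self-Training Loop
Training an XGBoost model each round, pseudo-labeling the high-confidence test rows, and saving the model and the enlarged training set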


In [38]:
# Self-training loop.
#
# Each round:
#   1. fit an XGBoost regressor on the current labeled data,
#   2. fit an ensemble of regressors on bootstrap resamples of the *training*
#      rows and predict every remaining test row with each of them,
#   3. turn the per-row spread of those predictions into a 0-100 confidence,
#   4. move the rows whose confidence exceeds `confidence_threshold` into the
#      training data, using the ensemble mean as their pseudo-label.
#
# The loop ends when the test pool is empty or when no row clears the threshold.
#
# Why the bootstrap is over training rows: resampling the test rows and
# predicting with one fixed model only shuffles which row sits at which
# position, so a column-wise mean/std would mix predictions of unrelated rows.
# Refitting on resampled training data and predicting the test rows in their
# original order keeps column i tied to test row i, so the spread is a genuine
# per-row estimate. The price is `n_samples` extra model fits per round.
while not test_data.empty:
    print(f"Iteration {iteration + 1}: Training model...")

    # Prepare training data
    X_train = train_data.drop(columns=["SalePrice"])
    y_train = train_data["SalePrice"]

    # Train the model that is pickled for this round
    model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Ensure test data does not contain 'SalePrice' or 'Confidence_Percentage'
    test_features = test_data.drop(columns=["SalePrice", "Confidence_Percentage"], errors="ignore")

    # Bootstrap ensemble for uncertainty estimation
    n_samples = 50
    bootstrap_preds = []

    for replicate in range(n_samples):
        # Draw training rows with replacement; X and y are resampled together
        # so features stay paired with their labels. The seed is distinct per
        # (round, replicate) so a fresh run reproduces the same draws.
        X_boot, y_boot = resample(
            X_train, y_train, random_state=iteration * n_samples + replicate
        )
        boot_model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, random_state=42)
        boot_model.fit(X_boot, y_boot)
        # Test rows are predicted in their original order, so every replicate
        # yields predictions aligned with test_data's rows.
        bootstrap_preds.append(boot_model.predict(test_features))

    # Shape: (n_samples, number of remaining test rows)
    bootstrap_preds = np.array(bootstrap_preds)

    # Per-row mean and standard deviation across the ensemble
    mean_predictions = bootstrap_preds.mean(axis=0)
    std_predictions = bootstrap_preds.std(axis=0)

    # Confidence is relative to the least certain row still in the pool:
    # the row with the largest spread scores ~0, a row with zero spread scores 100.
    # The 1e-6 term guards against division by zero when every spread is 0.
    confidence_percentage = 100 * (1 - (std_predictions / (std_predictions.max() + 1e-6)))
    confidence_percentage = np.clip(confidence_percentage, 0, 100)

    # Store results
    results_df = test_data.copy()
    results_df["SalePrice"] = mean_predictions
    results_df["Confidence_Percentage"] = confidence_percentage

    # Select high-confidence predictions
    filtered_df = results_df[results_df["Confidence_Percentage"] > confidence_threshold].drop(columns=["Confidence_Percentage"])

    if filtered_df.empty:
        print("No more high-confidence samples. Stopping training.")
        break

    # Update training and test data: pseudo-labeled rows join the training set
    # and are removed from the unlabeled pool (results_df shares test_data's index).
    train_data = pd.concat([train_data, filtered_df], ignore_index=True)
    test_data = test_data.drop(filtered_df.index).reset_index(drop=True)

    # index=False keeps the row index out of the file, so reading it back does
    # not produce a spurious "Unnamed: 0" column.
    train_data.to_csv(f"final_train_data_ssl{iteration+1}.csv", index=False)

    # Save the model fitted at the start of this round, i.e. before the
    # pseudo-labels selected above were merged into the training data.
    model_filename = f"updated_xgb_model_iteration_{iteration + 1}.pkl"
    with open(model_filename, "wb") as file:
        pickle.dump(model, file)
    print(f"Model saved: {model_filename}")

    iteration += 1

print("Semi-supervised learning process completed.")

Iteration 1: Training model...
Model saved: updated_xgb_model_iteration_1.pkl
Iteration 2: Training model...
Model saved: updated_xgb_model_iteration_2.pkl
Iteration 3: Training model...
Model saved: updated_xgb_model_iteration_3.pkl
Iteration 4: Training model...
Model saved: updated_xgb_model_iteration_4.pkl
Iteration 5: Training model...
Model saved: updated_xgb_model_iteration_5.pkl
Iteration 6: Training model...
No more high-confidence samples. Stopping training.
Semi-supervised learning process completed.
