In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

# Load the dataset
file_path = 'train.csv'  # Replace with your CSV file path
data = pd.read_csv(file_path)

# Applying logarithmic transformations
data['full_sq_log'] = np.log1p(data['full_sq'])
data['leisure_count_500_log'] = np.log1p(data['leisure_count_500'])
data['life_sq_log'] = np.log1p(data['life_sq'])
data['cafe_count_500_price_high_log'] = np.log1p(data['cafe_count_500_price_high'])

# Add log-transformed features to the list of key features
key_features = ['full_sq', 'life_sq', 'floor', 'leisure_count_500', 'cafe_count_1000_price_high',
                'mosque_count_500', 'mosque_count_1000', 'cafe_count_500_price_high',
                'cafe_count_1000_price_high', 'cafe_count_500_price_4000', 'culture_objects_top_25_raion', 
                'leisure_count_1000', 'trc_sqm_500', 'church_count_500', 'cafe_count_1000_price_1500', 
                'mosque_count_1500', 'cafe_count_500', 'big_church_count_500',
                'public_transport_station_min_walk', 'metro_min_walk', 'kindergarten_km', 'preschool_km',
                'full_sq_log', 'life_sq_log', 'leisure_count_500_log', 'cafe_count_500_price_high_log']

data = data[key_features + ['price_doc']]

# Handling any potential infinities or NaNs
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)

# Splitting the data into features (X) and target variable (y)
X = data.drop('price_doc', axis=1)
y = data['price_doc']

# Splitting the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling using Robust Scaling
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'learning_rate': [0.05, 0.01],
    'n_estimators': [ 500, 1000],
    'max_depth': [4, 6, 8],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'gamma': [0.1]
}

xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',  # Use negative mean squared error for GridSearchCV
    verbose=1
)

grid_search.fit(X_train_scaled, y_train)

# Print the best parameters found by GridSearchCV
best_params = grid_search.best_params_
print("Best hyperparameters found by GridSearchCV:")
print(best_params)

# Train the model with the best parameters
best_xgb_model = grid_search.best_estimator_

# Make predictions on the validation set
y_val_pred = best_xgb_model.predict(X_val_scaled)

# Calculate RMSE on the validation set
rmse = sqrt(mean_squared_error(y_val, y_val_pred))
print(f"Validation RMSE: {rmse}")

# Print the number of features used
num_features_used = len(best_xgb_model.feature_importances_.nonzero()[0])
print(f"Number of features used: {num_features_used}")

# Load the test dataset
test_file_path = 'test.csv'  # Replace with your test CSV file path
test_data = pd.read_csv(test_file_path)

# Applying the same logarithmic transformations as in the training data
test_data['full_sq_log'] = np.log1p(test_data['full_sq'])
test_data['leisure_count_500_log'] = np.log1p(test_data['leisure_count_500'])
test_data['life_sq_log'] = np.log1p(test_data['life_sq'])
test_data['cafe_count_500_price_high_log'] = np.log1p(test_data['cafe_count_500_price_high'])

# Handling any potential infinities or NaNs
test_data.replace([np.inf, -np.inf], np.nan, inplace=True)
test_data.fillna(0, inplace=True)

# Selecting the same features as used in the training data
X_test = test_data[['full_sq', 'life_sq', 'floor', 'leisure_count_500', 'cafe_count_1000_price_high',
                    'mosque_count_500', 'mosque_count_1000', 'cafe_count_500_price_high',
                    'cafe_count_1000_price_high', 'cafe_count_500_price_4000', 'culture_objects_top_25_raion', 
                    'leisure_count_1000', 'trc_sqm_500', 'church_count_500', 'cafe_count_1000_price_1500', 
                    'mosque_count_1500', 'cafe_count_500', 'big_church_count_500',
                    'public_transport_station_min_walk', 'metro_min_walk', 'kindergarten_km', 'preschool_km',
                    'full_sq_log', 'life_sq_log', 'leisure_count_500_log', 'cafe_count_500_price_high_log']]

# Scale the test data using the same scaler as the training data
X_test_scaled = scaler.transform(X_test)

# Make predictions using the trained XGBoost model
predictions = best_xgb_model.predict(X_test_scaled)

# Prepare submission
submission = pd.DataFrame({
    'row ID': test_data['row ID'],  # Replace 'ID' with the identifier column of your test dataset
    'price_doc': predictions
})
submission.to_csv('predictions_xgboost_gridsearch.csv', index=False)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best hyperparameters found by GridSearchCV:
{'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 8, 'min_child_weight': 1, 'n_estimators': 500, 'subsample': 0.8}
Validation RMSE: 12707548.723359141
Number of features used: 26
