In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
from math import sqrt
from xgboost import XGBRegressor

# Load the dataset
file_path = 'train.csv'  # Replace with your CSV file path
data = pd.read_csv(file_path)

# Applying logarithmic transformations
data['full_sq_log'] = np.log1p(data['full_sq'])
data['leisure_count_500_log'] = np.log1p(data['leisure_count_500'])
data['life_sq_log'] = np.log1p(data['life_sq'])
data['cafe_count_500_price_high_log'] = np.log1p(data['cafe_count_500_price_high'])

# Add log-transformed features to the list of key features
key_features = ['full_sq', 'life_sq', 'floor', 'leisure_count_500', 'cafe_count_1000_price_high',
                'mosque_count_500', 'mosque_count_1000', 'cafe_count_500_price_high',
                'cafe_count_1000_price_high', 'cafe_count_500_price_4000', 'culture_objects_top_25_raion', 
                'leisure_count_1000', 'trc_sqm_500', 'church_count_500', 'cafe_count_1000_price_1500', 
                'mosque_count_1500', 'cafe_count_500', 'big_church_count_500',
                'public_transport_station_min_walk', 'metro_min_walk', 'kindergarten_km', 'preschool_km',
                'full_sq_log', 'life_sq_log', 'leisure_count_500_log', 'cafe_count_500_price_high_log']
data = data[key_features + ['price_doc']]

# Handling any potential infinities or NaNs
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)

# Splitting the data into features (X) and target variable (y)
X = data.drop('price_doc', axis=1)
y = data['price_doc']

# Splitting the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling using Robust Scaling
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Define the XGBoost model
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Hyperparameter tuning for XGBoost
param_grid = {
    'n_estimators': [2000, 1500],
    'learning_rate': [0.001, 0.005],
    'max_depth': [5, 10]
}

grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

# Load test data
file_path2 = 'test.csv'  # Replace with your CSV file path
test_data = pd.read_csv(file_path2)

# Applying same transformations to test data
test_data['full_sq_log'] = np.log1p(test_data['full_sq'])
test_data['leisure_count_500_log'] = np.log1p(test_data['leisure_count_500'])
test_data['life_sq_log'] = np.log1p(test_data['life_sq'])
test_data['cafe_count_500_price_high_log'] = np.log1p(test_data['cafe_count_500_price_high'])

test_data = test_data[key_features]  # Only using key features
X_test_scaled = scaler.transform(test_data)

# Iterate over each combination of parameters
for i, params in enumerate(grid_search.cv_results_['params']):
    model = XGBRegressor(objective='reg:squarederror', random_state=42, **params)
    model.fit(X_train_scaled, y_train)
    
    # Make predictions on the test dataset
    test_predictions = model.predict(X_test_scaled)

    # Prepare the submission DataFrame using DataFrame index as identifier
    submission = pd.DataFrame({
        'row ID': test_data.index + 1,  # Using index + 1 as identifier
        'price_doc': test_predictions
    })

    # Save the submission file
    submission_filename = f'submission_{i}.csv'
    submission.to_csv(submission_filename, index=False)

    print(f"Created submission file: {submission_filename} with parameters: {params}")


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Created submission file: submission_0.csv with parameters: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 2000}
Created submission file: submission_1.csv with parameters: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 1500}
Created submission file: submission_2.csv with parameters: {'learning_rate': 0.001, 'max_depth': 10, 'n_estimators': 2000}
Created submission file: submission_3.csv with parameters: {'learning_rate': 0.001, 'max_depth': 10, 'n_estimators': 1500}
Created submission file: submission_4.csv with parameters: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 2000}
Created submission file: submission_5.csv with parameters: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 1500}
Created submission file: submission_6.csv with parameters: {'learning_rate': 0.005, 'max_depth': 10, 'n_estimators': 2000}
Created submission file: submission_7.csv with parameters: {'learning_rate': 0.005,