# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
import time
# Read the training set from disk
file_path = 'train.csv'  # Replace with your CSV file path
data = pd.read_csv(file_path)

# Derive log1p-transformed companions for four of the raw columns
# (mapping: new column name -> raw source column)
log_feature_sources = {
    'full_sq_log': 'full_sq',
    'leisure_count_500_log': 'leisure_count_500',
    'life_sq_log': 'life_sq',
    'cafe_count_500_price_high_log': 'cafe_count_500_price_high',
}
for log_column, raw_column in log_feature_sources.items():
    data[log_column] = np.log1p(data[raw_column])

# Columns fed to the model: the raw features followed by the log-transformed ones.
# NOTE(review): 'cafe_count_1000_price_high' appears twice in this list, so the
# selection below yields that column twice.
key_features = [
    'full_sq', 'life_sq', 'floor', 'leisure_count_500', 'cafe_count_1000_price_high',
    'mosque_count_500', 'mosque_count_1000', 'cafe_count_500_price_high',
    'cafe_count_1000_price_high', 'cafe_count_500_price_4000', 'culture_objects_top_25_raion',
    'leisure_count_1000', 'trc_sqm_500', 'church_count_500', 'cafe_count_1000_price_1500',
    'mosque_count_1500', 'cafe_count_500', 'big_church_count_500',
    'public_transport_station_min_walk', 'metro_min_walk', 'kindergarten_km', 'preschool_km',
    'full_sq_log', 'life_sq_log', 'leisure_count_500_log', 'cafe_count_500_price_high_log',
]
data = data[key_features + ['price_doc']]

# Turn +/-inf into NaN, then zero-fill every NaN
data = data.replace([np.inf, -np.inf], np.nan).fillna(0)

# Separate the feature matrix (X) from the target (y)
X = data.drop(columns=['price_doc'])
y = data['price_doc']

# Hold out 20% of the rows for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Robust scaling: fit on the training split only, then apply to both splits
scaler = RobustScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
def get_model(model_name):
    """Build a new, unfitted regressor for the given model name.

    Supported names are 'RandomForest', 'XGBoost', 'CatBoost', 'LightGBM'
    and 'GradientBoost'. Every model is seeded with ``random_state=42`` and
    configured to print training progress.

    Args:
        model_name: Name of the model to construct.

    Returns:
        The freshly constructed estimator instance.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == 'RandomForest':
        return RandomForestRegressor(n_estimators=100, random_state=42, verbose=2)
    if model_name == 'XGBoost':
        # XGBoost's logging parameter is `verbosity` (0-3); `verbose` is not a
        # recognised XGBRegressor parameter.
        return XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, random_state=42, verbosity=2)
    if model_name == 'CatBoost':
        # CatBoost accepts only one of `silent` / `verbose` / `logging_level` /
        # `verbose_eval`; passing both `silent` and `verbose` is rejected, so
        # only `verbose` is kept to match the other models.
        return CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6, random_state=42, verbose=2)
    if model_name == 'LightGBM':
        return LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42, verbose=2)
    if model_name == 'GradientBoost':
        return GradientBoostingRegressor(n_estimators=500, learning_rate=0.3, random_state=42, verbose=2)
    raise ValueError("Unknown model name")

selected_model_name = 'XGBoost'  # Change this to select a different model

# Pipeline of a scaler followed by the selected regressor.
# NOTE(review): the inputs passed to this pipeline below (X_train_scaled /
# X_val_scaled) have already been transformed by `scaler`, so this step
# scales them a second time.
model_pipeline = Pipeline([
    ('scaler', RobustScaler()),  # Or any other scaler you prefer
    ('regressor', get_model(selected_model_name))
])


# Hyperparameter grids, keyed by model name. The 'regressor__' prefix routes
# each parameter to the pipeline's 'regressor' step.
param_grids = {
    'RandomForest': {
        'regressor__n_estimators': [1500, 250, 500],
        'regressor__max_depth': [10, 20, None],
        'regressor__min_samples_split': [2, 5, 10]
    },
    'XGBoost': {
        'regressor__n_estimators': [3000],
        'regressor__learning_rate': [0.1, 0.2],
        'regressor__max_depth': [6, 8]
    },
    'LightGBM': {
        'regressor__n_estimators': [3000],
        'regressor__learning_rate': [0.1, 0.2],
        'regressor__num_leaves': [50, 100]
    },
    'GradientBoost': {
        'regressor__n_estimators': [500, 1500, 1000],
        'regressor__learning_rate': [0.05, 0.01, 0.1],
        'regressor__max_depth': [3, 4, 5]
    },
    'CatBoost': {
        # get_model() builds CatBoost with `iterations` and `depth`. CatBoost
        # treats `n_estimators` / `max_depth` as aliases of those and rejects
        # a parameter that is set through two aliases, so the grid tunes the
        # same names the constructor uses.
        'regressor__iterations': [100, 200, 300],
        'regressor__learning_rate': [0.05, 0.1, 0.2],
        'regressor__depth': [3, 4, 5]
    }
}

# Grid search for hyperparameter tuning
if selected_model_name in param_grids:
    grid_search = GridSearchCV(model_pipeline, param_grids[selected_model_name], cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=2)
    grid_search.fit(X_train_scaled, y_train)
    # With the default refit=True, best_estimator_ is the best configuration
    # refitted on all of the training data passed to fit().
    best_model = grid_search.best_estimator_
    print("Best parameters:", grid_search.best_params_)
else:
    # If no hyperparameter tuning is needed or if the model is not in param_grids
    start_time = time.time()
    model_pipeline.fit(X_train_scaled, y_train)
    end_time = time.time()
    print(f"Training Time: {end_time - start_time} seconds")
    best_model = model_pipeline

# GridSearchCV fits clones, so the original `model_pipeline` object is still
# unfitted and untuned after a search. Point the name at the fitted model so
# that any later use of `model_pipeline` gets the tuned estimator instead of
# re-training the default configuration.
model_pipeline = best_model

# Evaluate the selected model on the validation dataset
val_predictions = best_model.predict(X_val_scaled)
rmse_val = sqrt(mean_squared_error(y_val, val_predictions))
print(f'Validation Root Mean Squared Error: {rmse_val}')

# Prepare submission

file_path2 = 'test.csv'  # Replace with your CSV file path
test_data = pd.read_csv(file_path2)

# Applying the same logarithmic transformations that were applied to the training data
test_data['full_sq_log'] = np.log1p(test_data['full_sq'])
test_data['leisure_count_500_log'] = np.log1p(test_data['leisure_count_500'])
test_data['life_sq_log'] = np.log1p(test_data['life_sq'])
test_data['cafe_count_500_price_high_log'] = np.log1p(test_data['cafe_count_500_price_high'])

# Keep 'row ID' plus exactly the columns (and column order) of the training
# feature matrix X, instead of a second hand-maintained copy of the list.
key_features = ['row ID'] + list(X.columns)
test_data = test_data[key_features]

# Clean the features the same way the training data was cleaned
# (+/-inf -> NaN, then NaN -> 0) so train and test preprocessing agree.
test_features = (
    test_data.drop('row ID', axis=1)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# IMPORTANT: Scale the test data using the same scaler as the training data
X_test_scaled = scaler.transform(test_features)

# Make predictions on the test dataset with the model selected during training
# (`best_model`), not the untuned pipeline definition.
test_predictions = best_model.predict(X_test_scaled)

# Prepare the submission DataFrame using 'row ID' as the identifier
submission = pd.DataFrame({
    'row ID': test_data['row ID'],  # Use 'row ID' as the identifier
    'price_doc': test_predictions
})

# Save the submission file
submission.to_csv('predictions.csv', index=False)

