In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
import time

In [18]:
# Location of the training CSV; point this at your own copy of the data.
file_path = 'train.csv'
# Parse the CSV into the DataFrame that the following cells transform.
data = pd.read_csv(filepath_or_buffer=file_path)

In [19]:
# Add a log1p-transformed companion column for each listed source column.
# Keys are the new column names, values are the columns they are derived from;
# the raw columns are left untouched.
log_columns = {
    'full_sq_log': 'full_sq',
    'leisure_count_500_log': 'leisure_count_500',
    'life_sq_log': 'life_sq',
    'cafe_count_500_price_high_log': 'cafe_count_500_price_high',
}
for target_col, source_col in log_columns.items():
    data[target_col] = np.log1p(data[source_col])

In [20]:
# Columns fed to the model: the raw features followed by their log1p companions.
# NOTE(review): 'cafe_count_1000_price_high' is listed twice, so the selected
# frame carries that column twice. The submission cell repeats the same list and
# the fitted scaler depends on this exact layout, so the duplicate is kept here;
# remove it from both lists together.
key_features = [
    'full_sq', 'life_sq', 'floor', 'leisure_count_500', 'cafe_count_1000_price_high',
    'mosque_count_500', 'mosque_count_1000', 'cafe_count_500_price_high',
    'cafe_count_1000_price_high', 'cafe_count_500_price_4000', 'culture_objects_top_25_raion',
    'leisure_count_1000', 'trc_sqm_500', 'church_count_500', 'cafe_count_1000_price_1500',
    'mosque_count_1500', 'cafe_count_500', 'big_church_count_500',
    'public_transport_station_min_walk', 'metro_min_walk', 'kindergarten_km', 'preschool_km',
    # log1p-transformed features
    'full_sq_log', 'life_sq_log', 'leisure_count_500_log', 'cafe_count_500_price_high_log',
]

# Keep only the model inputs plus the target column.
selected_columns = [*key_features, 'price_doc']
data = data[selected_columns]

In [21]:
# Replace non-finite values, then fill every missing value with 0.
# np.log1p yields -inf at -1 and NaN below -1, so the *_log columns can contain both.
# The result is reassigned instead of using inplace=True: `data` is a column subset
# of the loaded frame, and in-place replace/fillna on such a subset can trigger
# pandas' SettingWithCopyWarning.
# NOTE(review): `data` still includes 'price_doc' here, so a missing target would
# also become 0 -- confirm the target has no NaNs, or drop those rows instead.
data = data.replace([np.inf, -np.inf], np.nan).fillna(0)

# Separate the model inputs (X) from the target (y).
X = data.drop(columns='price_doc')
y = data['price_doc']

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the robust scaler on the training rows only, then apply the same fitted
# transform to the validation rows.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [26]:
def get_model(model_name):
    """Build an unfitted regressor for the given model name.

    Parameters
    ----------
    model_name : str
        One of 'RandomForest', 'XGBoost', 'CatBoost', 'LightGBM' or 'GradientBoost'.

    Returns
    -------
    estimator
        A newly constructed, unfitted regressor with this notebook's hyperparameters.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    if model_name == 'RandomForest':
        return RandomForestRegressor(n_estimators=100, random_state=42, verbose=2)
    if model_name == 'XGBoost':
        # XGBRegressor's logging parameter is `verbosity`; `verbose` is not a
        # recognised constructor argument.
        return XGBRegressor(objective='reg:squarederror', n_estimators=2000, learning_rate=0.001,
                            random_state=42, verbosity=1)
    if model_name == 'CatBoost':
        # CatBoost accepts only one of `silent` / `verbose` / `logging_level`;
        # passing both `silent=True` and `verbose=2` is rejected. `verbose=2` is
        # kept to match the other models, which all request progress output.
        return CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6, random_state=42, verbose=2)
    if model_name == 'LightGBM':
        return LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42, verbose=2)
    if model_name == 'GradientBoost':
        return GradientBoostingRegressor(n_estimators=1000, learning_rate=0.15, random_state=42, verbose=2)
    raise ValueError("Unknown model name")


In [27]:
selected_model_name = 'GradientBoost'  # Change this to select a different model

# The pipeline is fitted on X_train_scaled, and the validation and submission cells
# pass pre-scaled arrays to predict(), so the inputs have already been through the
# fitted `scaler`. A second RobustScaler inside the pipeline would scale the data
# twice, so the 'scaler' step is a passthrough (the step name is kept).
model_pipeline = Pipeline([
    ('scaler', 'passthrough'),
    ('regressor', get_model(selected_model_name)),
])

# Train the model on the training split and report the elapsed time.
# perf_counter() is a monotonic clock intended for measuring durations.
start_time = time.perf_counter()
model_pipeline.fit(X_train_scaled, y_train)
end_time = time.perf_counter()
training_time = end_time - start_time
print(f"Training Time: {training_time} seconds")

      Iter       Train Loss   Remaining Time 
         1 390598406280143.1875          111.87m
         2 329224337213870.1875          112.12m
         3 284762329054621.3750          108.51m
         4 252275046034571.5938          108.50m
         5 229040940671248.1562          108.54m
         6 211882621550895.1562          108.67m
         7 199313028869024.6250          107.89m
         8 190168482243834.6562          107.86m
         9 183526860660652.8750          106.97m
        10 178625960307092.0312          106.67m
        11 174992754941180.8750          106.30m
        12 172214786251963.8125          105.93m
        13 170190556795743.1875          105.15m
        14 168690514707280.7500          105.43m
        15 167568093780567.2188          105.25m
        16 166617338003000.5000          105.07m
        17 165951234014316.2500          105.00m
        18 165389231615164.2500          104.84m
        19 164968383036556.6250          104.77m
        20 164631147416

In [28]:
# Score the fitted pipeline on the held-out validation split.
val_predictions = model_pipeline.predict(X_val_scaled)

# RMSE is the square root of the mean squared error.
mse_val = mean_squared_error(y_val, val_predictions)
rmse_val = sqrt(mse_val)
print(f'Validation Root Mean Squared Error: {rmse_val}')

Validation Root Mean Squared Error: 13066756.88475668


In [29]:
# Prepare submission

file_path2 = 'test.csv'  # Replace with your CSV file path
test_data = pd.read_csv(file_path2)

# Apply the same log1p transformations that were applied to the training data.
for target_col, source_col in (
    ('full_sq_log', 'full_sq'),
    ('leisure_count_500_log', 'leisure_count_500'),
    ('life_sq_log', 'life_sq'),
    ('cafe_count_500_price_high_log', 'cafe_count_500_price_high'),
):
    test_data[target_col] = np.log1p(test_data[source_col])

# Model features plus the 'row ID' identifier. After dropping 'row ID' the column
# order matches the list used to fit `scaler`, including its repeated
# 'cafe_count_1000_price_high' entry, so the fitted scaler sees the same layout.
key_features = ['row ID', 'full_sq', 'life_sq', 'floor', 'leisure_count_500', 'cafe_count_1000_price_high',
                'mosque_count_500', 'mosque_count_1000', 'cafe_count_500_price_high',
                'cafe_count_1000_price_high', 'cafe_count_500_price_4000', 'culture_objects_top_25_raion',
                'leisure_count_1000', 'trc_sqm_500', 'church_count_500', 'cafe_count_1000_price_1500',
                'mosque_count_1500', 'cafe_count_500', 'big_church_count_500',
                'public_transport_station_min_walk', 'metro_min_walk', 'kindergarten_km', 'preschool_km',
                'full_sq_log', 'life_sq_log', 'leisure_count_500_log', 'cafe_count_500_price_high_log']
test_data = test_data[key_features]

# Clean the feature columns exactly as the training features were cleaned
# (+/-inf -> NaN -> 0). Without this, NaN/inf values would reach the scaler and the
# model. 'row ID' is dropped first so the identifier is never altered.
X_test = (
    test_data.drop('row ID', axis=1)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# IMPORTANT: scale the test data with the scaler fitted on the training data.
X_test_scaled = scaler.transform(X_test)

# Make predictions on the test dataset.
test_predictions = model_pipeline.predict(X_test_scaled)

# Build the submission frame keyed by 'row ID'.
submission = pd.DataFrame({
    'row ID': test_data['row ID'],
    'price_doc': test_predictions
})

# Save the submission file.
submission.to_csv('new-predictions.csv', index=False)

