In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [3]:
# Read the labelled training set from disk into a DataFrame
df = pd.read_csv("train.csv")

In [4]:
# Separate the regression target from the predictor columns
y = df['price_doc']
X = df.drop(columns='price_doc')

# Partition predictors by dtype: object columns are treated as categorical,
# every other column as numeric
categorical_features = X.select_dtypes(include='object').columns
numeric_features = X.select_dtypes(exclude='object').columns

# Column-wise preprocessing: standardise numeric columns and one-hot encode
# categorical ones (categories unseen during fit are ignored at transform time)
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Hold out 20% of the rows for validation; the seed is fixed for reproducibility
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Ridge regression on the preprocessed features (no polynomial expansion).
# The preprocessor is a pipeline step, so it is fitted together with the
# regressor on whatever data the pipeline is fitted on.
simple_regression_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge())
])

# Candidate regularisation strengths: 20 values log-spaced from 1e-4 to 1e4
parameter_grid = {
    'regressor__alpha': np.logspace(-4, 4, 20)
}

# 5-fold grid search scored by negative MSE (higher is better).
# The 20 x 5 = 100 fits are independent of each other, so n_jobs=-1 spreads
# them over all available CPU cores instead of running them one after another;
# the scores and the selected alpha are unaffected.
grid_search = GridSearchCV(
    simple_regression_pipeline,
    parameter_grid,
    cv=KFold(n_splits=5),
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)



Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END ............................regressor__alpha=0.0001; total time=  16.9s
[CV] END ............................regressor__alpha=0.0001; total time=  15.1s
[CV] END ............................regressor__alpha=0.0001; total time=  16.7s
[CV] END ............................regressor__alpha=0.0001; total time=  14.8s
[CV] END ............................regressor__alpha=0.0001; total time=  16.1s
[CV] END ............regressor__alpha=0.00026366508987303583; total time=  14.8s
[CV] END ............regressor__alpha=0.00026366508987303583; total time=  15.3s
[CV] END ............regressor__alpha=0.00026366508987303583; total time=  16.7s
[CV] END ............regressor__alpha=0.00026366508987303583; total time=  16.8s
[CV] END ............regressor__alpha=0.00026366508987303583; total time=  13.5s
[CV] END .............regressor__alpha=0.0006951927961775605; total time=  16.3s
[CV] END .............regressor__alpha=0.000695

In [7]:
# Score the tuned model on the held-out validation split.
# RMSE is reported in the same units as the target.
y_pred_simple = grid_search.predict(X_val)
mse_simple = mean_squared_error(y_val, y_pred_simple)
rmse_simple = np.sqrt(mse_simple)

print(f"RMSE (Simplified Model): {rmse_simple}")

RMSE (Simplified Model): 13266036.69727832


In [8]:
# Read the unlabelled test set
test_df = pd.read_csv('test.csv')

# Drop the 'row ID' identifier column before predicting
X_test = test_df.drop(columns='row ID')

# Predict with the fitted grid search; the pipeline applies its own
# preprocessing to the raw test columns
y_test_pred = grid_search.predict(X_test)

# Submission frame: the row identifier alongside the predicted price
submission = test_df[['row ID']].assign(price_doc=y_test_pred)

# Write the submission file without the DataFrame index
submission.to_csv('poly-without-submission.csv', index=False)



In [9]:
# Count the columns produced by the fitted preprocessor of the best pipeline
fitted_preprocessor = grid_search.best_estimator_.named_steps['preprocessor']
X_train_transformed = fitted_preprocessor.transform(X_train)
_, num_features = X_train_transformed.shape
print(f"Number of features used in the final model: {num_features}")



Number of features used in the final model: 2214


In [10]:
# Pull the fitted Ridge step out of the best pipeline and read its parameters
ridge_model = grid_search.best_estimator_.named_steps['regressor']
coefficients, intercept = ridge_model.coef_, ridge_model.intercept_

print(f"Model Intercept: {intercept}")
print(f"Model Coefficients: {coefficients}")


Model Intercept: 17576491.633222368
Model Coefficients: [ 744147.82656113  140981.31683924  556139.52059727 ... -990026.6103886
  372477.40338977  160830.51697355]
