In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
import numpy as np


In [2]:
# Read the training set from the working directory; `df` is the raw frame
# that the feature/target split is derived from.
df = pd.read_csv("train.csv")

In [3]:
# Show the first five rows as a quick sanity check of the loaded frame.
df.head(5)

Unnamed: 0,full_sq,life_sq,floor,product_type,sub_area,area_m,raion_popul,green_zone_part,indust_part,children_preschool,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,43.0,27.0,4.0,Investment,Bibirevo,6407578.1,155572.0,0.189727,7e-05,9576.0,...,9.0,4.0,0.0,13.0,22.0,1.0,0.0,52.0,4.0,5850000.0
1,34.0,19.0,3.0,Investment,Nagatinskij Zaton,9589336.912,115352.0,0.372602,0.049637,6880.0,...,15.0,3.0,0.0,15.0,29.0,1.0,10.0,66.0,14.0,6000000.0
2,43.0,29.0,2.0,Investment,Tekstil'shhiki,4808269.831,101708.0,0.11256,0.118537,5879.0,...,10.0,3.0,0.0,11.0,27.0,0.0,4.0,67.0,10.0,5700000.0
3,77.0,77.0,4.0,Investment,Basmannoe,8398460.622,108171.0,0.015234,0.037316,5706.0,...,319.0,108.0,17.0,135.0,236.0,2.0,91.0,195.0,14.0,16331452.0
4,67.0,46.0,14.0,Investment,Nizhegorodskoe,7506452.02,43795.0,0.00767,0.486246,2418.0,...,62.0,14.0,1.0,53.0,78.0,1.0,20.0,113.0,17.0,9100000.0


In [4]:
# Separate the regression target ('price_doc') from the predictor columns.
y = df['price_doc']
X = df.drop(columns='price_doc')

# Partition the predictors by dtype: object columns are treated as
# categorical, every other column as numeric.
categorical_features = X.select_dtypes(include='object').columns
numeric_features = X.select_dtypes(exclude='object').columns

# Column-wise preprocessing: standardise the numeric columns and one-hot
# encode the categorical ones. handle_unknown='ignore' lets transform accept
# categories that were not seen during fit instead of raising.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [5]:
# Assemble the end-to-end model. Steps run in this order:
#   1. preprocessor      - scale numeric / one-hot encode categorical columns
#   2. poly_features     - interaction-only polynomial expansion (degree is
#                          not set here, so it stays at the library default
#                          unless overridden when the pipeline is tuned)
#   3. feature_selection - SelectFromModel driven by a default Ridge fit
#   4. regressor         - final Ridge regression
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('poly_features', PolynomialFeatures(interaction_only=True)),
    ('feature_selection', SelectFromModel(Ridge())),
    ('regressor', Ridge()),
]
regression_pipeline = Pipeline(pipeline_steps)


In [6]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
# Hyperparameter grid, keyed by '<pipeline step>__<parameter>'.
parameter_grid = dict(
    poly_features__degree=[1],                  # degree 1 only: no interaction terms
    regressor__alpha=[1, 10, 100],              # regularisation strength of the final Ridge
    feature_selection__threshold=[1e-5, 1e-4],  # importance cut-off for SelectFromModel
)

# Exhaustive search over the grid with 3-fold cross-validation, scored by
# negative mean squared error. error_score='raise' makes a failing fit raise
# immediately instead of being recorded as a failed score.
grid_search = GridSearchCV(
    estimator=regression_pipeline,
    param_grid=parameter_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
    error_score='raise',
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] END feature_selection__threshold=1e-05, poly_features__degree=1, regressor__alpha=1; total time=  22.3s
[CV] END feature_selection__threshold=1e-05, poly_features__degree=1, regressor__alpha=1; total time=  25.7s
[CV] END feature_selection__threshold=1e-05, poly_features__degree=1, regressor__alpha=1; total time=  22.8s
[CV] END feature_selection__threshold=1e-05, poly_features__degree=1, regressor__alpha=10; total time=  18.7s
[CV] END feature_selection__threshold=1e-05, poly_features__degree=1, regressor__alpha=10; total time=  21.0s
[CV] END feature_selection__threshold=1e-05, poly_features__degree=1, regressor__alpha=10; total time=  19.2s
[CV] END feature_selection__threshold=1e-05, poly_features__degree=1, regressor__alpha=100; total time=  16.1s
[CV] END feature_selection__threshold=1e-05, poly_features__degree=1, regressor__alpha=100; total time=  18.0s
[CV] END feature_selection__threshold=1e-05, poly_features__d

In [8]:
# Report how many features survived the selection step of the best pipeline.
# Requires `grid_search` to have been fitted already (default refit behaviour,
# so that `best_estimator_` is available).

# Best pipeline found by the search
best_model = grid_search.best_estimator_

# NOTE: despite its name, this is the fitted SelectFromModel step itself, not
# a list of features; the name is kept so any other cell using it still works.
selected_features = best_model.named_steps['feature_selection']

# Boolean mask over the selector's input features (True = feature kept)
feature_mask = selected_features.get_support()

# Count the kept features with a vectorised NumPy reduction rather than the
# builtin sum(), which walks the array one element at a time in Python.
num_selected_features = int(feature_mask.sum())

print(f"Number of features used: {num_selected_features}")


Number of features used: 2215


In [9]:
# Pull the fitted final estimator out of the best pipeline by its step name.
ridge_regressor = best_model['regressor']

# Learned parameters of the Ridge model: the weight vector and the intercept.
intercept = ridge_regressor.intercept_
coefficients = ridge_regressor.coef_

print(
    f"Model Intercept: {intercept}",
    f"Model Coefficients: {coefficients}",
    sep="\n",
)



Model Intercept: 17468237.252286386
Model Coefficients: [ 4.65892664e-05  7.69648515e+05  1.55890938e+05 ... -9.80613820e+05
  3.72739702e+05  2.13552571e+05]


In [10]:
# Score the refit best model on the held-out validation split.
y_pred = grid_search.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))

# Hyperparameter combination that achieved the best cross-validated score.
best_params = grid_search.best_params_

print(
    f"Best Parameters: {best_params}",
    f"RMSE: {rmse}",
    sep="\n",
)


Best Parameters: {'feature_selection__threshold': 1e-05, 'poly_features__degree': 1, 'regressor__alpha': 100}
RMSE: 13269645.987827703


In [None]:
# Load the test set
test_df = pd.read_csv('test.csv')

# Predict with the fitted search object, the same way the validation split is
# scored. The refit best pipeline applies every step in order (preprocessing,
# polynomial expansion, feature selection, Ridge), so the steps do not have to
# be replayed by hand and cannot drift out of sync with the pipeline
# definition if a step is added, removed or renamed.
y_test_pred = grid_search.predict(test_df)



In [None]:
# Build the submission frame: the test set's 'row ID' column paired with the
# predicted 'price_doc' value for each row.
submission_df = test_df[['row ID']].assign(price_doc=y_test_pred)



In [None]:
# Write the submission to disk, leaving out the DataFrame index column.
submission_df.to_csv(path_or_buf='poly-with-submission.csv', index=False)
