In [1]:
import pandas as pd 

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [2]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

## Data Clean

In [3]:
missing_data = train.isnull().sum()
missing_data = missing_data[missing_data > 0]
missing_data.sort_values(inplace=True)
print(missing_data)

Electrical         1
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
BsmtExposure      38
BsmtFinType2      38
GarageCond        81
GarageQual        81
GarageFinish      81
GarageYrBlt       81
GarageType        81
LotFrontage      259
FireplaceQu      690
MasVnrType       872
Fence           1179
Alley           1369
MiscFeature     1406
PoolQC          1453
dtype: int64


In [4]:
cols_to_drop = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [6]:
corr_matrix = train.corr(numeric_only=True)
corr_matrix['SalePrice'].sort_values(ascending=False)

SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
Id              -0.021917
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePr

In [7]:
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import TransformedTargetRegressor
import numpy as np

housing = train.drop(["SalePrice", "Id"], axis=1)
housing_labels = train["SalePrice"].copy()

skew_limit = 0.75

num_attribs = housing.select_dtypes(include=['int64', 'float64']).columns.tolist()

skewed_feats = housing[num_attribs].skew().sort_values(ascending=False)

high_skew_features = skewed_feats[skewed_feats > skew_limit].index

num_attribs = [f for f in num_attribs if f not in high_skew_features]

if "MSSubClass" in num_attribs: num_attribs.remove("MSSubClass")
if "MSSubClass" in high_skew_features: high_skew_features = high_skew_features.drop("MSSubClass")



In [8]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso


one_hot_attribs = [
    "MSZoning", "Street", "LandContour", "LotConfig", "Neighborhood", 
    "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", 
    "RoofMatl", "Exterior1st", "Exterior2nd", "Foundation", 
    "Heating", "CentralAir", "Electrical", "GarageType", "SaleType", "SaleCondition"
]

one_hot_attribs.append("MSSubClass")

ordinal_attribs = [
    "LotShape", "Utilities", "LandSlope", "ExterQual", "ExterCond", "BsmtQual", 
    "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC", 
    "KitchenQual", "Functional", "GarageFinish", "GarageQual", 
    "GarageCond", "PavedDrive"
]

none_cat_attribs = [
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond", "MasVnrType"
]


freq_cat_attribs = [col for col in one_hot_attribs + ordinal_attribs if col not in none_cat_attribs]


NoneImputerPipeline = Pipeline([
    ("imputer", SimpleImputer(strategy='constant', fill_value='None')),
    ("ordinal", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])


StandardCatPipeline = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])


num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy='median')),
    ("std_scaler", StandardScaler())
])

log_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy='median')),
    ("log", FunctionTransformer(np.log1p, feature_names_out="one-to-one")),
    ("std_scaler", StandardScaler())
])

preprocessing = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("log", log_pipeline, high_skew_features),
    ("none_cats", NoneImputerPipeline, none_cat_attribs),
    ("std_cats", StandardCatPipeline, freq_cat_attribs)
])


# Select and Train a Model

In [9]:
from scipy.stats import reciprocal, uniform, randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression


full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("feature_selection", SelectFromModel(Lasso(alpha=0.001, random_state=42))),
    ("model", LinearRegression()) # placeholder
])

final_model = TransformedTargetRegressor(
    regressor=full_pipeline, 
    func=np.log1p, 
    inverse_func=np.expm1
)

param_dist = [
    {
        'regressor__model': [SVR()],
        'regressor__model__C': reciprocal(1, 200000), 
        'regressor__model__epsilon': uniform(0.01, 1),
        'regressor__model__kernel': ['rbf'], # linear is usually too slow here
        'regressor__feature_selection__estimator__alpha': uniform(0.0001, 0.005) 
    },
    
    {
        'regressor__model': [RandomForestRegressor(random_state=42)],
        'regressor__model__n_estimators': randint(100, 300),
        'regressor__model__max_features': randint(10, 40),
        'regressor__feature_selection__estimator__alpha': uniform(0.00001, 0.002) 
    }
]

search = RandomizedSearchCV(
    final_model, 
    param_distributions=param_dist,
    n_iter=20, 
    cv=5, 
    scoring='neg_mean_squared_log_error',
    random_state=42,
    verbose=1 # to see progress
)

search.fit(housing, housing_labels)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


0,1,2
,estimator,TransformedTa...gression())]))
,param_distributions,"[{'regressor__feature_selection__estimator__alpha': <scipy.stats....00264DA909310>, 'regressor__model': [SVR()], 'regressor__model__C': <scipy.stats....00264DA954980>, 'regressor__model__epsilon': <scipy.stats....00264DA909090>, ...}, {'regressor__feature_selection__estimator__alpha': <scipy.stats....00264DA8D1810>, 'regressor__model': [RandomForestR...ndom_state=42)], 'regressor__model__max_features': <scipy.stats....00264DA9096D0>, 'regressor__model__n_estimators': <scipy.stats....00264DA954D70>}]"
,n_iter,20
,scoring,'neg_mean_squared_log_error'
,n_jobs,
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,transformers,"[('num', ...), ('log', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<ufunc 'log1p'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'None'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,estimator,Lasso(alpha=n...ndom_state=42)
,threshold,
,prefit,False
,norm_order,1
,max_features,
,importance_getter,'auto'

0,1,2
,alpha,np.float64(0....0932022121826)
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'

0,1,2
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,np.float64(6.713231619759599)
,epsilon,np.float64(0....8361216819946)
,shrinking,True
,cache_size,200
,verbose,False


In [10]:
search.best_params_

{'regressor__feature_selection__estimator__alpha': np.float64(0.0008800932022121826),
 'regressor__model': SVR(),
 'regressor__model__C': np.float64(6.713231619759599),
 'regressor__model__epsilon': np.float64(0.06808361216819946),
 'regressor__model__kernel': 'rbf'}

In [11]:
search.best_estimator_

0,1,2
,regressor,Pipeline(step...216819946)))])
,transformer,
,func,<ufunc 'log1p'>
,inverse_func,<ufunc 'expm1'>
,check_inverse,True

0,1,2
,transformers,"[('num', ...), ('log', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<ufunc 'log1p'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'None'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,estimator,Lasso(alpha=n...ndom_state=42)
,threshold,
,prefit,False
,norm_order,1
,max_features,
,importance_getter,'auto'

0,1,2
,alpha,np.float64(0....0932022121826)
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'

0,1,2
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,np.float64(6.713231619759599)
,epsilon,np.float64(0....8361216819946)
,shrinking,True
,cache_size,200
,verbose,False


In [12]:
results_df = pd.DataFrame(search.cv_results_)

# Sort by score (highest is better because it's negative error)
results_df = results_df.sort_values(by="mean_test_score", ascending=False)

# Show the top 5 rows with relevant columns
cols = [
    "mean_test_score", 
    "param_regressor__model", 
    "param_regressor__feature_selection__estimator__alpha"
]
print(results_df[cols].head(5))

    mean_test_score                  param_regressor__model  \
1         -0.017345                                   SVR()   
17        -0.018886  RandomForestRegressor(random_state=42)   
16        -0.019034  RandomForestRegressor(random_state=42)   
19        -0.019041  RandomForestRegressor(random_state=42)   
11        -0.019114  RandomForestRegressor(random_state=42)   

    param_regressor__feature_selection__estimator__alpha  
1                                            0.000880     
17                                           0.001560     
16                                           0.001145     
19                                           0.001151     
11                                           0.000619     


In [13]:
# 1. Get the Best Model (The Wrapper)
best_model_wrapper = search.best_estimator_

# 2. Get the Internal Pipeline (Inside the Wrapper)
best_pipeline = best_model_wrapper.regressor_

# 3. Get the Feature Names (From Preprocessing)
# These are the 200+ columns created by OneHotEncoder/StandardScaler
preprocessor = best_pipeline.named_steps['preprocessing']
feature_names = preprocessor.get_feature_names_out()

# 4. Get the Boolean Mask (From Lasso)
# True = Kept, False = Dropped
selector = best_pipeline.named_steps['feature_selection']

# Handle "passthrough" case (if Random Forest won and skipped Lasso)
if selector == 'passthrough':
    selected_features = feature_names
    print(f"All {len(selected_features)} features were kept (Selector was skipped).")
else:
    mask = selector.get_support()
    selected_features = feature_names[mask]
    
    print(f"Lasso kept {len(selected_features)} features out of {len(feature_names)}.")
    
    # 5. Show the kept features
    print("\n--- Top 10 Selected Features ---")
    print(selected_features[:10])

Lasso kept 87 features out of 247.

--- Top 10 Selected Features ---
['num__OverallQual' 'num__OverallCond' 'num__YearBuilt'
 'num__YearRemodAdd' 'num__BsmtFullBath' 'num__FullBath' 'num__HalfBath'
 'num__BedroomAbvGr' 'num__TotRmsAbvGrd' 'num__Fireplaces']


## Evaluate on the Test Set

In [14]:
final_predictions = search.best_estimator_.predict(test)

submission = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": final_predictions
})

submission.to_csv("submission.csv", index=False)
print("submission.csv saved successfully!")

submission.csv saved successfully!
