#### Import Libraries

In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

#### Train Data CSV -> Dataframe

In [2]:
df=pd.read_csv('train.csv')
df.head(2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500


In [3]:
df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [4]:
df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

#### Data Transformation

In [9]:
#Input Independent Features
X=df.drop('SalePrice',axis=1)

In [10]:
#Output Independent Feature
y=df['SalePrice']

#### Split train and test data

In [13]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)

In [16]:
# Select numerical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Select categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols.remove('SalePrice')


In [17]:
## Feature Engineering Automation

#non-categorical Pipeline
num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')), ##missing values
        ('scaler',StandardScaler())## feature scaling 
    ]

)

#categorical Pipeline
cat_pipeline=Pipeline(
                steps=[
                ('imputer',SimpleImputer(strategy='most_frequent')), ## handling Missing values
                ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))
                ]

            )  

In [18]:
preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)

])

In [19]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [37]:
# Define individual regression models
random_forest_regressor = RandomForestRegressor()
extra_trees_regressor = ExtraTreesRegressor()
gradient_boosting_regressor = GradientBoostingRegressor()
ada_boost_regressor = AdaBoostRegressor()
decision_tree_regressor = DecisionTreeRegressor()
svr = SVR()
k_neighbors_regressor = KNeighborsRegressor()
linear_regression = LinearRegression()
ridge_regressor = Ridge()
lasso_regressor = Lasso()
elastic_net_regressor = ElasticNet()
mlp_regressor = MLPRegressor()
gaussian_process_regressor = GaussianProcessRegressor()
xgb_regressor = XGBRegressor()
lgbm_regressor = LGBMRegressor()
catboost_regressor = CatBoostRegressor(verbose=False)

# Dictionary of regression models
models = {
    'Random Forest Regressor': random_forest_regressor,
    'Extra Trees Regressor': extra_trees_regressor,
    'Gradient Boosting Regressor': gradient_boosting_regressor,
    'AdaBoost Regressor': ada_boost_regressor,
    'Decision Tree Regressor': decision_tree_regressor,
    'SVR': svr,
    'KNeighbors Regressor': k_neighbors_regressor,
    'Linear Regression': linear_regression,
    'Ridge Regressor': ridge_regressor,
    'Lasso Regressor': lasso_regressor,
    'ElasticNet Regressor': elastic_net_regressor,
    'XGBoost Regressor': xgb_regressor,
    'LGBM Regressor': lgbm_regressor,
    'CatBoost Regressor': catboost_regressor
}

In [38]:
def evaluate_model(X_train, y_train, X_test, y_test, models):
    best_model_name = None
    best_r2_score = -float('inf')  # Initialize with a very low value
    
    for name, model in models.items():
        # Train the model on the entire training set
        model.fit(X_train, y_train)
        
        # Evaluate the model on the test set using R-squared score
        test_r2_score = model.score(X_test, y_test)
        print(f'{name}: Test R-squared Score = {test_r2_score:.2f}\n')
        
        # Update best model if necessary
        if test_r2_score > best_r2_score:
            best_r2_score = test_r2_score
            best_model_name = name
    
    return best_model_name

best_model = evaluate_model(X_train, y_train, X_test, y_test, models)
print(f'Best Model: {best_model}')


Random Forest Regressor: Test R-squared Score = 0.89

Extra Trees Regressor: Test R-squared Score = 0.90

Gradient Boosting Regressor: Test R-squared Score = 0.90

AdaBoost Regressor: Test R-squared Score = 0.84

Decision Tree Regressor: Test R-squared Score = 0.76

SVR: Test R-squared Score = -0.02

KNeighbors Regressor: Test R-squared Score = 0.81

Linear Regression: Test R-squared Score = 0.89

Ridge Regressor: Test R-squared Score = 0.88



  model = cd_fast.sparse_enet_coordinate_descent(


Lasso Regressor: Test R-squared Score = 0.90

ElasticNet Regressor: Test R-squared Score = 0.83

XGBoost Regressor: Test R-squared Score = 0.88

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3460
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 180
[LightGBM] [Info] Start training from score 181441.541952
LGBM Regressor: Test R-squared Score = 0.88

CatBoost Regressor: Test R-squared Score = 0.90

Best Model: CatBoost Regressor


In [316]:
catboost_regressor = CatBoostRegressor(
    learning_rate=0.07,
    depth=5,
    n_estimators=600,
    subsample=0.7,
    colsample_bylevel=0.7,
    l2_leaf_reg=1.0,
    random_seed=42,
    bootstrap_type='Bernoulli',
    min_data_in_leaf=10,
    od_type='Iter',
    od_wait=20,
    loss_function='RMSE',
    verbose=False
)


In [317]:
best_model=catboost_regressor

In [318]:
# Train (fit) the model
best_model.fit(X_train, y_train)


<catboost.core.CatBoostRegressor at 0x17654fca310>

In [319]:
# Now make predictions
y_pred = best_model.predict(X_test)

In [320]:
# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae:.2f}')

# Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.2f}')

# R-squared (Coefficient of Determination)
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2:.2f}')

Mean Absolute Error: 14873.57
Mean Squared Error: 605565860.15
R-squared: 0.92


#### Test CSV 

In [321]:
test_data = pd.read_csv('test.csv')

In [322]:
X_test_data = test_data

In [323]:
X_test_data=preprocessor.transform(X_test_data)

In [324]:
# Make predictions
test_predictions = best_model.predict(X_test_data)

In [325]:
# Create a new DataFrame or Series to hold the predicted values
predictions_df = pd.DataFrame({'predictions': test_predictions})

In [326]:
final_df=pd.DataFrame()
final_df['Id'] = test_data['Id']
final_df['SalePrice'] = predictions_df ['predictions']

In [327]:
# Save the DataFrame with predictions to a CSV file
final_df.to_csv('test_predictions7.csv', index=False)