In [42]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Comment this if the data visualisations doesn't work on your side
%matplotlib inline

In [43]:
train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [44]:
test_df = pd.read_csv('test.csv')
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [45]:
train_df.shape, test_df.shape

((1460, 81), (1459, 80))

In [46]:
combined_df = pd.concat([train_df, test_df], ignore_index=True)
combined_df.shape

(2919, 81)

In [47]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             2919 non-null   int64  
 1   MSSubClass     2919 non-null   int64  
 2   MSZoning       2915 non-null   object 
 3   LotFrontage    2433 non-null   float64
 4   LotArea        2919 non-null   int64  
 5   Street         2919 non-null   object 
 6   Alley          198 non-null    object 
 7   LotShape       2919 non-null   object 
 8   LandContour    2919 non-null   object 
 9   Utilities      2917 non-null   object 
 10  LotConfig      2919 non-null   object 
 11  LandSlope      2919 non-null   object 
 12  Neighborhood   2919 non-null   object 
 13  Condition1     2919 non-null   object 
 14  Condition2     2919 non-null   object 
 15  BldgType       2919 non-null   object 
 16  HouseStyle     2919 non-null   object 
 17  OverallQual    2919 non-null   int64  
 18  OverallC

In [48]:
combined_df['SalePrice'].head()

0    208500.0
1    181500.0
2    223500.0
3    140000.0
4    250000.0
Name: SalePrice, dtype: float64

In [49]:
combined_df['SalePrice'].tail()

2914   NaN
2915   NaN
2916   NaN
2917   NaN
2918   NaN
Name: SalePrice, dtype: float64

In [50]:
combined_df = combined_df.drop('Id', axis=1)

In [51]:
combined_df.isna().sum().sum()

7829

In [52]:
na_counts = combined_df.isna().sum()
na_counts = na_counts[na_counts>600]
na_counts

Alley          1369
MasVnrType      872
FireplaceQu     690
PoolQC         1453
Fence          1179
MiscFeature    1406
dtype: int64

In [53]:
combined_df.drop(columns=['Alley','MasVnrType','FireplaceQu','PoolQC','Fence','MiscFeature'], axis=1, inplace=True)


In [54]:
na_counts = combined_df.isna().sum()
na_counts = na_counts[na_counts>0]
na_types = combined_df.dtypes[na_counts.index]
na_info = pd.DataFrame({
    'NaN Count': na_counts,
    'Data Type': na_types
})
na_info

Unnamed: 0,NaN Count,Data Type
LotFrontage,259,float64
MasVnrArea,8,float64
BsmtQual,37,object
BsmtCond,37,object
BsmtExposure,38,object
BsmtFinType1,37,object
BsmtFinType2,38,object
Electrical,1,object
GarageType,81,object
GarageYrBlt,81,float64


In [55]:
combined_df['LotFrontage'].fillna(combined_df['LotFrontage'].mean(), inplace=True)
combined_df['MasVnrArea'].fillna(combined_df['MasVnrArea'].mean(), inplace=True)
combined_df['GarageYrBlt'].fillna(combined_df['GarageYrBlt'].mean(), inplace=True)

mode = combined_df['Electrical'].mode().iloc[0]
combined_df['Electrical'] = combined_df['Electrical'].fillna(mode)

na_counts = combined_df.isna().sum()
na_counts = na_counts[na_counts>0]
na_types = combined_df.dtypes[na_counts.index]
na_info = pd.DataFrame({
    'NaN Count': na_counts,
    'Data Type': na_types
})
na_info

Unnamed: 0,NaN Count,Data Type
BsmtQual,37,object
BsmtCond,37,object
BsmtExposure,38,object
BsmtFinType1,37,object
BsmtFinType2,38,object
GarageType,81,object
GarageFinish,81,object
GarageQual,81,object
GarageCond,81,object


In [58]:
# in case of no garage and no basement
combined_df.fillna('NA', inplace=True)
combined_df.isna().sum().sum()

0

In [59]:
categorical_features = combined_df.select_dtypes(include=['object']).columns
categorical_features

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [60]:
# Apply one-hot encoding to categorical features
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_features = encoder.fit_transform(combined_df[categorical_features])
encoded_feature_names = encoder.get_feature_names_out(categorical_features)
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names)

# Concatenate the encoded features with the original DataFrame
combined_df = pd.concat([combined_df.drop(columns=categorical_features), encoded_df], axis=1)
combined_df.shape



(1460, 276)

Index(['MSZoning', 'LotFrontage', 'Street', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
       'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'],
      dtype='object')



(1460, 276)

{'Exterior1st_NA',
 'Exterior2nd_NA',
 'Functional_NA',
 'KitchenQual_NA',
 'MSZoning_NA',
 'SaleType_NA',
 'Utilities_NA'}

In [242]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.drop(columns=['SalePrice']))

In [243]:
scaler1 = StandardScaler()
scaled_features1 = scaler1.fit_transform(df1)

In [244]:
y = df['SalePrice'].values
x = scaled_features

In [245]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 1)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1168, 275), (1168,), (292, 275), (292,))

In [246]:
# Linear Regression: LR, 
# Ridge Regression: RR,
# Lasso Regression: LASSO,
# Decision Tree Regression: DTR,
# Random Forest Regression:RFR

# Mean Squared Error : MSE,
# Mean Absolute Error: MAE,
# Root Mean Squared Error: RMSE,
# R-squared:R2

models = {
    "LR": LinearRegression(),
    "RR": Ridge(),
    "LASSO": Lasso(),
    "DTR": DecisionTreeRegressor(),
    "RFR": RandomForestRegressor()
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)
    results[name] = {"MSE": mse, "MAE": mae, "RMSE": rmse, "R2": r2}

# Convert results to DataFrame for easy comparison
results_df = pd.DataFrame(results)
print(results_df)

  model = cd_fast.enet_coordinate_descent(


                LR            RR         LASSO           DTR           RFR
MSE   1.041890e+31  8.801746e+08  8.691301e+08  2.086741e+09  7.188822e+08
MAE   3.726469e+14  1.840985e+04  1.835652e+04  2.700481e+04  1.662797e+04
RMSE  3.227832e+15  2.966774e+04  2.948101e+04  4.568086e+04  2.681198e+04
R2   -1.460878e+21  8.765870e-01  8.781356e-01  7.074092e-01  8.992025e-01


In [247]:
df = pd.DataFrame({
    'y_test': y_test,
    'y_pred': y_pred
})
df

Unnamed: 0,y_test,y_pred
0,231500,211905.00
1,179500,160761.77
2,122000,119198.00
3,84500,78088.22
4,142000,152965.37
...,...,...
287,103200,99293.54
288,249700,246206.66
289,64500,106611.71
290,83000,97223.26


In [248]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

result = {}
hyperparams = {}
result['default'] = {"MSE": mse, "MAE": mae, "RMSE": rmse, "R2": r2}
hyperparams['default'] = model.get_params()

In [177]:
# model = RandomForestRegressor(n_estimators=90, 
#                               max_depth=2,
#                               min_samples_split=,
#                               min_samples_leaf=,
#                               max_features=
#                              )
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# mae = mean_absolute_error(y_test, y_pred)
# rmse = mean_squared_error(y_test, y_pred, squared=False)
# r2 = r2_score(y_test, y_pred)
# result['finetuned'] = {"MSE": mse, "MAE": mae, "RMSE": rmse, "R2": r2}
# results_df = pd.DataFrame(result)
# print(results_df)

SyntaxError: invalid syntax (3374857756.py, line 3)

In [181]:
from sklearn.model_selection import GridSearchCV

rfr = RandomForestRegressor()

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Set up the GridSearchCV with cross-validation
grid_search = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit GridSearchCV to your training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best hyperparameters:", best_params)


405 fits failed out of a total of 1215.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
405 fits failed with the following error:
Traceback (most recent call last):
  File "/home/akshirsagar/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/akshirsagar/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/home/akshirsagar/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/home/akshirsagar/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, 

Best hyperparameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 150}


In [249]:
model1 = RandomForestRegressor(**best_params)
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)
result['finetuned'] = {"MSE": mse, "MAE": mae, "RMSE": rmse, "R2": r2}
results_df = pd.DataFrame(result)
print(results_df)

           default     finetuned
MSE   7.362533e+08  1.055408e+09
MAE   1.666310e+04  1.745617e+04
RMSE  2.713399e+04  3.248704e+04
R2    8.967668e-01  8.520168e-01


In [250]:
hyperparams['finetuned'] = model1.get_params()
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df

Unnamed: 0,default,finetuned
bootstrap,True,True
ccp_alpha,0.0,0.0
criterion,squared_error,squared_error
max_depth,,
max_features,1.0,sqrt
max_leaf_nodes,,
max_samples,,
min_impurity_decrease,0.0,0.0
min_samples_leaf,1,1
min_samples_split,2,4


In [251]:
test_predictions = model.predict(scaled_features1)
test_predictions

ValueError: X has 266 features, but RandomForestRegressor is expecting 275 features as input.