In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# DataFrame'i içeri aktar
housing_df = pd.read_csv("ames_housing.csv")

In [5]:
# Eksik verisi olan sütunların ve eksik veri sayısının tespiti
print(housing_df.isna().sum()[housing_df.isna().sum() > 0].sort_values(ascending=False))

Pool QC           2917
Misc Feature      2824
Alley             2732
Fence             2358
Mas Vnr Type      1775
Fireplace Qu      1422
Lot Frontage       490
Garage Cond        159
Garage Qual        159
Garage Finish      159
Garage Yr Blt      159
Garage Type        157
Bsmt Exposure       83
BsmtFin Type 2      81
Bsmt Cond           80
Bsmt Qual           80
BsmtFin Type 1      80
Mas Vnr Area        23
Bsmt Half Bath       2
Bsmt Full Bath       2
BsmtFin SF 1         1
Garage Cars          1
Garage Area          1
Total Bsmt SF        1
Bsmt Unf SF          1
BsmtFin SF 2         1
Electrical           1
dtype: int64


In [6]:
# Eksik verisi olan sütunların veri tipleri
missing_cols = ['Pool QC', 'Misc Feature', 'Alley', 'Fence', 'Mas Vnr Type', 'Fireplace Qu',
                'Lot Frontage', 'Garage Yr Blt', 'Garage Cond', 'Garage Qual', 'Garage Finish', 'Garage Type',
                'Bsmt Exposure', 'BsmtFin Type 2', 'Bsmt Qual', 'BsmtFin Type 1', 'Bsmt Cond',
                'Mas Vnr Area', 'Bsmt Half Bath', 'Bsmt Full Bath', 'Electrical', 'Garage Cars',
                'Garage Area', 'Total Bsmt SF', 'Bsmt Unf SF', 'BsmtFin SF 2', 'BsmtFin SF 1']

print(housing_df[missing_cols].dtypes)


Pool QC            object
Misc Feature       object
Alley              object
Fence              object
Mas Vnr Type       object
Fireplace Qu       object
Lot Frontage      float64
Garage Yr Blt     float64
Garage Cond        object
Garage Qual        object
Garage Finish      object
Garage Type        object
Bsmt Exposure      object
BsmtFin Type 2     object
Bsmt Qual          object
BsmtFin Type 1     object
Bsmt Cond          object
Mas Vnr Area      float64
Bsmt Half Bath    float64
Bsmt Full Bath    float64
Electrical         object
Garage Cars       float64
Garage Area       float64
Total Bsmt SF     float64
Bsmt Unf SF       float64
BsmtFin SF 2      float64
BsmtFin SF 1      float64
dtype: object


In [7]:
# Eksik verisi olan sütunları numerical ve categorical olarak ayır
categorical_missing_cols = ['Pool QC', 'Misc Feature', 'Alley', 'Fence', 'Mas Vnr Type', 'Fireplace Qu',
                    'Garage Cond', 'Garage Qual', 'Garage Finish', 'Garage Type', 'Bsmt Exposure',
                    'BsmtFin Type 2', 'Bsmt Qual', 'BsmtFin Type 1', 'Bsmt Cond', 'Electrical']

numerical_missing_cols = ['Lot Frontage', 'Garage Yr Blt', 'Mas Vnr Area', 'Bsmt Half Bath', 'Bsmt Full Bath',
                  'Garage Cars', 'Garage Area', 'Total Bsmt SF', 'Bsmt Unf SF', 'BsmtFin SF 2', 'BsmtFin SF 1']

In [8]:
# Categorical sütunlardaki eksik verileri yani o özellik bulunmayan evlere 'None' de
for col in categorical_missing_cols:
    housing_df[col] = housing_df[col].fillna('None')

In [9]:
# Numerical sütunlarda o özellik olmayan evlere 0 de ve modelin özellik olup olmadığını öğrenmesi için flag sütunu ekle
for col in numerical_missing_cols:
    housing_df[col + '_missing'] = housing_df[col].isnull().astype(int)
    housing_df[col] = housing_df[col].fillna(0)

In [10]:
# Eksik verisi olan sütunların ve eksik veri sayısının tekrar kontrolü
print(housing_df.isna().sum()[housing_df.isna().sum() > 0].sort_values(ascending=False))

Series([], dtype: int64)


In [11]:
print(housing_df.dtypes.value_counts())

object     43
int64      39
float64    11
Name: count, dtype: int64


#### Split the data

In [12]:
from sklearn.model_selection import train_test_split

np.random.seed(42)

X = housing_df.drop("SalePrice", axis=1)
y = housing_df["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#### OneHotEncoder ile categorical sütunları sayısal değerlere çevirme (Encoding)

In [13]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
# Categorical sütunları bul
categorical_cols = X_train.select_dtypes(include=['object']).columns
categorical_cols

Index(['MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour',
       'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC',
       'Central Air', 'Electrical', 'Kitchen Qual', 'Functional',
       'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',
       'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature',
       'Sale Type', 'Sale Condition'],
      dtype='object')

In [15]:
# Build OneHotEncoder and fit the categorical columns
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(X_train[categorical_cols]);

In [16]:
# Hem train hem de test'teki kategorik sütunları dönüştür
X_train_ohe = pd.DataFrame(ohe.transform(X_train[categorical_cols]),
                                         columns=ohe.get_feature_names_out(categorical_cols),
                                         index=X_train.index)
X_test_ohe = pd.DataFrame(ohe.transform(X_test[categorical_cols]),
                          columns=ohe.get_feature_names_out(categorical_cols),
                          index=X_test.index)


In [17]:
# Numerical sütunları al
numerical_cols = X_train.drop(columns=categorical_cols).columns
X_train_numerical = X_train[numerical_cols]
X_test_numerical = X_test[numerical_cols]

In [18]:
# Numerical ve categorical sütunları birleştir
X_train_final = pd.concat([X_train_numerical, X_train_ohe], axis=1)
X_test_final = pd.concat([X_test_numerical, X_test_ohe], axis=1)

#### Model and Predictions

In [19]:
def evaluate_preds(model, X_train, X_test, y_train, y_test, y_pred):
    from sklearn.metrics import mean_squared_error
    score_train = model.score(X_train, y_train)
    score_test = model.score(X_test, y_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = float(np.sqrt(mse))
    print(f"Training score: {score_train}")
    print(f"Test score: {score_test}")
    print(f"Mean squared error: {mse}") 
    print(f"Root mean squared error: {rmse}")
    return score_train, score_test, mse, rmse

In [20]:
from sklearn.linear_model import LinearRegression

linear_regression = LinearRegression()

linear_regression.fit(X_train_final, y_train)

y_pred_lr = linear_regression.predict(X_test_final)
evaluate_preds(linear_regression, X_train_final, X_test_final, y_train, y_test, y_pred_lr)

Training score: 0.9406592987740783
Test score: 0.8941375726659082
Mean squared error: 848757085.2333205
Root mean squared error: 29133.435863854447


(0.9406592987740783, 0.8941375726659082, 848757085.2333205, 29133.435863854447)

#### Model Improving

In [21]:
# Ridge Regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge = Ridge()

param_grid = {'alpha': [0.01, 0.1, 1, 10, 50, 100, 200]}

grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(X_train_final, y_train)

print(f"Best params: {grid_search.best_params_}")
best_ridge = grid_search.best_estimator_

y_pred_ridge = best_ridge.predict(X_test_final)
evaluate_preds(best_ridge, X_train_final, X_test_final, y_train, y_test, y_pred_ridge)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ............................................alpha=1; total time=   0.0s
[CV] END ............................................alpha=1; total time=   0.0s
[CV] END ............................................alpha=1; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END ............................................alpha=1; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END ............................................alpha=1; total time=   0.0s
[CV] END ...........................................alpha=10; total time=   0.0s
[CV] END ...........................................alpha=10; total time=   0.0s
[CV] END ...........................................alpha=10; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END ...........................................alpha=10; total time=   0.0s
[CV] END ...........................................alpha=10; total time=   0.0s
[CV] END ...........................................alpha=50; total time=   0.0s
[CV] END ...........................................alpha=50; total time=   0.0s
[CV] END ...........................................alpha=50; total time=   0.0s
[CV] END ...........................................alpha=50; total time=   0.0s
[CV] END ...................

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


(0.9121080852471929, 0.8949228422343587, 842461148.8295858, 29025.18128848786)

In [22]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

models = {
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42)
}

param_grids = {
    'Ridge': {'alpha': [0.1, 1, 10, 100]},
    'Lasso': {'alpha': [0.001, 0.01, 0.1, 1]},
    'ElasticNet': {'alpha': [0.001, 0.01, 0.1, 1], 'l1_ratio': [0.1, 0.5, 0.9]},
    'RandomForestRegressor': {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    'GradientBoostingRegressor': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]}
}

best_models = {}
best_models_score = {}

for name in models:
    print(f"Model: {name}")
    model = models[name]
    param_grid = param_grids[name]

    grid_search = GridSearchCV(model, param_grid, cv=5, verbose=2, scoring='neg_root_mean_squared_error', n_jobs=1)
    grid_search.fit(X_train_final, y_train)

    best_models[name] = grid_search.best_estimator_

    y_pred = best_models[name].predict(X_test_final)

    best_models_score[name] = evaluate_preds(best_models[name], X_train_final, X_test_final, y_train, y_test, y_pred)

Model: Ridge
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ............................................alpha=1; total time=   0.0s
[CV] END ............................................alpha=1; total time=   0.0s
[CV] END ............................................alpha=1; total time=   0.0s
[CV] END ............................................alpha=1; total time=   0.0s
[CV] END ............................................alpha=1; total time=   0.0s
[CV] END ...........................................alpha=10; total time=   0.0s
[CV] END ...........................

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

Training score: 0.9121080852471929
Test score: 0.8949228422343587
Mean squared error: 842461148.8295858
Root mean squared error: 29025.18128848786
Model: Lasso
Fitting 5 folds for each of 4 candidates, totalling 20 fits


  model = cd_fast.enet_coordinate_descent(


[CV] END ........................................alpha=0.001; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END ........................................alpha=0.001; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END ........................................alpha=0.001; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END ........................................alpha=0.001; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END ........................................alpha=0.001; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END .........................................alpha=0.01; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END .........................................alpha=0.01; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END .........................................alpha=0.01; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END .........................................alpha=0.01; total time=   0.7s


  model = cd_fast.enet_coordinate_descent(


[CV] END .........................................alpha=0.01; total time=   0.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................................alpha=0.1; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................................alpha=0.1; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................................alpha=0.1; total time=   0.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................................alpha=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................................alpha=0.1; total time=   0.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................................alpha=1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................................alpha=1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................................alpha=1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................................alpha=1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................................alpha=1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


Training score: 0.9406069632498871
Test score: 0.8950538397472947
Mean squared error: 841410869.990792
Root mean squared error: 29007.083100353128
Model: ElasticNet
Fitting 5 folds for each of 12 candidates, totalling 60 fits


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................alpha=0.001, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................alpha=0.001, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................alpha=0.001, l1_ratio=0.5; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................alpha=0.001, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................alpha=0.001, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................alpha=0.001, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................alpha=0.001, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................alpha=0.001, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................alpha=0.001, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..........................alpha=0.001, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..............................alpha=1, l1_ratio=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..............................alpha=1, l1_ratio=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..............................alpha=1, l1_ratio=0.1; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..............................alpha=1, l1_ratio=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..............................alpha=1, l1_ratio=0.1; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..............................alpha=1, l1_ratio=0.5; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..............................alpha=1, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..............................alpha=1, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..............................alpha=1, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..............................alpha=1, l1_ratio=0.5; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..............................alpha=1, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..............................alpha=1, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..............................alpha=1, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..............................alpha=1, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ..............................alpha=1, l1_ratio=0.9; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


Training score: 0.9108554081340865
Test score: 0.8948651557291336
Mean squared error: 842923653.1501939
Root mean squared error: 29033.14748955397
Model: RandomForestRegressor
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ...................max_depth=None, n_estimators=100; total time=   2.3s
[CV] END ...................max_depth=None, n_estimators=100; total time=   2.2s
[CV] END ...................max_depth=None, n_estimators=100; total time=   2.2s
[CV] END ...................max_depth=None, n_estimators=100; total time=   2.2s
[CV] END ...................max_depth=None, n_estimators=100; total time=   2.2s
[CV] END ...................max_depth=None, n_estimators=200; total time=   4.4s
[CV] END ...................max_depth=None, n_estimators=200; total time=   4.3s
[CV] END ...................max_depth=None, n_estimators=200; total time=   4.3s
[CV] END ...................max_depth=None, n_estimators=200; total time=   4.3s
[CV] END ...................max_dep

Exception ignored in: <function ResourceTracker.__del__ at 0x103754c20>
Traceback (most recent call last):
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x105a58c20>
Traceback (most recent call last):
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/umurmelikrona/Desktop/house price p

[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=15.5min
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=   2.4s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=   2.3s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=   2.3s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=   2.4s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=   2.3s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.7s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.7s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.7s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.7s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.7s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=200; total time=   1.4s
[CV] END ...learning_rate=0.

In [23]:
best_models_score

{'Ridge': (0.9121080852471929,
  0.8949228422343587,
  842461148.8295858,
  29025.18128848786),
 'Lasso': (0.9406069632498871,
  0.8950538397472947,
  841410869.990792,
  29007.083100353128),
 'ElasticNet': (0.9108554081340865,
  0.8948651557291336,
  842923653.1501939,
  29033.14748955397),
 'RandomForestRegressor': (0.9832933007300497,
  0.911110243940382,
  712677879.7746865,
  26696.027415604112),
 'GradientBoostingRegressor': (0.9746002600560987,
  0.9253284956512681,
  598682365.1889458,
  24467.986537288794)}

##### GridSearchCV

In [24]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

models = {
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42)
}

param_grids = {
    'RandomForestRegressor': {'n_estimators': [100, 200, 500], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'max_features': ['auto', 'sqrt', 'log2', 0.3, 0.5]},
    'GradientBoostingRegressor': {'n_estimators': [100, 300], 'learning_rate': [0.01, 0.05], 'max_depth': [3, 5], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'max_features': [None, 'sqrt', 'log2'], 'subsample': [0.5, 0.7], 'loss': ['squared_error', 'huber', 'quantile'], 'alpha': [0.3, 0.75]}
}

best_models_grid = {}
best_models_score_grid = {}

for name in models:
    print(f"Model: {name}")
    model = models[name]
    param_grid = param_grids[name]

    grid_search = GridSearchCV(model, param_grid, cv=5, verbose=2, scoring='neg_root_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train_final, y_train)

    best_models_grid[name] = grid_search.best_estimator_

    y_pred = best_models_grid[name].predict(X_test_final)

    best_models_score_grid[name] = evaluate_preds(best_models_grid[name], X_train_final, X_test_final, y_train, y_test, y_pred)

Model: RandomForestRegressor
Fitting 5 folds for each of 180 candidates, totalling 900 fits
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_sample

Exception ignored in: <function ResourceTracker.__del__ at 0x106e38c20>
Traceback (most recent call last):
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=   1.3s
[CV] END max_depth=None, max_features=0.3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 8.5min
[CV] END max_depth=None, max_features=0.3, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 8.5min
[CV] END max_depth=None, max_features=0.3, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 8.5min
[CV] END max_depth=None, max_features=0.3, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 8.5min
[CV] END max_depth=None, max_features=0.3, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time= 8.6min
[CV] END max_depth=None, max_features=0.3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.3s
[CV] END max_depth=None, max_features=0.3, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   2.3s
[CV] END max_depth=None, max_fe

Exception ignored in: <function ResourceTracker.__del__ at 0x1116e8c20>
Traceback (most recent call last):
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=None, max_features=0.3, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   5.2s
[CV] END max_depth=None, max_features=0.5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.4s
[CV] END max_depth=None, max_features=0.3, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=   4.6s
[CV] END max_depth=None, max_features=0.3, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   4.8s
[CV] END max_depth=None, max_features=0.5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.5s
[CV] END max_depth=None, max_features=0.5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.5s
[CV] END max_depth=None, max_features=0.5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.6s
[CV] END max_depth=None, max_features=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   9.2s
[CV] END max_depth=None, max_fea

Exception ignored in: <function ResourceTracker.__del__ at 0x1072acc20>
Traceback (most recent call last):
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, max_features=log2

Exception ignored in: <function ResourceTracker.__del__ at 0x107354c20>
Traceback (most recent call last):
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   1.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   1.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   1.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   2.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.7s
[CV] END max_depth=20, max_features=sqrt

180 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/site-packages/sklearn/base.py", line 436, in

Training score: 0.9800165503547963
Test score: 0.9127621722220303
Mean squared error: 699433465.5981491
Root mean squared error: 26446.804449652307
Model: GradientBoostingRegressor
Fitting 5 folds for each of 1152 candidates, totalling 5760 fits
[CV] END alpha=0.3, learning_rate=0.01, loss=squared_error, max_depth=3, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.5; total time=   1.0s
[CV] END alpha=0.3, learning_rate=0.01, loss=squared_error, max_depth=3, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.5; total time=   1.0s
[CV] END alpha=0.3, learning_rate=0.01, loss=squared_error, max_depth=3, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.5; total time=   1.0s
[CV] END alpha=0.3, learning_rate=0.01, loss=squared_error, max_depth=3, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.5; total time=   1.0s
[CV] END alpha

##### RandomizedSearchCV

In [25]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

models = {
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42)
}

param_grids = {
    'RandomForestRegressor': {'n_estimators': [100, 200, 500], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'max_features': ['sqrt', 'log2', 0.3, 0.5]},
    'GradientBoostingRegressor': {'n_estimators': [100, 300, 500], 'learning_rate': [0.01, 0.05, 0.1], 'max_depth': [3, 5, 7], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'max_features': [None, 'sqrt', 'log2'], 'subsample': [0.5, 0.7], 'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'], 'alpha': [0.1, 0.5, 0.9]}
}

best_models_rs = {}
best_models_score_rs = {}

for name in models:
    print(f"Model: {name}")
    model = models[name]
    param_grid = param_grids[name]

    randomized_search = RandomizedSearchCV(model, param_grid, cv=5, verbose=2, scoring='neg_root_mean_squared_error', n_jobs=1, n_iter=50)
    randomized_search.fit(X_train_final, y_train)

    best_models_rs[name] = randomized_search.best_estimator_

    y_pred = best_models_rs[name].predict(X_test_final)

    best_models_score_rs[name] = evaluate_preds(best_models[name], X_train_final, X_test_final, y_train, y_test, y_pred)

Model: RandomForestRegressor
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END max_depth=None, max_features=0.3, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   4.4s
[CV] END max_depth=None, max_features=0.3, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   4.3s
[CV] END max_depth=None, max_features=0.3, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   4.4s
[CV] END max_depth=None, max_features=0.3, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   4.3s
[CV] END max_depth=None, max_features=0.3, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   4.2s
[CV] END max_depth=10, max_features=0.3, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.2s
[CV] END max_depth=10, max_features=0.3, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.2s
[CV] END max_depth=10, max_features=0.3, min_samples_leaf=1, min_s

Exception ignored in: <function ResourceTracker.__del__ at 0x1072e8c20>
Traceback (most recent call last):
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x10752cc20>
Traceback (most recent call last):
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/umurmelikrona/Desktop/house price p

[CV] END max_depth=10, max_features=0.5, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=   4.0s


Exception ignored in: <function ResourceTracker.__del__ at 0x102bc4c20>
Traceback (most recent call last):
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/umurmelikrona/Desktop/house price prediction/env/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=10, max_features=0.5, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=   4.0s
[CV] END max_depth=10, max_features=0.5, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=   3.9s
[CV] END max_depth=20, max_features=0.3, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.6s
[CV] END max_depth=20, max_features=0.3, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.6s
[CV] END max_depth=20, max_features=0.3, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.6s
[CV] END max_depth=20, max_features=0.3, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.6s
[CV] END max_depth=20, max_features=0.3, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.6s
[CV] END max_depth=10, max_features=0.3, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   2.7s
[CV] END max_depth=10, max_features=0.3, min_sam

In [26]:
print(f"Randomized Search: {best_models_rs}")
print(f"Randomized Search Score: {best_models_score_rs}")
print(f"Grid Search: {best_models_grid}")
print(f"Grid Search Score: {best_models_score_grid}")

Randomized Search: {'RandomForestRegressor': RandomForestRegressor(max_depth=20, max_features=0.3, min_samples_leaf=2,
                      random_state=42), 'GradientBoostingRegressor': GradientBoostingRegressor(alpha=0.5, max_depth=5, min_samples_leaf=2,
                          min_samples_split=5, n_estimators=300,
                          random_state=42, subsample=0.7)}
Randomized Search Score: {'RandomForestRegressor': (0.9832933007300497, 0.911110243940382, 694551034.5259296, 26354.336161738727), 'GradientBoostingRegressor': (0.9746002600560987, 0.9253284956512681, 550906825.6255206, 23471.404423798773)}
Grid Search: {'RandomForestRegressor': RandomForestRegressor(max_depth=20, max_features=0.3, min_samples_split=5,
                      n_estimators=500, random_state=42), 'GradientBoostingRegressor': GradientBoostingRegressor(alpha=0.3, learning_rate=0.05, min_samples_split=5,
                          n_estimators=300, random_state=42, subsample=0.7)}
Grid Search Score: {'

##### Best Model

In [27]:
from sklearn.ensemble import GradientBoostingRegressor

the_best_model = GradientBoostingRegressor(alpha=0.5, max_depth=5, min_samples_leaf=2, min_samples_split=5, n_estimators=300, random_state=42, subsample=0.7, loss='quantile')

the_best_model.fit(X_train_final, y_train)

In [28]:
y_pred_best_model = the_best_model.predict(X_test_final)
evaluate_preds(the_best_model, X_train_final, X_test_final, y_train, y_test, y_pred_best_model)

Training score: 0.9765906579259088
Test score: 0.9265550456644174
Mean squared error: 588848441.6688651
Root mean squared error: 24266.199572015084


(0.9765906579259088, 0.9265550456644174, 588848441.6688651, 24266.199572015084)

In [29]:
import numpy as np
import pandas as pd

np.random.seed(42)

# Ames Housing dataset tam 93 sütunu (SalePrice dahil) - liste kaynaktan birebir alındı
columns_93 = [
    'Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
    'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config',
    'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
    'House Style', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add',
    'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
    'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
    'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
    'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
    'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
    '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
    'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr',
    'Kitchen Qual', 'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu',
    'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area',
    'Garage Qual', 'Garage Cond', 'Paved Drive', 'Wood Deck SF', 'Open Porch SF',
    'Enclosed Porch', '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Pool QC',
    'Fence', 'Misc Feature', 'Misc Val', 'Mo Sold', 'Yr Sold', 'Sale Type',
    'Sale Condition', 'SalePrice',
    # Missing value indicator columns (common in this dataset)
    'Lot Frontage_missing', 'Garage Yr Blt_missing', 'Mas Vnr Area_missing',
    'Bsmt Half Bath_missing', 'Bsmt Full Bath_missing', 'Garage Cars_missing',
    'Garage Area_missing', 'Total Bsmt SF_missing', 'Bsmt Unf SF_missing',
    'BsmtFin SF 2_missing', 'BsmtFin SF 1_missing'
]

# Kategorik sütunlar listesi (Ames dataset tiplerinden alınmıştır)
categorical_cols = [
    'MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
    'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2',
    'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st',
    'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual', 'Exter Cond', 'Foundation',
    'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2',
    'Heating', 'Heating QC', 'Central Air', 'Electrical', 'Kitchen Qual',
    'Functional', 'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',
    'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature', 'Sale Type',
    'Sale Condition'
]

# Sayısal sütunlar listesi (geri kalan)
numerical_cols = [col for col in columns_93 if col not in categorical_cols]

# Kategorik sütunlar için bazı olası değerler (örnek)
cat_values = {
    'MS Zoning': ['RL', 'RM', 'FV', 'RH', 'C (all)'],
    'Street': ['Pave', 'Grvl'],
    'Alley': ['Grvl', 'Pave', None],
    'Lot Shape': ['Reg', 'IR1', 'IR2', 'IR3'],
    'Land Contour': ['Lvl', 'Bnk', 'HLS', 'Low'],
    'Utilities': ['AllPub', 'NoSeWa', 'NoSewr'],
    'Lot Config': ['Inside', 'Corner', 'CulDSac', 'FR2', 'FR3'],
    'Land Slope': ['Gtl', 'Mod', 'Sev'],
    'Neighborhood': ['CollgCr', 'Veenker', 'Crawfor', 'NoRidge', 'Mitchel', 'Somerst', 'NWAmes', 'OldTown'],
    'Condition 1': ['Norm', 'Feedr', 'PosN', 'Artery', 'RRNn', 'RRAe'],
    'Condition 2': ['Norm', 'Feedr', 'PosN', 'Artery', 'RRNn', 'RRAe'],
    'Bldg Type': ['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs'],
    'House Style': ['1Story', '2Story', '1.5Fin', 'SLvl', 'SFoyer', '2.5Unf'],
    'Roof Style': ['Gable', 'Hip', 'Flat', 'Gambrel', 'Mansard', 'Shed'],
    'Roof Matl': ['CompShg', 'Metal', 'WdShake', 'WdShngl', 'Membran'],
    'Exterior 1st': ['VinylSd', 'MetalSd', 'Wd Sdng', 'HdBoard', 'BrkFace', 'Plywood', 'CemntBd'],
    'Exterior 2nd': ['VinylSd', 'MetalSd', 'Wd Sdng', 'HdBoard', 'BrkFace', 'Plywood', 'CmentBd'],
    'Mas Vnr Type': ['None', 'BrkFace', 'Stone', 'BrkCmn'],
    'Exter Qual': ['TA', 'Gd', 'Ex', 'Fa'],
    'Exter Cond': ['TA', 'Gd', 'Ex', 'Fa'],
    'Foundation': ['PConc', 'CBlock', 'BrkTil', 'Wood', 'Slab'],
    'Bsmt Qual': ['TA', 'Gd', 'Ex', 'Fa', None],
    'Bsmt Cond': ['TA', 'Gd', 'Ex', 'Fa', None],
    'Bsmt Exposure': ['No', 'Mn', 'Av', 'Gd', None],
    'BsmtFin Type 1': ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', None],
    'BsmtFin Type 2': ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', None],
    'Heating': ['GasA', 'GasW', 'Grav', 'Wall', 'OthW', 'Floor'],
    'Heating QC': ['Ex', 'Gd', 'TA', 'Fa', 'Po'],
    'Central Air': ['Y', 'N'],
    'Electrical': ['SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix'],
    'Kitchen Qual': ['TA', 'Gd', 'Ex', 'Fa'],
    'Functional': ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal'],
    'Fireplace Qu': ['Ex', 'Gd', 'TA', 'Fa', 'Po', None],
    'Garage Type': ['Attchd', 'Detchd', 'BuiltIn', 'CarPort', 'Basment', None],
    'Garage Finish': ['Fin', 'RFn', 'Unf', None],
    'Garage Qual': ['Ex', 'Gd', 'TA', 'Fa', 'Po', None],
    'Garage Cond': ['Ex', 'Gd', 'TA', 'Fa', 'Po', None],
    'Paved Drive': ['Y', 'P', 'N'],
    'Pool QC': ['Ex', 'Gd', 'TA', 'Fa', None],
    'Fence': ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', None],
    'Misc Feature': ['Elev', 'Gar2', 'Shed', 'TenC', None],
    'Sale Type': ['WD', 'New', 'COD', 'ConLD', 'ConLI', 'ConLw', 'CWD', 'VWD', 'Oth'],
    'Sale Condition': ['Normal', 'Abnorml', 'AdjLand', 'Alloca', 'Family', 'Partial'],
}

# Sayısal sütunlar için aralıklar (örnek)
num_ranges = {
    'Order': (1, 1460),
    'PID': (1000000, 1001460),
    'MS SubClass': [20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190],
    'Lot Frontage': (20, 120),
    'Lot Area': (2000, 20000),
    'Overall Qual': (1, 10),
    'Overall Cond': (1, 10),
    'Year Built': (1872, 2010),
    'Year Remod/Add': (1950, 2010),
    'Mas Vnr Area': (0, 500),
    'BsmtFin SF 1': (0, 1500),
    'BsmtFin SF 2': (0, 1000),
    'Bsmt Unf SF': (0, 1000),
    'Total Bsmt SF': (300, 3000),
    '1st Flr SF': (300, 3000),
    '2nd Flr SF': (0, 1500),
    'Low Qual Fin SF': (0, 500),
    'Gr Liv Area': (300, 4000),
    'Bsmt Full Bath': (0, 3),
    'Bsmt Half Bath': (0, 2),
    'Full Bath': (0, 4),
    'Half Bath': (0, 2),
    'Bedroom AbvGr': (0, 8),
    'Kitchen AbvGr': (0, 3),
    'TotRms AbvGrd': (2, 15),
    'Fireplaces': (0, 4),
    'Garage Yr Blt': (1900, 2010),
    'Garage Cars': (0, 4),
    'Garage Area': (0, 1500),
    'Wood Deck SF': (0, 1000),
    'Open Porch SF': (0, 500),
    'Enclosed Porch': (0, 500),
    '3Ssn Porch': (0, 300),
    'Screen Porch': (0, 300),
    'Pool Area': (0, 800),
    'Misc Val': (0, 10000),
    'Mo Sold': (1, 12),
    'Yr Sold': (2006, 2010),
    'SalePrice': (50000, 500000),
    # Missing indicator columns sayısal olarak 0 ya da 1 olabilir, rastgele 0 veya 1 atanabilir:
    'Lot Frontage_missing': (0, 1),
    'Garage Yr Blt_missing': (0, 1),
    'Mas Vnr Area_missing': (0, 1),
    'Bsmt Half Bath_missing': (0, 1),
    'Bsmt Full Bath_missing': (0, 1),
    'Garage Cars_missing': (0, 1),
    'Garage Area_missing': (0, 1),
    'Total Bsmt SF_missing': (0, 1),
    'Bsmt Unf SF_missing': (0, 1),
    'BsmtFin SF 2_missing': (0, 1),
    'BsmtFin SF 1_missing': (0, 1),
}

# MS SubClass için olası değerler
ms_subclass_values = num_ranges['MS SubClass']

# Data dictionary hazırla
data = {col: [] for col in columns_93}

for i in range(3):
    for col in columns_93:
        if col in categorical_cols:
            vals = cat_values.get(col, [None])
            data[col].append(np.random.choice(vals))
        else:
            if col == 'MS SubClass':
                data[col].append(np.random.choice(ms_subclass_values))
            elif col in num_ranges:
                low, high = num_ranges[col]
                # Missing indicator sütunları için 0 ya da 1 rastgele
                if col.endswith('_missing'):
                    val = np.random.randint(0, 2)
                # Tam sayılar gereken sütunlar:
                elif col in ['Order', 'PID', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add',
                             'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
                             'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars',
                             'Mo Sold', 'Yr Sold']:
                    val = np.random.randint(low, high + 1)
                else:
                    val = np.round(np.random.uniform(low, high), 2)
                data[col].append(val)
            else:
                data[col].append(None)

new_houses = pd.DataFrame(data)
print(new_houses)



   Order      PID  MS SubClass MS Zoning  Lot Frontage  Lot Area Street Alley  \
0   1127  1001459          150        FV         97.97  12743.30   Grvl  None   
1   1368  1001152          120        FV         39.88  14804.16   Pave  Grvl   
2   1137  1000441           45        RH         45.18  10950.47   Pave  Grvl   

  Lot Shape Land Contour  ... Garage Yr Blt_missing Mas Vnr Area_missing  \
0       IR2          HLS  ...                     1                    0   
1       IR2          Lvl  ...                     1                    1   
2       Reg          Low  ...                     0                    1   

  Bsmt Half Bath_missing Bsmt Full Bath_missing Garage Cars_missing  \
0                      0                      0                   0   
1                      1                      0                   0   
2                      1                      1                   1   

  Garage Area_missing Total Bsmt SF_missing Bsmt Unf SF_missing  \
0                 

In [30]:
new_houses.isna().sum()[new_houses.isna().sum() > 0]

Alley             1
Bsmt Cond         2
BsmtFin Type 1    1
BsmtFin Type 2    1
Fireplace Qu      1
Garage Qual       1
Garage Cond       1
Fence             1
Misc Feature      2
dtype: int64

In [31]:
missing_cols_new = ['Alley', 'Bsmt Cond', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Fireplace Qu', 'Garage Qual', 'Garage Cond', 'Fence', 'Misc Feature']

In [32]:
new_houses[missing_cols_new].dtypes

Alley             object
Bsmt Cond         object
BsmtFin Type 1    object
BsmtFin Type 2    object
Fireplace Qu      object
Garage Qual       object
Garage Cond       object
Fence             object
Misc Feature      object
dtype: object

In [33]:
new_houses_splitted = new_houses.drop("SalePrice", axis=1)

In [34]:
categorical_missing_cols_new = ['Alley', 'Bsmt Cond', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Fireplace Qu', 'Garage Qual', 'Garage Cond', 'Fence', 'Misc Feature']

# Categorical sütunlardaki eksik verileri yani o özellik bulunmayan evlere 'None' de
for col in categorical_missing_cols_new:
    new_houses_splitted[col] = new_houses_splitted[col].fillna('None')

# Eksik verisi olan sütunların ve eksik veri sayısının tekrar kontrolü
print(new_houses_splitted.isna().sum()[new_houses_splitted.isna().sum() > 0].sort_values(ascending=False))

Series([], dtype: int64)


In [35]:
print(new_houses_splitted.dtypes.value_counts())

object     43
int64      30
float64    19
Name: count, dtype: int64


In [36]:
from sklearn.preprocessing import OneHotEncoder
'''
# Categorical sütunları bul
categorical_cols = X_train.select_dtypes(include=['object']).columns
categorical_cols

# Build OneHotEncoder and fit the categorical columns
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(X_train[categorical_cols]);
'''

# Hem train hem de test'teki kategorik sütunları dönüştür
new_houses_ohe = pd.DataFrame(ohe.transform(new_houses_splitted[categorical_cols]),
                                         columns=ohe.get_feature_names_out(categorical_cols),
                                         index=new_houses_splitted.index)

# Numerical sütunları al
numerical_cols = new_houses_splitted.drop(columns=categorical_cols).columns
new_houses_numerical = new_houses_splitted[numerical_cols]

# Numerical ve categorical sütunları birleştir
new_houses_final = pd.concat([new_houses_numerical, new_houses_ohe], axis=1)

In [37]:
new_houses_predictions = the_best_model.predict(new_houses_final)

In [38]:
new_houses_predictions

array([142512.21438589, 170067.35970507, 206744.11312214])

In [39]:
yeni_ev = X_test_final.iloc[[3]]

In [40]:
yeni_ev_prediction = the_best_model.predict(yeni_ev)

In [41]:
yeni_ev_prediction, y_test.iloc[3]

(array([129073.33020481]), np.int64(123600))