In [2]:
import numpy as np
import pandas as pd


In [3]:
# Load the housing dataset; the relative path is resolved against the
# directory the kernel is running in.
df = pd.read_csv("housing.csv")

In [4]:
# Preview the first five rows to check column names and value ranges.
# NOTE(review): the rendered preview shows an "Unnamed: 0" first column. That
# may simply be the DataFrame index as rendered by the export, or a real
# column saved in the CSV -- confirm, because a real column would later be
# treated as a numeric feature.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [7]:
# Separate the regression target from the predictor columns.
y = df["median_house_value"]
X = df.drop(columns=["median_house_value"])

# "ocean_proximity" is the only column handled as categorical; every other
# remaining column is treated as numeric.
# NOTE(review): if the CSV really contains an "Unnamed: 0" column (see the
# df.head() preview), it ends up in numeric_features here -- confirm.
categorical_features = ["ocean_proximity"]
numeric_features = X.columns.drop(categorical_features)

# Numeric columns: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the most frequent label, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer.
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocess = ColumnTransformer(transformers=column_routes)

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [11]:
# Candidate regressors to compare; the dictionary keys are the display labels
# used when reporting scores.
# NOTE(review): the recorded cross-validation output contains five
# `cd_fast.enet_coordinate_descent` warning fragments, which look like Lasso
# convergence warnings (one per fold) at these settings -- confirm.
# NOTE(review): SVR is trained on the raw target values and scores worst in
# the recorded run; C/epsilon may need tuning or the target may need scaling.
linear_regression = LinearRegression()
ridge_regression = Ridge(alpha=1.0)
lasso_regression = Lasso(alpha=0.01, max_iter=5000)
svr_rbf = SVR(kernel="rbf", C=100, epsilon=0.1)
knn_regressor = KNeighborsRegressor(n_neighbors=5)

models = {
    "Linear Regression": linear_regression,
    "Ridge Regression": ridge_regression,
    "Lasso Regression": lasso_regression,
    "SVR (RBF)": svr_rbf,
    "KNN Regressor": knn_regressor,
}

In [12]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Maps each model label to its 5-fold cross-validated RMSE on the training
# split.
results = {}

for label, estimator in models.items():
    # Chain preprocessing and the estimator so they are cross-validated as a
    # single unit.
    candidate = Pipeline(steps=[("preprocess", preprocess), ("model", estimator)])

    # The scorer yields negated MSE values, one per fold.
    neg_mse_per_fold = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring="neg_mean_squared_error",
        cv=5,
    )

    # Undo the negation on the mean fold MSE, then take the square root.
    results[label] = np.sqrt(-neg_mse_per_fold.mean())

results


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'Linear Regression': np.float64(68637.61649491833),
 'Ridge Regression': np.float64(68636.79430568301),
 'Lasso Regression': np.float64(68637.61639554241),
 'SVR (RBF)': np.float64(97538.44695935883),
 'KNN Regressor': np.float64(61736.06641453518)}

In [13]:
# Lower RMSE is better: pick the label whose cross-validated score is smallest
# (the first one wins on ties, following dict iteration order).
best_model_name, _best_score = min(results.items(), key=lambda item: item[1])

print("Best Model:", best_model_name)
print("Best RMSE:", results[best_model_name])


Best Model: KNN Regressor
Best RMSE: 61736.06641453518


In [14]:
# Rebuild the winning pipeline and train it on the full training split.
# The estimator object is the same instance stored in `models`, and the
# preprocessing object is the shared `preprocess` transformer.
chosen_estimator = models[best_model_name]
best_model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", chosen_estimator),
    ]
)

# Kept as the last expression so the notebook renders the fitted pipeline.
best_model.fit(X_train, y_train)


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [18]:
from sklearn.metrics import mean_squared_error

# Evaluate the fitted pipeline once on the held-out test split.
y_pred = best_model.predict(X_test)

# mean_squared_error returns the MSE (squared target units); the RMSE is its
# square root and is directly comparable to the cross-validation scores.
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)

# Alias kept for anything that still reads the earlier `rmse` name.
rmse = test_rmse

# Label each number as what it actually is: previously the MSE was printed
# under the "Test RMSE" label.
print("Test MSE:", test_mse)
print("Test RMSE:", test_rmse)


Test RMSE: 3758885714.474128
 RMSE: 61309.752197135225


In [19]:
import pickle

# Serialise the fitted pipeline (preprocessing + estimator) to disk so it can
# be reloaded later without retraining. Only unpickle files from a trusted
# source: loading a pickle can execute arbitrary code.
with open("best_model.pkl", mode="wb") as model_file:
    pickle.dump(best_model, model_file)
