In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
df = pd.read_csv('data.csv')
df = df.drop(columns=["date", "country"])

In [3]:
street_mean = df.groupby("street")["price"].mean()
df["street_encoded"] = df["street"].map(street_mean)
df = df.drop(columns=["street"])

In [4]:
df = pd.get_dummies(df, columns=["city", "statezip"], drop_first=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Columns: 133 entries, price to statezip_WA 98354
dtypes: bool(119), float64(5), int64(9)
memory usage: 1.0 MB


In [5]:
X = df.drop(columns=["price"])
y = df["price"]

In [6]:
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
import numpy as np

model = ElasticNet()

model_param = {
    'alpha': np.logspace(-5, 2, 8),
    'l1_ratio': [0.0, 0.1, 0.25, 0.5, 0.75, 1.0],
    'max_iter': [1000, 5000, 10000],
    'tol': [1e-3, 1e-4, 1e-5],
}


search_model = RandomizedSearchCV(
    estimator=model,
    param_distributions=model_param,
    cv=5,
    n_iter=20,
    verbose=1,
    random_state=42,
    n_jobs=-1,
    scoring='neg_mean_squared_error'
)

In [9]:
search_model.fit(X_train, y_train)

best_model = search_model.best_estimator_
print("Scores :", search_model.best_score_)
print("Parameters :", search_model.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Scores : -2456154249.5994864
Parameters : {'tol': 0.0001, 'max_iter': 1000, 'l1_ratio': 1.0, 'alpha': np.float64(100.0)}


In [10]:
pred_y = best_model.predict(X_test)
print(f"Mean square error : {mean_squared_error(y_test, pred_y)}")
print(f"Mean absolute error : {mean_squared_error(y_test, pred_y)}")
print(f"r2_score : {r2_score(y_test, pred_y)}")

Mean square error : 484960796.4237648
Mean absolute error : 484960796.4237648
r2_score : 0.9995244766026395
