In [455]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer

In [456]:
train_data = pd.read_csv('../ynov-data/train_housing_train.csv')
valid_data = pd.read_csv('../ynov-data/train_housing_valid.csv')

In [457]:
X = train_data.drop(['median_house_value', 'id'], axis=1)
y = train_data['median_house_value']

In [458]:
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['number']).columns

# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('poly', PolynomialFeatures(degree=2, include_bias=False)),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder())
        ]), categorical_features)
    ])

In [459]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [460]:
param_grid = {
    'preprocessor__num__poly__degree': [1, 2, 3]
}
# Use GridSearchCV to find the best parameters
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X, y)
best_model = grid_search.best_estimator_
predictions = best_model.predict(X)

In [None]:
pipe.fit(X, y)
predictions = pipe.predict(X)

In [462]:
rmse = root_mean_squared_error(y, predictions)
r2 = r2_score(y, predictions)

In [463]:
print(f'RMSE: {rmse:.2f}')
print(f'R²: {r2:.2f}')

RMSE: 68305.50
R²: 0.65


In [464]:
model = pipe.named_steps['regressor']  # or whatever your model step is named
feature_names = pipe.named_steps['preprocessor'].get_feature_names_out()

coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficient': model.coef_})

print("\nCoefficients:")
print(coefficients)


Coefficients:
                                   Feature   Coefficient
0                           num__longitude  1.096910e+06
1                            num__latitude  1.911198e+06
2                  num__housing_median_age -9.513062e+05
3                         num__total_rooms  2.154121e+06
4                      num__total_bedrooms -1.887437e+06
5                          num__population  1.189744e+06
6                          num__households -1.289806e+06
7                       num__median_income -1.885241e+06
8                         num__longitude^2  1.763942e+06
9                  num__longitude latitude  3.263458e+06
10       num__longitude housing_median_age -1.390530e+06
11              num__longitude total_rooms  3.314067e+06
12           num__longitude total_bedrooms -3.036562e+06
13               num__longitude population  1.845284e+06
14               num__longitude households -1.892286e+06
15            num__longitude median_income -2.891720e+06
16              

In [465]:
y

0       113700.0
1       184900.0
2        69800.0
3       192600.0
4       225000.0
          ...   
9595    353500.0
9596    500001.0
9597    189100.0
9598    183400.0
9599     22500.0
Name: median_house_value, Length: 9600, dtype: float64

In [466]:
predictions

array([135186.04845763, 198415.93250309, 123278.77250423, ...,
       262480.02612339, 211081.36454597, 156018.93760437])

In [467]:
X_valid = valid_data.drop(['median_house_value', 'id'], axis=1)

# Use the pipeline to predict the test data
valid_predictions = pipe.predict(X_valid)
print(test_predictions)

[329633.54213886 246261.4599701  304466.3824701  ...  82798.39935614
  84803.81489611  95888.9380362 ]


In [468]:
test = pd.read_csv('../ynov-data/test_housing.csv')

In [469]:
X_test = test.drop('id', axis=1)

# Use the pipeline to predict the test data
test_predictions = pipe.predict(X_test)

In [470]:
test_predictions

array([329633.54213886, 246261.4599701 , 304466.3824701 , ...,
        82798.39935614,  84803.81489611,  95888.9380362 ])

In [471]:
submission = pd.read_csv('../ynov-data/submission.csv')
submission['median_house_value'] = test_predictions
submission.to_csv('../ynov-data/submission.csv', index=False)

In [472]:
submission

Unnamed: 0,id,median_house_value
0,3,329633.542139
1,10,246261.459970
2,11,304466.382470
3,12,239678.899977
4,13,210036.443088
...,...,...
8635,20635,67513.540840
8636,20636,103349.912721
8637,20637,82798.399356
8638,20638,84803.814896
