In [1]:
import pandas as pd
df = pd.read_csv('housing.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [2]:
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].mean())

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

In [5]:
x = df.drop('median_house_value',axis=1)
y = df['median_house_value']

In [6]:
x_train, x_test, y_train, y_test, = train_test_split(x,y, test_size=0.25, random_state=42)

In [7]:
numeric = x_train.select_dtypes(include='float64').columns
categorical = x_train.select_dtypes(include='object').columns
print(numeric)
print(categorical)

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')
Index(['ocean_proximity'], dtype='object')


In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric),
        ('cut', OneHotEncoder(), categorical)
    ]
)

In [9]:
model = make_pipeline(preprocessor,RandomForestRegressor())

In [10]:
model.fit(x_train,y_train)

In [11]:
prediction = model.predict(x_test)

In [12]:
mse = mean_squared_error(y_test, prediction)
print(F"RMSE:{np.sqrt(mse)}")

RMSE:49012.651032871836


In [13]:
joblib.dump(model, 'house_price_prediction.joblib')

['house_price_prediction.joblib']

In [14]:
df1 = pd.DataFrame({
    'longitude':[-122.23]  ,
    'latitude':[37.86],
    'housing_median_age':[1],
    'total_rooms':[1000],
    'total_bedrooms':[200],
    'population':[100],
    'households':[400],
    'median_income':[2],  
    'ocean_proximity':['NEAR BAY']
})

In [15]:
loaded_model = joblib.load('house_price_prediction.joblib')

In [16]:
new_pred = loaded_model.predict(df1)
print('New house value:', new_pred[0])

New house value: 221584.0


In [18]:
results = pd.DataFrame(
    {
        'Actual price':y_test.values,
        'Prediction price':prediction,
        'Difference': abs(y_test.values - prediction)
    }
)

In [19]:
results.head(10)

Unnamed: 0,Actual price,Prediction price,Difference
0,47700.0,54016.0,6316.0
1,45800.0,71699.0,25899.0
2,500001.0,465977.37,34023.63
3,218600.0,254292.0,35692.0
4,278000.0,270619.0,7381.0
5,158700.0,165942.0,7242.0
6,198200.0,254496.05,56296.05
7,157500.0,163756.0,6256.0
8,340000.0,284040.02,55959.98
9,446600.0,479049.69,32449.69


In [21]:
features = model[0].get_feature_names_out()

In [23]:
importance = model[1].feature_importances_

In [24]:
imp_df = pd.DataFrame({
    'Feature':features,
    'Importance':importance
})

In [27]:
imp_df.head(10).sort_values(by='Importance', ascending=False)

Unnamed: 0,Feature,Importance
7,num__median_income,0.487928
9,cut__ocean_proximity_INLAND,0.141498
0,num__longitude,0.106725
1,num__latitude,0.101021
2,num__housing_median_age,0.052437
5,num__population,0.03278
3,num__total_rooms,0.024304
4,num__total_bedrooms,0.023849
6,num__households,0.017928
8,cut__ocean_proximity_<1H OCEAN,0.003663
