## Housing Price Predictor 
#### Learning by doing

In [1]:
# Import Necessary libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Import Dataset
# The CSV location can be overridden with the HOUSING_CSV environment variable
# so the notebook is runnable on machines other than the author's. When the
# variable is not set, the original hard-coded path is used unchanged.
import os

DATA_PATH = os.environ.get(
    'HOUSING_CSV',
    'C:\\Users\\Surface\\OneDrive\\Documentos\\GitHub\\House-Price-Prediction\\data\\Housing.csv',
)
data = pd.read_csv(DATA_PATH)

In [3]:
# Basic Data Exploration
# First rows, to eyeball the columns and typical values.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()
# Summary statistics for the numeric columns.
print(data.describe())

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 colu

In [4]:
# (rows, columns) of the raw dataset, shown as the cell's output.
data.shape

(545, 13)

In [5]:
# Per-column dtype and non-null count, to spot text columns that need encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [6]:
# Count how many rows fall into each 'furnishingstatus' category.
print(data['furnishingstatus'].value_counts())

furnishingstatus
semi-furnished    227
unfurnished       178
furnished         140
Name: count, dtype: int64


In [7]:
# Count how many rows take each 'mainroad' value (before encoding).
print(data['mainroad'].value_counts())

mainroad
yes    468
no      77
Name: count, dtype: int64


In [8]:
# Label Encoding Binary Columns
# List the 6 columns that are expected to be 'yes'/'no'
binary_cols = ['mainroad', 'guestroom', 'basement',
               'hotwaterheating', 'airconditioning', 'prefarea']

# Map the labels explicitly instead of relying on DataFrame.replace(), whose
# silent object -> int downcasting is deprecated and emits a FutureWarning.
# The 1/0 keys map already-encoded values to themselves, so re-running this
# cell on an already-encoded frame is a harmless no-op.
yes_no_to_int = {'yes': 1, 'no': 0, 1: 1, 0: 0}
encoded = data[binary_cols].apply(lambda col: col.map(yes_no_to_int))

# Any value outside the mapping becomes NaN; fail loudly rather than letting
# an unexpected label (e.g. 'Yes', a blank) slip through to the model.
unmapped = encoded.columns[encoded.isna().any()].tolist()
if unmapped:
    raise ValueError(f"Unexpected non yes/no values in columns: {unmapped}")

# Explicit int64 keeps the dtype identical across platforms.
data[binary_cols] = encoded.astype('int64')

  data[binary_cols] = data[binary_cols].replace({'yes': 1, 'no': 0})


In [9]:
# Sanity check after encoding: 'mainroad' should now hold 1/0 with the same
# counts that 'yes'/'no' had before.
print(data['mainroad'].value_counts())

mainroad
1    468
0     77
Name: count, dtype: int64


In [10]:
# One-hot encode the multiclass 'furnishingstatus' column.
# drop_first=True drops one dummy level so the remaining indicator columns are
# not perfectly collinear; dtype=int yields 0/1 integers.
df = pd.get_dummies(
    data,
    columns=['furnishingstatus'],
    dtype=int,
    drop_first=True,
)
# Rebind `data` to an independent copy of the encoded frame.
data = df.copy()

# Inspect the new shape and the first rows, including the new dummy columns.
print(data.shape)
print(data.head())

(545, 14)
      price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0                0        2         1   
3         1                0                1        3         1   
4         1                0                1        2         0   

   furnishingstatus_semi-furnished  furnishingstatus_unfurnished  
0                                0                             0  
1             

In [11]:
# Preparing Data for the Model
# Feature matrix (X): every column of the encoded frame except the target.
X = df.drop(columns=['price'])

# Target variable (y): the price column the model is trained to predict.
y = df['price']

# Both shapes must report the same number of rows.
print("Shape of X (Features):", X.shape)
print("Shape of y (Target):", y.shape)

Shape of X (Features): (545, 13)
Shape of y (Target): (545,)


In [12]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing and train on the remaining 70%.
# A fixed random_state makes the shuffle, and therefore the split,
# reproducible on every run.
split_options = {"test_size": 0.3, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Confirm the split by reporting the shape of each piece.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (381, 13)
X_test shape: (164, 13)
y_train shape: (381,)
y_test shape: (164,)


In [13]:
# Outlier removal on the training target (runs after the train/test split and
# before feature scaling). Only the training rows are filtered; the test set
# is left untouched.
# NOTE: this cell rebinds X_train / y_train, so running it again without
# re-running the split recomputes the quartiles on the already-filtered data.

# 1. Quartiles and inter-quartile range of the training prices
Q1, Q3 = y_train.quantile(0.25), y_train.quantile(0.75)
IQR = Q3 - Q1

# Upper fence: Q3 + 1.5 * IQR is the conventional outlier threshold
upper_limit = Q3 + 1.5 * IQR

# 2. Keep only the training rows whose price is strictly below the fence.
# The same boolean mask (aligned on the shared index) is applied to the
# target and to the feature matrix so that both stay row-for-row in sync.
keep_mask = y_train < upper_limit
y_train_filtered = y_train[keep_mask]
X_train_filtered = X_train.loc[keep_mask]

# Rebind the working training variables to the filtered data
X_train = X_train_filtered
y_train = y_train_filtered

print("Outliers removed from training set.")
print(f"New X_train shape: {X_train.shape}")

Outliers removed from training set.
New X_train shape: (371, 13)


In [14]:
# Feature Scaling: Standardization
from sklearn.preprocessing import StandardScaler

# Learn each feature's mean and standard deviation from the training rows
# ONLY, so that no statistics from the test set leak into the model.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaler to both splits (the test data is never re-fit).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)

X_train_scaled shape: (371, 13)
X_test_scaled shape: (164, 13)


In [15]:
# Create log-transformed target variables.
# The logs are always derived from the untouched price series `y`, selected by
# the index of the current train/test targets, rather than from y_train /
# y_test themselves. The first run gives exactly np.log(y_train) and
# np.log(y_test); re-running the cell gives the same values again instead of
# applying the logarithm a second time.
y_train = np.log(y.loc[y_train.index])
y_test = np.log(y.loc[y_test.index])

print("Target variables successfully log-transformed.")

Target variables successfully log-transformed.


In [16]:
# Model Training
from sklearn.linear_model import LinearRegression

# Create the Linear Regression model and fit it in a single step on the
# standardized training features (X_train_scaled) against the training
# target (y_train). fit() returns the fitted estimator itself.
lr = LinearRegression().fit(X_train_scaled, y_train)

print("Model Training Complete.")

Model Training Complete.


In [17]:
# Model Prediction
# Generate predictions for the unseen test data
# The model was fitted on np.log(price), so y_pred is on the log-price scale;
# apply np.exp(y_pred) to express the predictions in currency units.
y_pred = lr.predict(X_test_scaled)

print("Predictions generated and stored in 'y_pred'.")

Predictions generated and stored in 'y_pred'.


In [18]:
# Model Evaluation
from sklearn.metrics import r2_score

# Calculate the R-squared score
# Both y_test and y_pred are on the log-price scale at this point, so this is
# the share of variance in log(price) that the model explains.
r2 = r2_score(y_test, y_pred)

print(f"R-squared Score: {r2}")

R-squared Score: 0.6570669612998578


In [19]:
from sklearn.metrics import mean_absolute_error

# Calculate the Mean Absolute Error (MAE)
# y_test and y_pred are log-prices, so the MAE is in log units (an average
# absolute log-ratio), not in currency.
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")

Mean Absolute Error (MAE): 0.19063495643222306


In [20]:
from sklearn.metrics import mean_squared_error
import numpy as np 

# 1. Calculate Mean Squared Error (MSE)
# Computed on log-prices, because y_test and y_pred are both log-transformed.
mse = mean_squared_error(y_test, y_pred)

# 2. Calculate Root Mean Squared Error (RMSE)
# The square root brings the error back to the scale of the target (log units here).
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 0.2439033204531407


In [21]:
# # Hyperparameter Tuning with GridSearchCV
# from sklearn.linear_model import Ridge
# from sklearn.model_selection import GridSearchCV

# # parameter grid dictionary
# param_grid = {
#     'alpha': [0.01, 0.1, 1.0, 10.0, 100.0] 
# }



In [22]:
# # Initializing the Model
# ridge = Ridge()

# # Setting up GridSearchCV
# #grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, 
# #                           scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

# ridge_tuner = GridSearchCV(
#     estimator=ridge,
#     param_grid=param_grid,
#     cv=5
# )

In [23]:
# # Hyperparameter Tuning with Grid Search
# # Start the search process: Grid Search will train 5 models for each of the 5 alpha values.
# ridge_tuner.fit(X_train_scaled, y_train)

In [24]:
# # Find the best alpha value and its score
# print(f"Best Hyperparameters (alpha): {ridge_tuner.best_params_}")
# print(f"Best Cross-Validation Score (R^2): {ridge_tuner.best_score_}")

In [25]:
# # 1. Select the best model
# best_ridge_model = ridge_tuner.best_estimator_

# # 2. Predict prices (output is in log units)
# y_pred_tuned_log = best_ridge_model.predict(X_test_scaled)

# # 3. CRITICAL: Reverse transformation to get prices in currency
# y_pred_tuned = np.exp(y_pred_tuned_log) 

# # 4. Calculate Final Metrics (Comparing original y_test to reversed y_pred_tuned)
# # NOTE: y_test was log-transformed earlier in this notebook, so it must also be
# # reversed with np.exp(y_test) before it is compared with currency-scale predictions.
# from sklearn.metrics import r2_score, mean_squared_error
# import numpy as np

# r2_final = r2_score(y_test, y_pred_tuned)
# rmse_final = np.sqrt(mean_squared_error(y_test, y_pred_tuned))

# print(f"Final R-squared Score (Tuned Model): {r2_final}")
# print(f"Final RMSE (Tuned Model): {rmse_final}")

In [26]:
# Save the model
# Serializes the fitted LinearRegression to 'house_price_model.pkl' in the
# current working directory. The model was trained on standardized features
# and a log-transformed price, so its raw predictions are log-prices.
joblib.dump(lr, 'house_price_model.pkl')


['house_price_model.pkl']

In [27]:
import joblib
# Save the fitted StandardScaler next to the model: the model was trained on
# scaled features, so new inputs must go through this same scaler before
# being passed to the model.
joblib.dump(scaler, 'scaler_transform.pkl')

['scaler_transform.pkl']