## Housing Price Predictor 
#### Learning by doing

In [None]:
# Import Necessary libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Import Dataset
# The CSV location can be overridden with the HOUSING_CSV environment variable
# so the notebook is not tied to a single machine's directory layout. When the
# variable is not set, the original hard-coded location is used unchanged.
import os

DEFAULT_DATA_PATH = 'C:\\Users\\Surface\\OneDrive\\Documentos\\GitHub\\House-Price-Prediction\\data\\Housing.csv'
data_path = os.environ.get('HOUSING_CSV', DEFAULT_DATA_PATH)

# Load the housing table into a DataFrame used by every later cell.
data = pd.read_csv(data_path)

In [None]:
# Basic Data Exploration
# First rows, to eyeball the columns and their raw values.
print(data.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
data.info()

# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
print(data.describe())

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple. As the last expression
# of the cell, the value is presumably echoed by the notebook rather than printed.
data.shape

In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts and dtypes.
data.info()

In [None]:
# Count how many rows fall into each 'furnishingstatus' category.
furnishing_counts = data['furnishingstatus'].value_counts()
print(furnishing_counts)

In [None]:
# Count the distinct values of 'mainroad' to see how the column is coded.
mainroad_counts = data['mainroad'].value_counts()
print(mainroad_counts)

In [None]:
# Label Encoding Binary Columns
# The 6 columns that are expected to hold only 'yes'/'no'
binary_cols = ['mainroad', 'guestroom', 'basement',
               'hotwaterheating', 'airconditioning', 'prefarea']

# Explicit yes/no -> 1/0 mapping. Series.map is used instead of
# DataFrame.replace because replace on object columns relies on implicit
# downcasting, which pandas 2.2 deprecates (FutureWarning); without it the
# columns would stay object dtype.
yes_no_codes = {'yes': 1, 'no': 0}

for col in binary_cols:
    # Already-numeric columns are left alone so re-running this cell is harmless.
    if pd.api.types.is_numeric_dtype(data[col]):
        continue

    encoded = data[col].map(yes_no_codes)

    # map() yields NaN for anything outside the mapping; fail loudly here
    # instead of letting stray strings or missing values reach the model.
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(set(data.loc[unmapped, col].astype(str)))
        raise ValueError(
            f"Column '{col}' contains values other than 'yes'/'no': {unexpected}"
        )

    data[col] = encoded.astype('int64')

In [None]:
# Re-check the value distribution of 'mainroad' after the binary encoding step.
encoded_mainroad_counts = data['mainroad'].value_counts()
print(encoded_mainroad_counts)

In [None]:
# One-Hot Encoding the Multiclass Column
# 'furnishingstatus' is expanded into integer indicator columns. The first
# category is dropped (drop_first=True) so the indicators are not perfectly
# collinear with each other.
df = pd.get_dummies(
    data,
    columns=['furnishingstatus'],
    drop_first=True,
    dtype=int,
)

# Keep `data` pointing at an independent copy of the encoded frame.
data = df.copy()

# Show the new dimensions followed by the first rows with the new columns.
for preview in (data.shape, data.head()):
    print(preview)

In [None]:
# Preparing Data for the Model
target_column = 'price'

# Feature matrix (X): every column of the encoded frame except the target.
X = df.drop(columns=[target_column])

# Target vector (y): the value the model is trained to predict.
y = df[target_column]

# Sanity-check that both have the same number of rows.
print("Shape of X (Features):", X.shape)
print("Shape of y (Target):", y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the remaining 70% are used for training.
# The fixed random_state makes the split identical on every run.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the size of each piece to confirm the split.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape:", part.shape)

In [None]:
# Outlier removal on the training target.
# This runs after train_test_split and before scaling, and only touches the
# training rows; the test set is left as-is.

# Quartiles and interquartile range of the training prices.
Q1, Q3 = (y_train.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Upper fence: the conventional 1.5 * IQR above the third quartile.
upper_limit = Q3 + 1.5 * IQR

# Keep only the training prices strictly below the fence ...
y_train_filtered = y_train.loc[y_train < upper_limit]

# ... and select the matching feature rows through the surviving index labels,
# so features and target stay aligned.
X_train_filtered = X_train.loc[y_train_filtered.index]

# From here on the training variables refer to the filtered data.
X_train, y_train = X_train_filtered, y_train_filtered

print("Outliers removed from training set.")
print(f"New X_train shape: {X_train.shape}")

In [None]:
# Feature Scaling: Standardization
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, so that no
# information from the test set leaks into preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaler to both splits (the test data is never re-fit).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)

In [None]:
# Replace both targets with their natural logarithm.
# NOTE: the variables are overwritten in place, so executing this cell a second
# time would apply the logarithm twice.
y_train, y_test = (np.log(target) for target in (y_train, y_test))

print("Target variables successfully log-transformed.")

In [None]:
# Model Training
from sklearn.linear_model import LinearRegression

# Create the linear regression model and fit it in one step on the scaled
# training features and the training target (fit() returns the fitted model).
lr = LinearRegression().fit(X_train_scaled, y_train)

print("Model Training Complete.")

In [None]:
# Model Prediction
# Run the fitted model on the scaled test features, which were not used for
# training, and keep the predictions for the evaluation cells.
y_pred = lr.predict(X=X_test_scaled)

print("Predictions generated and stored in 'y_pred'.")

In [None]:
# Model Evaluation
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions against the held-out target.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"R-squared Score: {r2}")

In [None]:
from sklearn.metrics import mean_absolute_error

# Calculate the Mean Absolute Error (MAE).
# y_test was log-transformed earlier and the model was trained on the
# log-transformed target, so this value is in log-price units, not currency.
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")

# Undo the log transform on both sides to also report the error in the
# original price units, which is the figure that is directly interpretable.
mae_price = mean_absolute_error(np.exp(y_test), np.exp(y_pred))

print(f"Mean Absolute Error in original price units: {mae_price}")

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# 1. Calculate Mean Squared Error (MSE).
# y_test was log-transformed earlier and the model was trained on the
# log-transformed target, so this error is measured in log-price units.
mse = mean_squared_error(y_test, y_pred)

# 2. Calculate Root Mean Squared Error (RMSE), still on the log scale.
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error (RMSE): {rmse}")

# 3. Undo the log transform on both sides to also report the RMSE in the
# original price units.
rmse_price = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))

print(f"Root Mean Squared Error in original price units: {rmse_price}")