In [24]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Fixed seed, passed as random_state to train_test_split so the split is reproducible.
SEED = 42

In [25]:
# Load the housing dataset from the CSV file in the working directory.
csv_path = "Housing.csv"
df = pd.read_csv(csv_path)

In [26]:
# Preview the first rows to sanity-check the column names and value types.
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [27]:
# DATA CLEANING
# 1. Missing values: report the per-column NaN count (every count is 0 for this dataset).
nan_counts = df.isna().sum()
print("NaN presence in data")
print(nan_counts)

# 2. Duplicates: remove rows that are exact copies of another row.
df = df.drop_duplicates()
print("\nDropped duplicates in the data")

NaN presence in data
price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

Dropped duplicates in the data


In [28]:
#3. Converting categorical data into numerical data
# Two common options:
#   - Label encoding: one integer per category; appropriate when the categories have a meaningful order.
#   - One-hot encoding: one 0/1 indicator column per category; appropriate when they do not.
# One-hot encoding is applied to every categorical column below.
#
# NOTE: this cell replaces the raw categorical columns in df, so re-running it without
# reloading df first raises a KeyError for the columns that are already gone.

cols_to_convert = ["mainroad","guestroom","basement","hotwaterheating","airconditioning","prefarea","furnishingstatus"]

# Fit once on all categorical columns (rather than refitting inside a loop) so the fitted
# encoder keeps the categories of every column and can be reused to transform new data.
one_hot_encoder = OneHotEncoder(sparse_output=False)
encoded_values = one_hot_encoder.fit_transform(df[cols_to_convert])
encoded_df = pd.DataFrame(
    encoded_values,
    columns=one_hot_encoder.get_feature_names_out(cols_to_convert),
    index=df.index,  # keep row alignment with df for the concat below
)

# Swap the raw categorical columns for their indicator columns in a single concat.
df = pd.concat([df.drop(columns=cols_to_convert), encoded_df], axis=1)

In [40]:
#4. Separate the feature matrix X from the target y
numeric_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']

# Each yes/no column was one-hot encoded into a "<column>_no" / "<column>_yes" pair.
yes_no_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
yes_no_features = [f"{column}_{answer}" for column in yes_no_columns for answer in ('no', 'yes')]

furnishing_features = [
    'furnishingstatus_furnished',
    'furnishingstatus_semi-furnished',
    'furnishingstatus_unfurnished',
]

feature_columns = numeric_features + yes_no_features + furnishing_features
X = df[feature_columns]

# The sale price is the value to predict.
y = df['price']

In [41]:
#5. Hold out part of the data for testing; the remainder is used for training
test_fraction = 0.10
split_parts = train_test_split(
    X,
    y,
    test_size=test_fraction,
    random_state=SEED,  # fixed seed so the same rows land in each split on every run
)
X_train, X_test, y_train, y_test = split_parts

In [42]:
#6. OPTIONAL - feature engineering could create new features or transform existing ones (not needed here; the data is quite clean)

#7. Scale the features so they all share a similar range.
# Two common options:
#   - MinMaxScaler: rescales each feature to the range [0, 1] based on its min and max.
#   - StandardScaler: z-score scaling (each feature gets zero mean and unit standard deviation).

scaler = MinMaxScaler()

# Learn each feature's min/max from the training split only. Fitting on the full X would
# leak information about the held-out test rows into the preprocessing step.
X_train_scaled = scaler.fit_transform(X_train)

# Apply the training-set scaling to the test split. Values can fall slightly outside [0, 1]
# when a test row lies beyond the range seen during training.
X_test_scaled = scaler.transform(X_test)

# Kept so any later cell that refers to X_scaled still works; it now uses the
# training-set scaling rather than statistics computed over all rows.
X_scaled = scaler.transform(X)