In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
# Load the dataset
data = pd.read_csv("Housing.csv")

# Basic info about the dataset: row/column counts followed by the per-column
# dtype and non-null summary.
print("Number of rows:", data.shape[0])
print("Number of columns:", data.shape[1])
print("\nData Info:")
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() would emit a stray "None" line after the summary.
data.info()


Number of rows: 545
Number of columns: 13

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB
None


In [4]:
# Report how many null entries each column contains.
print(
    "\nMissing Values:\n",
    data.isna().sum(),
)


Missing Values:
 price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


In [5]:
# Impute missing values, if there are any, one column at a time.

# Numeric columns: replace nulls with that column's mean.
for col in data.select_dtypes(include=np.number):
    column_mean = data[col].mean()
    data[col] = data[col].fillna(column_mean)

# Object (text) columns: replace nulls with the most frequent value.
# mode() can return several values on a tie; the first one is used.
for col in data.select_dtypes(include='object'):
    most_frequent = data[col].mode()[0]
    data[col] = data[col].fillna(most_frequent)


In [6]:
# Encode every object (text) column as integer codes using LabelEncoder.
#
# A separate encoder is fitted per column and kept in `label_encoders`, keyed
# by column name, so the code -> original-label mapping (`classes_`) can be
# inspected or reversed later with `inverse_transform`. Refitting one shared
# encoder for every column would keep only the last column's mapping.
#
# NOTE(review): scikit-learn documents LabelEncoder as a target-label encoder;
# for input features OrdinalEncoder / OneHotEncoder may be a better fit —
# confirm before changing, since the saved codes would differ.
label_encoders = {}
le = LabelEncoder()  # keeps `le` bound even when there are no object columns
for col in data.select_dtypes(include='object').columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

print("\nAfter Encoding:")
print(data.head())


After Encoding:
      price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0                0        2         1   
3         1                0                1        3         1   
4         1                0                1        2         0   

   furnishingstatus  
0                 0  
1                 0  
2                 1  
3                 0  
4                 0  


In [7]:
# Standardize the numeric columns with StandardScaler and write the scaled
# values back into `data` in place of the originals.
#
# NOTE(review): by this point the text columns have already been converted to
# integer codes, so this selection appears to cover every column — including
# `price` and the encoded yes/no flags. Confirm that scaling those is intended.
numeric_cols = data.select_dtypes(include=np.number).columns
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[numeric_cols])
data[numeric_cols] = scaled_values

print("\nAfter Scaling:")
print(data.head())


After Scaling:
      price      area  bedrooms  bathrooms   stories  mainroad  guestroom  \
0  4.566365  1.046726  1.403419   1.421812  1.378217  0.405623  -0.465315   
1  4.004484  1.757010  1.403419   5.405809  2.532024  0.405623  -0.465315   
2  4.004484  2.218232  0.047278   1.421812  0.224410  0.405623  -0.465315   
3  3.985755  1.083624  1.403419   1.421812  0.224410  0.405623  -0.465315   
4  3.554979  1.046726  1.403419  -0.570187  0.224410  0.405623   2.149083   

   basement  hotwaterheating  airconditioning   parking  prefarea  \
0 -0.734539        -0.219265         1.472618  1.517692  1.804941   
1 -0.734539        -0.219265         1.472618  2.679409 -0.554035   
2  1.361397        -0.219265        -0.679063  1.517692  1.804941   
3  1.361397        -0.219265         1.472618  2.679409  1.804941   
4  1.361397        -0.219265         1.472618  1.517692 -0.554035   

   furnishingstatus  
0         -1.406286  
1         -1.406286  
2         -0.091662  
3         -1.406286  
4         -1.406286  

In [8]:
# Write the processed frame to a CSV file in the working directory, leaving
# out the row index, then print a confirmation message.
data.to_csv(path_or_buf="Cleaned_Housing.csv", index=False)
print("\n Cleaned dataset saved as 'Cleaned_Housing.csv'")


 Cleaned dataset saved as 'Cleaned_Housing.csv'
