## Import packages and libraries

In [195]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Shared min-max scaler instance; it is fitted later, in the normalization cell.
scaler = MinMaxScaler()


## Read data and check for null values

In [196]:
# Load the raw housing data.
df = pd.read_csv('../dataset/House Pricing.csv')

# Report the number of missing values per column, framed by separator lines.
separator = "--------------------"
missing_per_column = df.isnull().sum()
print(separator)
print("Missing values:")
print(missing_per_column)
print(separator)

--------------------
Missing values:
price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64
--------------------


### Convert string columns to numerical codes

In [197]:

# Re-read the raw CSV so this cell always starts from the original string
# columns; mapping already-encoded columns again would turn them into NaN.
df = pd.read_csv('../dataset/House Pricing.csv')

# Explicit encodings for every string column. 'furnishingstatus' is encoded
# as increasing integers: unfurnished < semi-furnished < furnished.
yes_no_codes = {'yes': 1, 'no': 0}
furnishing_codes = {'unfurnished': 0, 'semi-furnished': 1, 'furnished': 2}
column_codes = {
    'mainroad': yes_no_codes,
    'guestroom': yes_no_codes,
    'basement': yes_no_codes,
    'hotwaterheating': yes_no_codes,
    'airconditioning': yes_no_codes,
    'prefarea': yes_no_codes,
    'furnishingstatus': furnishing_codes,
}

for column, codes in column_codes.items():
    encoded = df[column].map(codes)
    # Series.map turns any value that is missing from the dict into NaN.
    # Fail loudly here instead of silently passing NaN on to the scaler and
    # the saved CSV (the null check in the loading cell runs before encoding).
    unmapped = df.loc[encoded.isna() & df[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Unexpected values in column {column!r}: {list(unmapped)}"
        )
    df[column] = encoded

df.head()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,2
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,2
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,2
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,2


### Remove outliers

In [198]:
# #remove outliers
# z_scores = np.abs(stats.zscore(df))
# min_threshold = 2.5
# max_threshold = 3
# #print('Number of rows after removing outliers using 2.5 std dev: {}'.format(df[(z_scores < min_threshold).all(axis=1)].shape[0]))
# #print('Number of rows after removing outliers using 3 std dev: {}'.format(df[(z_scores < max_threshold).all(axis=1)].shape[0]))
# #print('Number of rows that are within std dev 2.5 to 3: {}'.format(abs(df[(z_scores < min_threshold).all(axis=1)].shape[0]- df[(z_scores < max_threshold).all(axis=1)].shape[0])))

# df = df[(z_scores < min_threshold).all(axis=1)]
# print(df.shape[0])

### Remove outliers (Method 2)

In [199]:

# # Calculate the first quartile (Q1) and third quartile (Q3)
# Q1 = df['price'].quantile(0.25)
# Q3 = df['price'].quantile(0.75)

# # Calculate the interquartile range (IQR)
# IQR = Q3 - Q1

# # Identify outliers
# lower_bound = Q1 - 1.5*IQR
# upper_bound = Q3 + 1.5*IQR
# outliers = df[(df['price'] < lower_bound) | (df['price'] > upper_bound)]

# # Remove outliers
# df = df.drop(outliers.index)

### Normalize the numerical values

In [200]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# num_cols = ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']
# df[num_cols] = scaler.fit_transform(df[num_cols])
# df.head()

### Normalize the numerical values (Method 2)

In [201]:
# df['id'] = [i for i in range(len(df))]
# id = df.pop('id')
# df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
# df= pd.concat([df, id.reset_index(drop=True)], axis=1)
# last_col = df.pop(df.columns[-1])
# df.insert(0, last_col.name, last_col)
# df.head()

### Normalize the numerical values (Method 3)

In [202]:
# Min-max scale every column (including the target 'price') with the shared
# `scaler`, then prepend a 0-based integer 'id' column that is left unscaled.
#
# Any existing 'id' column is dropped first so the cell can be re-run: without
# this, a second run would either fail or scale the identifiers themselves.
features = df.drop(columns='id', errors='ignore')
df_scaled = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)

# Use a descriptive name rather than `id`, which would shadow the Python
# builtin for the rest of the kernel session.
row_ids = pd.Series(range(len(df_scaled)), name='id')

# Both pieces carry a fresh 0..n-1 index, so the column-wise concat aligns
# row by row and places 'id' first.
df = pd.concat([row_ids, df_scaled], axis=1)
df.head()

Unnamed: 0,id,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,0,1.0,0.396564,0.6,0.333333,0.666667,1.0,0.0,0.0,0.0,1.0,0.666667,1.0,1.0
1,1,0.909091,0.502405,0.6,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
2,2,0.909091,0.571134,0.4,0.333333,0.333333,1.0,0.0,1.0,0.0,0.0,0.666667,1.0,0.5
3,3,0.906061,0.402062,0.6,0.333333,0.333333,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
4,4,0.836364,0.396564,0.6,0.0,0.333333,1.0,1.0,1.0,0.0,1.0,0.666667,0.0,1.0


In [203]:
# Persist the encoded and scaled frame; index=False keeps the pandas row index
# out of the file (the explicit 'id' column already identifies each row).
cleaned_csv_path = '../dataset/HousePricingCleaned.csv'
df.to_csv(cleaned_csv_path, index=False)