<a href="https://colab.research.google.com/github/vvrgit/AI/blob/main/DataProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Import required libraries

In [2]:
import numpy as np
import math as mt
import random
import pandas as pd

# Load Data

In [3]:
housing = pd.read_csv("/content/drive/MyDrive/AI Data/Housing.csv")
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


# Checking for null values

In [4]:
print(housing.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB
None


In [5]:
print(housing.describe())

              price          area    bedrooms   bathrooms     stories  \
count  5.450000e+02    545.000000  545.000000  545.000000  545.000000   
mean   4.766729e+06   5150.541284    2.965138    1.286239    1.805505   
std    1.870440e+06   2170.141023    0.738064    0.502470    0.867492   
min    1.750000e+06   1650.000000    1.000000    1.000000    1.000000   
25%    3.430000e+06   3600.000000    2.000000    1.000000    1.000000   
50%    4.340000e+06   4600.000000    3.000000    1.000000    2.000000   
75%    5.740000e+06   6360.000000    3.000000    2.000000    2.000000   
max    1.330000e+07  16200.000000    6.000000    4.000000    4.000000   

          parking  
count  545.000000  
mean     0.693578  
std      0.861586  
min      0.000000  
25%      0.000000  
50%      0.000000  
75%      1.000000  
max      3.000000  


#Binary Conversion

In [6]:
# Converting the categorical variable into numerical
varlist =  ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']

# Defining the map function
def binary_map(x):
    return x.map({'yes': 1, "no": 0})

# Applying the function to the housing list
housing[varlist] = housing[varlist].apply(binary_map)

# Check the housing dataframe now
housing

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,1,0,1,0,0,2,0,unfurnished
541,1767150,2400,3,1,1,0,0,0,0,0,0,0,semi-furnished
542,1750000,3620,2,1,1,1,0,0,0,0,0,0,unfurnished
543,1750000,2910,3,1,1,0,0,0,0,0,0,0,furnished


#Categorical Conversion

In [7]:
# Dropping the first column from status dataset
status = pd.get_dummies(housing['furnishingstatus'], drop_first = True)

# Adding the status to the original housing dataframe
housing = pd.concat([housing, status], axis = 1)

# Dropping 'furnishingstatus' as we have created the dummies for it
housing.drop(['furnishingstatus'], axis = 1, inplace = True)

housing

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,1,0,1,0,0,2,0,0,1
541,1767150,2400,3,1,1,0,0,0,0,0,0,0,1,0
542,1750000,3620,2,1,1,1,0,0,0,0,0,0,0,1
543,1750000,2910,3,1,1,0,0,0,0,0,0,0,0,0


#Outliers Detection

In [8]:
def detect_outliers_zscore(data):
    outliers = []
    thres = 3
    mean = np.mean(data)
    std = np.std(data)
    #print(mean, std)
    for i in data:
        #print(i)
        z_score = (i-mean)/std
        if (np.abs(z_score) > thres):
            outliers.append(i)
    return outliers# Driver code

In [9]:
sample_outliers_price = detect_outliers_zscore(housing['price'])
sample_outliers_area = detect_outliers_zscore(housing['area'])
sample_outliers_bedrooms = detect_outliers_zscore(housing['bedrooms'])
print("Outliers in price column from Z-scores method: ", sample_outliers_price)
print("Outliers in area column from Z-scores method: ", sample_outliers_area)
print("Outliers in bedrooms column from Z-scores method: ", sample_outliers_bedrooms)

Outliers in price column from Z-scores method:  [13300000, 12250000, 12250000, 12215000, 11410000, 10850000]
Outliers in area column from Z-scores method:  [16200, 13200, 13200, 12090, 15600, 12900, 12944]
Outliers in bedrooms column from Z-scores method:  [6, 6]


In [10]:
from scipy.stats import zscore
z1 = np.abs(zscore(housing['price']))
z2 = np.abs(zscore(housing['area']))
z3 = np.abs(zscore(housing['bedrooms']))
housing['Zscore_price'] = z1
housing['Zscore_area'] = z2
housing['Zscore_bedrooms'] = z3
outliers_price=len(housing[housing['Zscore_price']>3])
outliers_area=len(housing[housing['Zscore_area']>3])
outliers_bed=len(housing[housing['Zscore_bedrooms']>3])
data_z = housing[housing['Zscore_price']<3]
data_z = housing[housing['Zscore_area']<3]
data_z = housing[housing['Zscore_bedrooms']<3]
housing=data_z.drop(['Zscore_price', 'Zscore_area', 'Zscore_bedrooms'], axis=1)
housing

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,1,0,1,0,0,2,0,0,1
541,1767150,2400,3,1,1,0,0,0,0,0,0,0,1,0
542,1750000,3620,2,1,1,1,0,0,0,0,0,0,0,1
543,1750000,2910,3,1,1,0,0,0,0,0,0,0,0,0


#Missing Values Treatment

In [11]:
print(housing.isnull().sum())

price              0
area               0
bedrooms           0
bathrooms          0
stories            0
mainroad           0
guestroom          0
basement           0
hotwaterheating    0
airconditioning    0
parking            0
prefarea           0
semi-furnished     0
unfurnished        0
dtype: int64


In [12]:
m1=housing['price'].mean()
m2=housing['area'].mean()
m3=housing['bedrooms'].mean()
housing['price'].fillna(value=m1, inplace=True)
housing['area'].fillna(value=m2, inplace=True)
housing['bedrooms'].fillna(value=m3, inplace=True)
housing

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,1,0,1,0,0,2,0,0,1
541,1767150,2400,3,1,1,0,0,0,0,0,0,0,1,0
542,1750000,3620,2,1,1,1,0,0,0,0,0,0,0,1
543,1750000,2910,3,1,1,0,0,0,0,0,0,0,0,0


# Data Normalization

In [16]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
housing_norm = scaler.fit_transform(housing)
housing_norm

array([[1.        , 0.39656357, 0.75      , ..., 1.        , 0.        ,
        0.        ],
       [0.90909091, 0.5024055 , 0.75      , ..., 0.        , 0.        ,
        0.        ],
       [0.90909091, 0.57113402, 0.5       , ..., 1.        , 1.        ,
        0.        ],
       ...,
       [0.        , 0.13539519, 0.25      , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.08659794, 0.5       , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.15120275, 0.5       , ..., 0.        , 0.        ,
        1.        ]])

#Data Split

In [23]:
from sklearn.model_selection import train_test_split
dataframe = housing.values
X, y = housing_norm[:,1:], housing_norm[:, 0]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 10)
x_train

array([[0.30584192, 0.5       , 0.33333333, ..., 1.        , 0.        ,
        0.        ],
       [0.13608247, 0.5       , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.22749141, 0.75      , 0.        , ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.15945017, 0.5       , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.37869416, 0.5       , 0.        , ..., 1.        , 0.        ,
        1.        ],
       [0.22199313, 0.5       , 0.        , ..., 1.        , 0.        ,
        1.        ]])

# **Thank You**