#### Experiment 1 Data Preparation using Numpy and Pandas 
#### By Tanuj Bordikar

In [1]:
# import the pandas library
import pandas as pd
import numpy as np

In [2]:
# read csv
df = pd.read_csv("./Bengaluru_House_Data.csv") 

In [3]:
# To print no. of samples and attributes
print(df.shape)

(13320, 9)


In [4]:
# getting the columns of the dataset
columns = list(df.columns)
print(columns)

['area_type', 'availability', 'location', 'size', 'society', 'total_sqft', 'bath', 'balcony', 'price']


In [5]:
# To print first five samples
print(df.head()) 

              area_type   availability                  location       size  \
0  Super built-up  Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot  Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up  Area  Ready To Move               Uttarahalli      3 BHK   
3  Super built-up  Area  Ready To Move        Lingadheeranahalli      3 BHK   
4  Super built-up  Area  Ready To Move                  Kothanur      2 BHK   

   society total_sqft  bath  balcony   price  
0  Coomee        1056   2.0      1.0   39.07  
1  Theanmp       2600   5.0      3.0  120.00  
2      NaN       1440   2.0      3.0   62.00  
3  Soiewre       1521   3.0      1.0   95.00  
4      NaN       1200   2.0      1.0   51.00  


In [6]:
#Describing dataset 
print(df.describe())

               bath       balcony         price
count  13247.000000  12711.000000  13320.000000
mean       2.692610      1.584376    112.565627
std        1.341458      0.817263    148.971674
min        1.000000      0.000000      8.000000
25%        2.000000      1.000000     50.000000
50%        2.000000      2.000000     72.000000
75%        3.000000      2.000000    120.000000
max       40.000000      3.000000   3600.000000


In [7]:
#New dataframe 
new_df = df
#new_df.isnull()
#Checking for null values 
print(new_df.isnull().sum()) 
print("Missing values distribution: ")
print(new_df.isnull().mean())
#print(new_df.shape) 
#new_df.duplicated()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64
Missing values distribution: 
area_type       0.000000
availability    0.000000
location        0.000075
size            0.001201
society         0.413063
total_sqft      0.000000
bath            0.005480
balcony         0.045721
price           0.000000
dtype: float64


In [8]:
# #Checking for duplicates
print(new_df.duplicated().any()) 
print(new_df.duplicated())
print(new_df.shape) 

True
0        False
1        False
2        False
3        False
4        False
         ...  
13315    False
13316    False
13317    False
13318    False
13319     True
Length: 13320, dtype: bool
(13320, 9)


In [9]:
print("Column datatypes: ")
print(new_df.dtypes)

Column datatypes: 
area_type        object
availability     object
location         object
size             object
society          object
total_sqft       object
bath            float64
balcony         float64
price           float64
dtype: object


In [10]:
#Finding unique values in "Area Type" 
print(new_df["area_type"].unique()) 

['Super built-up  Area' 'Plot  Area' 'Built-up  Area' 'Carpet  Area']


In [11]:
df['area_type'].value_counts()

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64

In [12]:
#Replacing string values with int
new_df['area_type'].replace({'Super built-up  Area':"0", 'Plot  Area':"1",
                                   'Built-up  Area':"2", 'Carpet  Area':"3"},
                            inplace = True)
print(new_df.head())

  area_type   availability                  location       size  society  \
0         0         19-Dec  Electronic City Phase II      2 BHK  Coomee    
1         1  Ready To Move          Chikka Tirupathi  4 Bedroom  Theanmp   
2         2  Ready To Move               Uttarahalli      3 BHK      NaN   
3         0  Ready To Move        Lingadheeranahalli      3 BHK  Soiewre   
4         0  Ready To Move                  Kothanur      2 BHK      NaN   

  total_sqft  bath  balcony   price  
0       1056   2.0      1.0   39.07  
1       2600   5.0      3.0  120.00  
2       1440   2.0      3.0   62.00  
3       1521   3.0      1.0   95.00  
4       1200   2.0      1.0   51.00  


In [13]:
#Describing data after preprocessing 
print(new_df.shape) 
print(new_df.head())

(13320, 9)
  area_type   availability                  location       size  society  \
0         0         19-Dec  Electronic City Phase II      2 BHK  Coomee    
1         1  Ready To Move          Chikka Tirupathi  4 Bedroom  Theanmp   
2         2  Ready To Move               Uttarahalli      3 BHK      NaN   
3         0  Ready To Move        Lingadheeranahalli      3 BHK  Soiewre   
4         0  Ready To Move                  Kothanur      2 BHK      NaN   

  total_sqft  bath  balcony   price  
0       1056   2.0      1.0   39.07  
1       2600   5.0      3.0  120.00  
2       1440   2.0      3.0   62.00  
3       1521   3.0      1.0   95.00  
4       1200   2.0      1.0   51.00  


In [14]:
#Finding unique values in "Area Type" 
print(new_df["society"].unique())

['Coomee ' 'Theanmp' nan ... 'SJovest' 'ThhtsV ' 'RSntsAp']


In [15]:
x = df["society"].mode()[0]
print(x)

GrrvaGr


In [16]:
df["society"].fillna(x, inplace = True)
print(df.to_string())

      area_type          availability                                            location        size  society         total_sqft  bath  balcony     price
0             0                19-Dec                            Electronic City Phase II       2 BHK  Coomee                1056   2.0      1.0    39.070
1             1         Ready To Move                                    Chikka Tirupathi   4 Bedroom  Theanmp               2600   5.0      3.0   120.000
2             2         Ready To Move                                         Uttarahalli       3 BHK  GrrvaGr               1440   2.0      3.0    62.000
3             0         Ready To Move                                  Lingadheeranahalli       3 BHK  Soiewre               1521   3.0      1.0    95.000
4             0         Ready To Move                                            Kothanur       2 BHK  GrrvaGr               1200   2.0      1.0    51.000
5             0         Ready To Move                                 

In [17]:
df2 = df.drop(['area_type','society','balcony',
               'availability'],axis='columns')
df2.shape

(13320, 5)