In [1]:
import pandas as pd

### Load Data and Perform General Exploration

In [2]:
# load data from csv
listings = pd.read_csv("iproperty_listing.csv.tar.gz", compression='gzip', low_memory=False)

In [3]:
# perform general exploration
listings.head()

Unnamed: 0,iproperty_listing.csv,kind,title,subtitle,tier,propertyType,color,transacted,price_type,price_currency,...,attr_transportDescription,attr_governmentWebsite,attr_minimumStay,attr_architectName,attr_contractorName,attr_projectType,attr_budgetRange,poi_education,poi_healthcare,poi_transportation
0,sale-8526600,property,"Taman Sri Pulai, Jempol",,1.0,1-sty Terrace/Link House,,,sale,MYR,...,,,,,,,,0.0,0.0,0.0
1,sale-8726821,property,"Pekan Mahsan, Jempol",,1.0,Residential Land,,,sale,MYR,...,,,,,,,,0.0,0.0,0.0
2,sale-8114392,property,"The Light Point, Gelugor",,1.0,Condominium,,,sale,MYR,...,,,,,,,,12.0,1.0,0.0
3,sale-8081290,property,"The Light Collection II, Gelugor",,1.0,Condominium,,,sale,MYR,...,,,,,,,,16.0,2.0,0.0
4,sale-7691638,property,"The Light Point, Gelugor",,1.0,Condominium,,,sale,MYR,...,,,,,,,,12.0,1.0,0.0


In [4]:
listings.isnull().sum().sort_values(ascending = False)

transacted                             68769
price_minPricePerSizeUnitByLandArea    68769
attr_buildYear                         68769
attr_developerName                     68769
attr_hasImage360                       68769
                                       ...  
state                                      1
city                                       1
poi_healthcare                             1
poi_education                              1
iproperty_listing.csv                      1
Length: 102, dtype: int64

In [5]:
listings.shape

(68769, 102)

### Drop Unuseful Columns

In [6]:
# drop all columns without any data
listings.dropna(how='all', axis=1, inplace=True)

In [7]:
listings.shape

(68769, 53)

In [8]:
listings.isnull().sum().sort_values(ascending = False)

price_maxPricePerSizeUnitByBuiltUp           68742
attr_completionStatus                        68738
price_minPricePerSizeUnitByBuiltUp           68723
attr_promotion                               68709
attr_projectLandEncumbrance                  68700
attr_projectApprovalAuthorityBuildingPlan    68685
attr_projectBuildingReferenceNo              68685
attr_projectAdvertisingPermitValidity        68685
attr_projectLicenseValidity                  68684
attr_projectAdvertisingPermit                68680
attr_highlight                               68646
attr_completionDate                          68603
attr_projectStage                            68586
color                                        68586
attr_bumiDiscount                            68586
attr_totalUnits                              68586
attr_maintenanceFee                          67267
attr_auctionDate                             66089
attr_maximumPricePerSizeUnit                 62780
attr_pricePSF                  

In [9]:
# drop columns which majority of rows contain no data
listings.drop(['price_maxPricePerSizeUnitByBuiltUp', 'attr_completionStatus',
       'price_minPricePerSizeUnitByBuiltUp', 'attr_promotion',
       'attr_projectLandEncumbrance',
       'attr_projectApprovalAuthorityBuildingPlan',
       'attr_projectBuildingReferenceNo',
       'attr_projectAdvertisingPermitValidity', 'attr_projectLicenseValidity',
       'attr_projectAdvertisingPermit', 'attr_highlight',
       'attr_completionDate', 'attr_projectStage', 'color',
       'attr_bumiDiscount', 'attr_totalUnits', 'attr_maintenanceFee',
       'attr_auctionDate', 'attr_maximumPricePerSizeUnit', 'attr_pricePSF',
       'attr_minimumPricePerSizeUnit'], axis=1, inplace=True)

In [10]:
listings.shape

(68769, 32)

### Fill Missing Categorial Data

In [11]:
# list columns and their null value count
listings.isnull().sum().sort_values(ascending = False)

attr_facingDirection     46812
township                 39130
attr_occupancy           33694
attr_landArea            32008
attr_unitType            30853
attr_carPark             26308
attr_titleType           23054
attr_landTitleType       21577
longitude                20495
latitude                 20494
attr_furnishing          17412
attr_builtUp             11033
attr_tenure              10117
attr_bathroom             4135
attr_bedroom              3860
attr_sizeUnit              185
attr_downloadUrl           184
price_min                    1
title                        1
tier                         1
propertyType                 1
kind                         1
price_type                   1
price_currency               1
poi_transportation           1
price_max                    1
formatedAddress              1
state                        1
city                         1
poi_healthcare               1
poi_education                1
iproperty_listing.csv        1
dtype: i

In [12]:
listings['attr_facingDirection'].value_counts()

Unknown      7674
North        3672
South        3458
East         1622
NorthEast    1561
SouthEast    1323
SouthWest    1157
NorthWest     912
West          578
Name: attr_facingDirection, dtype: int64

In [13]:
listings['attr_facingDirection'].fillna(value='Unknown', inplace=True)

In [14]:
listings['attr_occupancy'].value_counts()

Vacant            23185
Tenanted           5161
Owner Occupied     4406
Unknown            2323
Name: attr_occupancy, dtype: int64

In [15]:
listings['attr_occupancy'].fillna(value='Unknown', inplace=True)

In [16]:
listings['attr_unitType'].value_counts()

Intermediate    24625
Corner          10207
EndLot           2081
Duplex            412
Penthouse         286
Studio            197
SOHO               64
Triplex            44
Name: attr_unitType, dtype: int64

In [17]:
listings['attr_unitType'].fillna(value='Unknown', inplace=True)

In [18]:
listings['attr_titleType'].value_counts()

Individual    25961
Strata        18463
Unknown        1291
Name: attr_titleType, dtype: int64

In [19]:
listings['attr_titleType'].fillna(value='Unknown', inplace=True)

In [20]:
listings['attr_landTitleType'].value_counts()

Residential    43954
Commercial      2556
Unknown          471
Industrial       211
Name: attr_landTitleType, dtype: int64

In [21]:
listings['attr_landTitleType'].fillna(value='Unknown', inplace=True)

In [22]:
listings['attr_furnishing'].value_counts()

Partly Furnished    24380
Fully Furnished     13263
Unfurnished         12752
Unknown               962
Name: attr_furnishing, dtype: int64

In [23]:
listings['attr_furnishing'].fillna(value='Unknown', inplace=True)

In [24]:
listings['attr_tenure'].value_counts()

Freehold               42844
Leasehold              14985
Malay Reserved Land      474
Lease Tenancy            189
Unknown                  160
Name: attr_tenure, dtype: int64

In [25]:
listings['attr_tenure'].fillna(value='Unknown', inplace=True)

In [26]:
listings['attr_bedroom'].value_counts()

3        17241
4        14863
4+1       5484
3+1       5291
5         4375
         ...  
5 - 7        1
1 - 7        1
14+          1
4 - 7        1
13+1         1
Name: attr_bedroom, Length: 65, dtype: int64

In [27]:
listings['attr_sizeUnit'].value_counts()

SQUARE_FEET      67999
ACRES              323
SQUARE_METERS      251
HECTARE             11
Name: attr_sizeUnit, dtype: int64

### Clean Data With Inconsistent Format

In [28]:
listings['attr_landArea'].value_counts()

22x75      1162
20x70      1025
22x70       766
1400        744
1,400       500
           ... 
6652          1
56'X85'       1
36'x80'       1
8,746         1
3891          1
Name: attr_landArea, Length: 8945, dtype: int64

In [29]:
# mixed of categorial & numerical data, need to study on the data first before cleaning

In [30]:
listings['attr_builtUp'].value_counts()

2,000             1046
1,400              879
1,800              825
1,600              733
3,000              714
                  ... 
6,862                1
From 611 - 983       1
5,307                1
681                  1
5,587                1
Name: attr_builtUp, Length: 4926, dtype: int64

In [31]:
# mixed of categorial & numerical data, need to study on the data first before cleaning

In [32]:
listings['attr_carPark'].value_counts()

2        20806
1        11414
3         4058
4         3614
6         1104
5          726
8          209
10         206
7           82
0           52
20          33
30          25
12          19
1 - 2       18
2 - 3       17
9           16
15          13
1 - 3        9
2 - 4        9
11           6
14           4
13           3
17           3
16           2
1 - 4        2
18           2
2 - 6        2
28           1
3 - 4        1
1 - 5        1
3 - 6        1
4 - 6        1
0 - 2        1
23           1
Name: attr_carPark, dtype: int64

In [33]:
# mixed of categorial & numerical data, need to study on the data first before cleaning

In [34]:
listings['attr_bathroom'].value_counts()

2        22860
3        16764
4         8308
5         6054
1         4229
6         3661
7         1681
8          581
9          206
10          87
12          25
20          23
11          21
0           17
2 - 3       14
1 - 2       14
13          13
15          12
1 - 3        8
14           8
5 - 6        6
4 - 5        6
3 - 4        6
16           3
1 - 5        3
17           3
1 - 4        3
3 - 6        2
6 - 7        2
0 - 3        2
4 - 6        2
2 - 4        2
2 - 5        2
2 - 9        1
7 - 8        1
6 - 8        1
2 - 6        1
5 - 7        1
0 - 4        1
Name: attr_bathroom, dtype: int64

In [35]:
# listings['attr_bathroom'].fillna(value='Unknown', inplace=True)
# mixed of categorial & numerical data, need to study on the data first before cleaning

In [36]:
listings['attr_bedroom'].value_counts()

3        17241
4        14863
4+1       5484
3+1       5291
5         4375
         ...  
5 - 7        1
1 - 7        1
14+          1
4 - 7        1
13+1         1
Name: attr_bedroom, Length: 65, dtype: int64

In [37]:
# listings['attr_bedroom'].fillna(value='Unknown', inplace=True)
# mixed of categorial & numerical data, need to study on the data first before cleaning

### Drop Rows With Missing important Data

In [38]:
# drop rows with no size unit
listings.dropna(subset=['attr_sizeUnit'], inplace=True)

In [39]:
# drop rows with no data in both attr_landArea and attr_builtUp
listings.dropna(subset=['attr_landArea', 'attr_builtUp'], how='all', inplace=True)

In [40]:
# drop rows without latitude, longitude and township 
listings.dropna(subset=['latitude', 'longitude', 'township'], how='all', inplace=True)

In [41]:
listings.isnull().sum().sort_values(ascending = False)

attr_landArea            28301
township                 21129
attr_carPark             16622
attr_builtUp              5264
longitude                 2708
latitude                  2707
attr_bathroom             2230
attr_bedroom              2027
price_min                    0
state                        0
formatedAddress              0
price_currency               0
price_type                   0
propertyType                 0
tier                         0
title                        0
kind                         0
price_max                    0
poi_transportation           0
city                         0
poi_healthcare               0
attr_furnishing              0
attr_landTitleType           0
attr_tenure                  0
attr_facingDirection         0
attr_unitType                0
attr_occupancy               0
attr_titleType               0
attr_sizeUnit                0
attr_downloadUrl             0
poi_education                0
iproperty_listing.csv        0
dtype: i

In [42]:
listings.shape

(49947, 32)