# Scraped data Feature Engineering
This section of code will look into how we can transform the features of the scraped data

### Import all necessary libraries

In [117]:
import pandas as pd
import os
import numpy as np

### Data Processing
- Import data into a pandas dataframe
- Take a look at the data


In [118]:
# Import the csv file as a pandas dataframe
path = os.getcwd()
scraped_data_path = os.path.join(path, "scraped_data.csv")
df = pd.read_csv(scraped_data_path)
print(df.shape)

(28813, 14)


In [119]:
df.head(5)

Unnamed: 0,Address,Property Name,Property Type,Bedrooms,Bathrooms,Asking Price,Size,PSF,Age,Tenure,No. of Units,District,Amenities,Link
0,2 Dunman Road (439188),Grand Dunman,Apartment,4+1,3,"$3,764,000","1,679 sqft","$2,242 psf",Not Available,LEASEHOLD/99 years,1008,D15 - East Coast / Marine Parade,"{'Primary Schools': [{'Kong Hwa': '0.41 km'}, ...",https://www.srx.com.sg/listings/101051671/for-...
1,Shelford Road,Shelford View,Condominium,5,4,"$10,000,000","5,134 sqft","$1,948 psf",41,FREEHOLD,20,D11 - Newton / Novena,"{'Primary Schools': [{""Raffles Girls' Primary""...",https://www.srx.com.sg/listings/101439641/for-...
2,Hillview Rise,Hillhaven,Apartment,2,2,"$1,387,352",678 sqft,"$2,046 psf",Not Available,LEASEHOLD/99 years,341,D23 - Dairy Farm / Bukit Panjang / Choa Chu Kang,{'Primary Schools': [{'Chij Our Lady Queen Of ...,https://www.srx.com.sg/listings/101369031/for-...
3,60H Kent Ridge Hill Residences (117321),Kent Ridge Hill Residences,Apartment,1,1,"$1,030,000 (Negotiable)",474 sqft,"$2,173 psf",1,LEASEHOLD/99 years,548,D5 - Buona Vista / West Coast / Clementi New Town,"{'Primary Schools': [], 'Secondary Schools': [...",https://www.srx.com.sg/listings/100989811/for-...
4,Marina Way,Marina One Residences,Apartment,1,1,"$1,630,000",700 sqft,"$2,329 psf",7,LEASEHOLD/99 years,1042,D1 - Boat Quay / Raffles Place / Marina,{'Primary Schools': [{'Cantonment Primary': '1...,https://www.srx.com.sg/listings/95119111/for-s...


### Conducting Feature Selection
Removing the Property Name and Link Column as it does not provide any useful information

In [126]:
# The Link column does not look useful as a feature
df.drop(['Link', 'Property Name'], axis=1)

Unnamed: 0,Address,Property Type,Bedrooms,Bathrooms,Asking Price,Size,PSF,Age,Tenure,No. of Units,District,Amenities
0,2 Dunman Road (439188),0,4+1,3,"$3,764,000","1,679 sqft","$2,242 psf",8,LEASEHOLD/99 years,1008,D15 - East Coast / Marine Parade,"{'Primary Schools': [{'Kong Hwa': '0.41 km'}, ..."
1,Shelford Road,1,5,4,"$10,000,000","5,134 sqft","$1,948 psf",41,FREEHOLD,20,D11 - Newton / Novena,"{'Primary Schools': [{""Raffles Girls' Primary""..."
2,Hillview Rise,0,2,2,"$1,387,352",678 sqft,"$2,046 psf",8,LEASEHOLD/99 years,341,D23 - Dairy Farm / Bukit Panjang / Choa Chu Kang,{'Primary Schools': [{'Chij Our Lady Queen Of ...
3,60H Kent Ridge Hill Residences (117321),0,1,1,"$1,030,000 (Negotiable)",474 sqft,"$2,173 psf",1,LEASEHOLD/99 years,548,D5 - Buona Vista / West Coast / Clementi New Town,"{'Primary Schools': [], 'Secondary Schools': [..."
4,Marina Way,0,1,1,"$1,630,000",700 sqft,"$2,329 psf",7,LEASEHOLD/99 years,1042,D1 - Boat Quay / Raffles Place / Marina,{'Primary Schools': [{'Cantonment Primary': '1...
...,...,...,...,...,...,...,...,...,...,...,...,...
28808,Meyer Mansion,0,3,2,"$3,580,000","1,109 sqft","$3,228 psf",8,FREEHOLD,200,D15 - East Coast / Marine Parade,{'Primary Schools': [{'Tanjong Katong Primary'...
28809,Jalan Tembusu,1,2+1,2,"$2,102,000",883 sqft,"$2,381 psf",8,LEASEHOLD/99 years,638,D15 - East Coast / Marine Parade,{'Primary Schools': [{'Tanjong Katong Primary'...
28810,The Oceanfront @ Sentosa Cove,1,4,5,"$7,980,000","3,300 sqft","$2,418 psf",14,LEASEHOLD/99 years,264,D4 - Sentosa / Harbourfront,"{'Primary Schools': [], 'Secondary Schools': [..."
28811,Thiam Siew Avenue,0,3,3,"$3,473,000","1,302 sqft","$2,667 psf",8,FREEHOLD,816,D15 - East Coast / Marine Parade,"{'Primary Schools': [{'Kong Hwa': '0.7 km'}, {..."


### Data Cleaning
- We shall encode the Property Types as int
- We shall perform imputation on the age column
- We shall drop any rows with NaN values

In [121]:
df['Property Type'] = df['Property Type'].replace({"Apartment": 0, "Condominium": 1})
df.replace("Not Available", np.nan, inplace=True)
df.head(5)

Unnamed: 0,Address,Property Name,Property Type,Bedrooms,Bathrooms,Asking Price,Size,PSF,Age,Tenure,No. of Units,District,Amenities,Link
0,2 Dunman Road (439188),Grand Dunman,0.0,4+1,3,"$3,764,000","1,679 sqft","$2,242 psf",,LEASEHOLD/99 years,1008,D15 - East Coast / Marine Parade,"{'Primary Schools': [{'Kong Hwa': '0.41 km'}, ...",https://www.srx.com.sg/listings/101051671/for-...
1,Shelford Road,Shelford View,1.0,5,4,"$10,000,000","5,134 sqft","$1,948 psf",41.0,FREEHOLD,20,D11 - Newton / Novena,"{'Primary Schools': [{""Raffles Girls' Primary""...",https://www.srx.com.sg/listings/101439641/for-...
2,Hillview Rise,Hillhaven,0.0,2,2,"$1,387,352",678 sqft,"$2,046 psf",,LEASEHOLD/99 years,341,D23 - Dairy Farm / Bukit Panjang / Choa Chu Kang,{'Primary Schools': [{'Chij Our Lady Queen Of ...,https://www.srx.com.sg/listings/101369031/for-...
3,60H Kent Ridge Hill Residences (117321),Kent Ridge Hill Residences,0.0,1,1,"$1,030,000 (Negotiable)",474 sqft,"$2,173 psf",1.0,LEASEHOLD/99 years,548,D5 - Buona Vista / West Coast / Clementi New Town,"{'Primary Schools': [], 'Secondary Schools': [...",https://www.srx.com.sg/listings/100989811/for-...
4,Marina Way,Marina One Residences,0.0,1,1,"$1,630,000",700 sqft,"$2,329 psf",7.0,LEASEHOLD/99 years,1042,D1 - Boat Quay / Raffles Place / Marina,{'Primary Schools': [{'Cantonment Primary': '1...,https://www.srx.com.sg/listings/95119111/for-s...


In [122]:
# Checking nan values
# Check for NaN values in each column
nan_counts_per_column = df.isna().sum()
print("NaN counts per column:")
print(nan_counts_per_column)

NaN counts per column:
Address             94
Property Name       94
Property Type       94
Bedrooms           195
Bathrooms          967
Asking Price       218
Size                96
PSF               2967
Age              11706
Tenure             239
No. of Units       826
District            94
Amenities            0
Link                 0
dtype: int64


In [123]:
# Handling nan values
# Imputation of NaN age values with the median age
median_age = df['Age'].median()
df['Age'].fillna(median_age, inplace=True)

# Removing all rows with any NaN values
df = df.dropna()
print(df.shape)

(24528, 14)


In [125]:
df['Bathrooms'] = df['Bathrooms'].astype(int)
df['Age'] = df['Age'].astype(int)
df['No. of Units'] = df['No. of Units'].astype(int)
df['Property Type'] = df['Property Type'].astype(int)
print(df.dtypes)

Address          object
Property Name    object
Property Type     int64
Bedrooms         object
Bathrooms         int64
Asking Price     object
Size             object
PSF              object
Age               int64
Tenure           object
No. of Units      int64
District         object
Amenities        object
Link             object
dtype: object
