In [17]:
# import
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
#from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import re

In [18]:
# Load the prepared dataset; the "ID" column becomes the index.
# The relative path uses forward slashes so it resolves on Windows, macOS and Linux
# (a backslash is an ordinary filename character on POSIX systems, not a separator).
df = pd.read_csv("../data/04_ml_prep_data_drop_duplicate.csv", index_col="ID")
df.shape

(16292, 23)

In [19]:
# Property type as a binary flag: 1 when "type" equals "HOUSE", 0 for any other value.
def _house_flag(kind):
    """Return 1 for the "HOUSE" property type and 0 for anything else."""
    return 1 if kind == "HOUSE" else 0


df["typeNum"] = df["type"].map(_house_flag)

# Row count per flag value, displayed as the cell output.
rows_by_type = df.groupby(["typeNum"])["typeNum"]
rows_by_type.count()

typeNum
0    8280
1    8012
Name: typeNum, dtype: int64

In [20]:
# Region -> dummy (one-hot) columns stored as floats, named "region_<value>".
region_column = df[["region"]]
region_dummies = pd.get_dummies(
    region_column,
    prefix="region",
    prefix_sep="_",
    dtype=float,
)
region_dummies.shape

(16292, 3)

In [21]:
# Province -> dummy (one-hot) columns stored as floats, named "province_<value>".
province_column = df[["province"]]
province_dummies = pd.get_dummies(
    province_column,
    prefix="province",
    prefix_sep="_",
    dtype=float,
)
province_dummies.shape

(16292, 11)

In [22]:
# District -> dummy (one-hot) columns stored as floats, named "district_<value>".
district_column = df[["district"]]
district_dummies = pd.get_dummies(
    district_column,
    prefix="district",
    prefix_sep="_",
    dtype=float,
)
district_dummies.shape

(16292, 43)

In [23]:
# postal codes
# Disabled experiment: dummy columns built directly from the "postalCode" values.
# The code is wrapped in a triple-quoted string, so it is NOT executed; the cell only
# evaluates the string literal, and the notebook echoes it back as the cell output.
"""postalCode_dummies = pd.get_dummies(
    df[["postalCode"]].astype(str), 
    prefix="pCode", prefix_sep="_", dtype=float
    )
postalCode_dummies.shape"""

'postalCode_dummies = pd.get_dummies(\n    df[["postalCode"]].astype(str), \n    prefix="pCode", prefix_sep="_", dtype=float\n    )\npostalCode_dummies.shape'

In [24]:
# Coarse postal-code grouping: the first two characters of the code followed by "00",
# then one dummy column per group (prefix "pCode").
def _postal_code_2dg(code):
    """Return the first two characters of ``code`` (as text) followed by "00"."""
    return str(code)[0:2] + "00"


df["postalCode2dg"] = df["postalCode"].map(_postal_code_2dg)
postalCode2dg_dummies = pd.get_dummies(
    df[["postalCode2dg"]],
    prefix="pCode",
    prefix_sep="_",
    dtype=float,
)
postalCode2dg_dummies.shape


(16292, 80)

In [25]:
# Finer postal-code grouping: a code whose first three characters are all digits 1-9
# maps to "<first two characters>10"; every other code maps to "<first two characters>00".
# NOTE(review): the suffix is always "10" - the third digit itself is not kept. Confirm intended.
# NOTE(review): these dummies use the same "pCode" prefix as postalCode2dg_dummies,
# so column names would overlap if both frames were merged into df.
_THREE_NONZERO_DIGITS = re.compile(r'^[1-9]{3}')


def _postal_code_3dg(code):
    """Return the two-character prefix of ``code`` plus "10" or "00" (see rule above)."""
    text = str(code)
    if _THREE_NONZERO_DIGITS.match(text):
        return text[0:2] + "10"
    return text[0:2] + "00"


df["postalCode3dg"] = df["postalCode"].map(_postal_code_3dg)
postalCode3dg_dummies = pd.get_dummies(
    df[["postalCode3dg"]],
    prefix="pCode",
    prefix_sep="_",
    dtype=float,
)
postalCode3dg_dummies.shape


(16292, 145)

In [26]:
# Locality type -> dummy (one-hot) columns stored as floats, named "lType_<value>".
# The values are cast to str before encoding.
locality_as_text = df[["localityType"]].astype(str)
localityType_dummies = pd.get_dummies(
    locality_as_text,
    prefix="lType",
    prefix_sep="_",
    dtype=float,
)
localityType_dummies.shape

(16292, 4)

In [27]:
# Ordinal encoding of the building condition.
# A single lookup table replaces the per-value .loc assignments, and the result is a
# numeric (nullable integer) column instead of an object column seeded with None.
# Labels that are missing from the table (or missing values) become <NA>.
# NOTE(review): "UNKNOWN" is coded 0, i.e. it sits just below "AS_NEW" on this scale -
# confirm that ordering is intended for the model.
condition_codes = {
    "AS_NEW": 1,
    "JUST_RENOVATED": 2,
    "GOOD": 3,
    "TO_BE_DONE_UP": 4,
    "TO_RENOVATE": 5,
    "TO_RESTORE": 6,
    "UNKNOWN": 0,
}
df["conditionNum"] = df["condition"].map(condition_codes).astype("Int64")

In [28]:
# Ordinal encoding of the EPC score ("A" = 1 ... "G" = 7, "UNKNOWN" = 0).
# A single lookup table replaces the per-value .loc assignments, and the result is a
# numeric (nullable integer) column instead of an object column seeded with None.
# Labels that are missing from the table (or missing values) become <NA>.
# NOTE(review): "UNKNOWN" is coded 0, i.e. it sits just below "A" on this scale -
# confirm that ordering is intended for the model.
epc_score_codes = {
    "A": 1,
    "B": 2,
    "C": 3,
    "D": 4,
    "E": 5,
    "F": 6,
    "G": 7,
    "UNKNOWN": 0,
}
df["epcScoreNum"] = df["epcScore"].map(epc_score_codes).astype("Int64")

# Row count per encoded score, displayed as the cell output.
df.groupby("epcScoreNum")["type"].count()

epcScoreNum
0    3067
1    1261
2    3149
3    2653
4    2124
5    1464
6    1647
7     927
Name: type, dtype: int64

In [29]:
# Feature scaling (not sampling): min-max scale the surface and room-count features.
# Each scaled result is stored in a new column named "sc_<original column>".
# NOTE(review): the scaler is fitted on every row of df; if a train/test split happens
# later, the test rows have influenced the scaling - confirm this is acceptable.
to_scale = [
    "netHabitableSurface",
    "bedroomCount",
    "bathroomCount",
    "showerRoomCount",
    "toiletCount",
    "gardenSurface",
    "terraceSurface",
    "land",
]
# Output column names, in the same order as the inputs.
scaled = ["sc_" + column for column in to_scale]

minMaxScaler = MinMaxScaler()
df[scaled] = minMaxScaler.fit_transform(df[to_scale])

In [30]:
# Columns kept for the output dataset, listed in their final order.
subset = (
    # num or converted to num
    ["typeNum"]
    # scaled
    + [
        "sc_netHabitableSurface",
        "sc_bedroomCount",
        "sc_bathroomCount",
        "sc_showerRoomCount",
        "sc_toiletCount",
    ]
    # num or converted to num
    + ["conditionNum", "epcScoreNum"]
    # boolean (as num)
    + [
        "hasLift",
        "fireplaceExists",
        "hasSwimmingPool",
        "hasAirConditioning",
        "hasGarden",
        "hasTerrace",
    ]
    # scaled
    + ["sc_gardenSurface", "sc_terraceSurface", "sc_land"]
    # as is
    + ["price"]
)

# Every column not listed above is dropped from df at this point.
df = df[subset]

In [31]:
# Attach the dummy columns to df by merging each dummy frame on "ID", one after another.
# The postal-code dummies (postalCode2dg_dummies / postalCode3dg_dummies) are not merged;
# their merge statements were disabled.
frames_to_merge = (
    region_dummies,
    province_dummies,
    district_dummies,
    localityType_dummies,
)
for dummy_frame in frames_to_merge:
    df = df.merge(dummy_frame, on="ID")

In [32]:
# Save the assembled dataset as CSV, including the index (index=True).
# The relative path uses forward slashes so it resolves on Windows, macOS and Linux;
# with backslashes, POSIX systems would create a file literally named
# "..\data\05_ml_data.csv" in the working directory instead of writing into ../data.
path_to_save = "../data/05_ml_data.csv"
df.to_csv(path_to_save, index=True)