### OVERALL LOOK AT THE DATASET AND SILVER LAYER

In [1]:
import pandas as pd
import numpy as np
import pymongo
import re
from jproperties import Properties
from sqlalchemy import create_engine, text
from sqlalchemy import Numeric
import subprocess


# pandas settings
pd.set_option('max.colwidth', 200)
pd.set_option('display.max_rows', 50)

In [2]:
# loading variables from file (db access)

with open("app.properties", "r+b") as f:
    p = Properties()
    p.load(f, "utf-8")

### MongoDB Acess (Data Source)

In [3]:

client = pymongo.MongoClient()
db = client.get_database('zap_imoveis')
bronze_layer = db.get_collection('bronze')

In [4]:
print(f'Number of data records in the database: {bronze_layer.count_documents({})}')

Number of data records in the database: 21878


### Creating a DataFrame


In [5]:

df = pd.DataFrame(list(bronze_layer.find({})))

assert df[df['data-id'].duplicated()]['data-id'].sum() == 0, 'We have duplicated ids (data-id) in the DB'

In [6]:
df.head(2)

Unnamed: 0,_id,link,data-id,atts,date
0,67a344c52b111f021af8a118,https://www.zapimoveis.com.br/imovel/aluguel-apartamento-2-quartos-com-piscina-saude-zona-sul-sao-paulo-sp-112m2-id-2774950239/,2774950239,"{'ad_type': 'Aluguel', 'sell_price': None, 'rent_price': 'R$ 5.000/mês', 'condo_price': 'R$ 970', 'address': 'Rua Doutor Samuel Porto, 237 - Saúde, São Paulo - SP', 'rstate_type': 'Apartamentos pa...",20250224
1,67a344cc2b111f021af8a119,https://www.zapimoveis.com.br/imovel/aluguel-apartamento-1-quarto-com-piscina-cambuci-zona-sul-sao-paulo-sp-37m2-id-2733414161/,2733414161,"{'ad_type': 'Aluguel', 'sell_price': None, 'rent_price': 'R$ 2.400/mês', 'condo_price': 'R$ 980', 'address': 'Rua Backer, 338 - Cambuci, São Paulo - SP', 'rstate_type': 'Apartamentos para Alugar/'...",20250224


### Getting Only The Attributes  

In [7]:
df_atts =  pd.concat(
        [df[['data-id','link']], pd.json_normalize(df['atts'])],
        axis = 1
)

df_atts.head(2)

Unnamed: 0,data-id,link,ad_type,sell_price,rent_price,condo_price,address,rstate_type,floorSize,numberOfRooms,...,BEAUTY_ROOM,VINYL_FLOOR,DRESS_ROOM,CORRAL,DRYWALL,GOLF_FIELD,SMART_CONDOMINIUM,ECO_CONDOMINIUM,DIVIDERS,CARPET
0,2774950239,https://www.zapimoveis.com.br/imovel/aluguel-apartamento-2-quartos-com-piscina-saude-zona-sul-sao-paulo-sp-112m2-id-2774950239/,Aluguel,,R$ 5.000/mês,R$ 970,"Rua Doutor Samuel Porto, 237 - Saúde, São Paulo - SP",Apartamentos para Alugar/,112 m²,2 quartos,...,,,,,,,,,,
1,2733414161,https://www.zapimoveis.com.br/imovel/aluguel-apartamento-1-quarto-com-piscina-cambuci-zona-sul-sao-paulo-sp-37m2-id-2733414161/,Aluguel,,R$ 2.400/mês,R$ 980,"Rua Backer, 338 - Cambuci, São Paulo - SP",Apartamentos para Alugar/,37 m²,1 quarto,...,,,,,,,,,,


In [8]:
## removing comercial as we are not interested for this analysis;

df_atts = df_atts[~df_atts.rstate_type.apply(lambda x: True if re.search('Comerciais', str(x)) else False)]
print(df_atts.rstate_type.unique())

['Apartamentos para Alugar/' 'Flats para Alugar/' 'Imóveis para Alugar/'
 'Casas para Alugar/' 'Apartamentos à Venda/' 'Coberturas à Venda/'
 'Sobrados à Venda/' 'Casas de Condomínio para Alugar/'
 'Sobrados para Alugar/' 'Studios para Alugar/'
 'Casas de Condomínio à Venda/' 'Coberturas para Alugar/' 'Casas à Venda/'
 'Casas de Vila à Venda/' 'Kitnets para Alugar/' 'Flats à Venda/'
 'Studios à Venda/' 'Casas de Vila para Alugar/' 'Lofts para Alugar/'
 'Imóveis à Venda/' 'Fazendas, Sítios e Chácaras para Alugar/'
 'Fazendas, Sítios e Chácaras à Venda/' 'Lofts à Venda/'
 'Prédios Inteiros para Alugar/']


### Selecting Columns Of Interest

In [9]:
# for _ in df_atts.columns:
#      print(_)

cols_of_interest = [
    'data-id',
    'ad_type', 'rstate_type',
    'rent_price', 'condo_price', 'sell_price',
    'address', 'floorSize', 'numberOfBathroomsTotal',
    'numberOfParkingSpaces',
    'numberOfRooms',
    'numberOfSuites',
    'FURNISHED',
    'AIR_CONDITIONING',
    'HOME_OFFICE',
    'POOL',
    'GYM',
    'SAUNA',
    'LAUNDRY',
    'COWORKING',
    'link'
    ]

# keep only the cols that exist in the dataset; 

cols_of_interest = [x for x in cols_of_interest if x in df_atts.columns]

# change the df
df_atts = df_atts[cols_of_interest]


### % of NaN Values

In [10]:
missing = df_atts.isna().sum()
missing = missing/df_atts.shape[0] * 100
missing.sort_values(ascending=False)

## comments; 
# the null values on the ameneties represents the absence of such elements like GYM or LAUNDRY.
# for others that are mandatory (like having at least sell or rent price and the floor size), its more likely to be an  error that occurred when web scraping. 

COWORKING                 99.277318
SAUNA                     91.030508
HOME_OFFICE               89.822989
GYM                       88.382198
LAUNDRY                   87.526872
FURNISHED                 83.986644
AIR_CONDITIONING          80.107945
POOL                      73.109820
sell_price                67.342085
numberOfSuites            20.006404
numberOfParkingSpaces      6.911220
numberOfRooms              0.365915
rent_price                 0.082331
numberOfBathroomsTotal     0.009148
rstate_type                0.000000
ad_type                    0.000000
data-id                    0.000000
condo_price                0.000000
floorSize                  0.000000
address                    0.000000
link                       0.000000
dtype: float64

In [11]:
df_atts[df_atts.numberOfRooms.isna()]

Unnamed: 0,data-id,ad_type,rstate_type,rent_price,condo_price,sell_price,address,floorSize,numberOfBathroomsTotal,numberOfParkingSpaces,...,numberOfSuites,FURNISHED,AIR_CONDITIONING,HOME_OFFICE,POOL,GYM,SAUNA,LAUNDRY,COWORKING,link
627,2732561794,Aluguel,Casas para Alugar/,R$ 7.100/mês,não informado,,"Rua Amaro Guerra, 781 - Vila São Francisco Zona Sul, São Paulo - SP",180 m²,3 banheiros,8 vagas,...,,,,,,,,,,https://www.zapimoveis.com.br/imovel/aluguel-casa-vila-sao-francisco-zona-sul-sao-paulo-180m2-id-2732561794/
655,2774886476,Aluguel,Casas para Alugar/,R$ 14.000/mês,isento,,"Rua Euclides Pacheco, 350 - Tatuapé, São Paulo - SP",500 m²,7 banheiros,,...,,,,,,,,,,https://www.zapimoveis.com.br/imovel/aluguel-casa-vila-gomes-cardim-sao-paulo-500m2-id-2774886476/
755,2763884729,Aluguel,Casas para Alugar/,R$ 12.500/mês,não informado,,"Rua Doutor José Gustavo Bush, 350 - Panamby, São Paulo - SP",200 m²,6 banheiros,4 vagas,...,,,,,,,,,,https://www.zapimoveis.com.br/imovel/aluguel-casa-paraiso-do-morumbi-sao-paulo-200m2-id-2763884729/
1738,2727319668,Aluguel,Casas para Alugar/,R$ 5.800/mês,não informado,,"Rua Monte Serrat, 941 - Tatuapé, São Paulo - SP",120 m²,2 banheiros,1 vaga,...,,,,,,,,,,https://www.zapimoveis.com.br/imovel/aluguel-casa-com-area-de-servico-tatuape-sao-paulo-120m2-id-2727319668/
2332,2777283759,Venda,Casas à Venda/,R$ 24.000/mês,isento,R$ 4.500.000,"Alameda Joaquim Eugênio de Lima, 1489 - Jardim Paulista, São Paulo - SP",375 m²,4 banheiros,6 vagas,...,,,,,,,,,,https://www.zapimoveis.com.br/imovel/venda-casa-jardim-paulista-sao-paulo-375m2-id-2777283759/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21427,2799890815,Aluguel,Casas para Alugar/,R$ 18.000/mês,R$ 1,,"Rua Indiana - Brooklin, São Paulo - SP",200 m²,2 banheiros,3 vagas,...,,,,,,,,,,https://www.zapimoveis.com.br/imovel/aluguel-casa-brooklin-paulista-sao-paulo-200m2-id-2799890815/?source=ranking%2Crp
21436,2647007863,Aluguel,Casas para Alugar/,R$ 11.000/mês,isento,,"Avenida Vereador José Diniz, 2302 - Brooklin, São Paulo - SP",295 m²,3 banheiros,3 vagas,...,,,,Escritório,,,,,,https://www.zapimoveis.com.br/imovel/aluguel-casa-com-churrasqueira-santo-amaro-sao-paulo-295m2-id-2647007863/?source=ranking%2Crp
21494,2790468617,Aluguel,Casas para Alugar/,R$ 10.000/mês,R$ 1,,"Rua Clélia - Água Branca, São Paulo - SP",292 m²,2 banheiros,,...,,,,,,,,,,https://www.zapimoveis.com.br/imovel/aluguel-casa-agua-branca-sao-paulo-292m2-id-2790468617/?source=ranking%2Crp
21785,2802162724,Aluguel,Casas para Alugar/,R$ 13.000/mês,não informado,,"Tatuapé, São Paulo - SP",195 m²,3 banheiros,4 vagas,...,,,,,,,,,,https://www.zapimoveis.com.br/imovel/aluguel-casa-tatuape-sao-paulo-195m2-id-2802162724/?source=ranking%2Crp


### Removing Null Values
<li> if m2 = 0/NA or both rent and sell price are 0/NA, we'll remove the instance


In [12]:
df_atts.shape

(21863, 21)

In [13]:
df_atts = df_atts[~df_atts.floorSize.isnull()]

df_atts = df_atts[
    ~(df_atts.rent_price.isnull()) |
    ~(df_atts.sell_price.isnull())
    ]

In [14]:
df_atts.shape

(21863, 21)

### Data Transformation 

<li> Dtypes


In [15]:
# FLOAT 

def func(x):
    p = re.compile('[\d\.\,\- ]+')

    match = p.search(str(x))

    if match:
        return match.group(0).replace('.', '')
    else:
        return x



cols_float = ['sell_price', 'rent_price', 'condo_price']

df_atts = df_atts.replace({'isento': np.nan,'não informado': np.nan})

df_atts[cols_float] = df_atts[cols_float].map(func).replace('.', '')

df_atts[cols_float] = df_atts[cols_float].astype(float)
      

<li> Boolean


In [16]:

cols_bool = ['AIR_CONDITIONING', 'FURNISHED','HOME_OFFICE', 'POOL','GYM', 'LAUNDRY', 'SAUNA', 'COWORKING'] # desired;

cols_bool = [col for col in cols_bool if col in df_atts.columns] # ajusted in relation to the dataset;



df_atts.loc[:, cols_bool] = df_atts[cols_bool].fillna(0) # 0 represents the unpresence of such property; 
df_atts.loc[:, cols_bool]= df_atts[cols_bool].replace(to_replace= ('^(?!0).*'), value = 1, regex= True) # all the rest can be replaced by 1 (True)

  df_atts.loc[:, cols_bool]= df_atts[cols_bool].replace(to_replace= ('^(?!0).*'), value = 1, regex= True) # all the rest can be replaced by 1 (True)


<li> Integer


In [17]:

cols_int = ['floorSize', 'numberOfBathroomsTotal', 'numberOfParkingSpaces', 'numberOfRooms'] + cols_bool
df_atts[cols_int] = df_atts[cols_int].map(func)
df_atts[cols_int] = df_atts[cols_int].fillna(0).astype(int)

### Feature Engineering


In [18]:
def regex_neighbor(x):

    match=re.search('(?<=- )[\w ]+', str(x))
    if match:
        return match.group()
    else:
        return x

df_atts.loc[:, 'neighborhood'] = df_atts.address.apply(regex_neighbor)
df_atts.loc[:, 'numberOfSuites'] = df_atts.numberOfSuites.fillna(0).apply(lambda x: re.search('\d+', str(x)).group())
df_atts['total_rental_price'] = df_atts.condo_price + df_atts.rent_price

# to int;
df_atts['numberOfSuites'] = df_atts.numberOfSuites.astype(int)

In [19]:
for index, cols in df_atts.iterrows():
    
    if not isinstance(cols.address, float):
        address = cols.address.split(',')
        
        n = len(address)

        df_atts.loc[index, 'neighborhood']  = regex_neighbor(address[-2])

        if n == 3: # means we have street name;

            df_atts.loc[index, 'street'] = address[-3]
            df_atts.loc[index, 'number'] = address[-2].split('-')[0].strip()

  
df_atts = df_atts.drop(columns = ['address']) # we can drop; 

In [20]:
df_atts.dtypes

data-id                    object
ad_type                    object
rstate_type                object
rent_price                float64
condo_price               float64
sell_price                float64
floorSize                   int64
numberOfBathroomsTotal      int64
numberOfParkingSpaces       int64
numberOfRooms               int64
numberOfSuites              int64
FURNISHED                   int64
AIR_CONDITIONING            int64
HOME_OFFICE                 int64
POOL                        int64
GYM                         int64
SAUNA                       int64
LAUNDRY                     int64
COWORKING                   int64
link                       object
neighborhood               object
total_rental_price        float64
street                     object
number                     object
dtype: object

### Data Cleaning

In [21]:
df_atts.loc[:, 'rstate_type'] = df_atts.rstate_type.str[:-1]  # removing '/' at final of each string;


In [22]:


cols_reord = [
             'data-id', 'rstate_type', 'ad_type',
             'rent_price',  'sell_price',  'condo_price', 'total_rental_price', 'floorSize',
             'neighborhood', 'street', 'number'] + cols_bool + \
            ['numberOfRooms', 'numberOfSuites', 'numberOfBathroomsTotal', 'numberOfParkingSpaces', 'link']

df_atts = df_atts[cols_reord]


In [23]:
renamed_cols = {'data-id': 'id', 
                'rstate_type': 'kind', 'ad_type': 'rent_or_selling', 
                'rent_price':'price_rent', 'sell_price': 'price_sale', 'condo_price': 'price_condominium', 'total_rental_price': 'rent_plus_condo', 
                'floorSize': 'floor_size', 'number': 'address_number',  
                'AIR_CONDITIONING': 'air_conditioning', 'FURNISHED': 'furnished', 'HOME_OFFICE': 'home_office', 
                "POOL": 'pool', "GYM": 'gym', "LAUNDRY": 'laundry', 'SAUNA': 'sauna',
                'numberOfRooms': 'rooms', 'numberOfSuites': 'suites', 'numberOfBathroomsTotal': 'bathrooms', 'numberOfParkingSpaces': 'parking'}


df_atts = df_atts.rename(columns = renamed_cols)

In [24]:
df_atts.head(2)

Unnamed: 0,id,kind,rent_or_selling,price_rent,price_sale,price_condominium,rent_plus_condo,floor_size,neighborhood,street,...,pool,gym,laundry,sauna,COWORKING,rooms,suites,bathrooms,parking,link
0,2774950239,Apartamentos para Alugar,Aluguel,5000.0,,970.0,5970.0,112,Saúde,Rua Doutor Samuel Porto,...,1,0,0,0,0,2,1,3,3,https://www.zapimoveis.com.br/imovel/aluguel-apartamento-2-quartos-com-piscina-saude-zona-sul-sao-paulo-sp-112m2-id-2774950239/
1,2733414161,Apartamentos para Alugar,Aluguel,2400.0,,980.0,3380.0,37,Cambuci,Rua Backer,...,1,0,0,0,0,1,1,1,1,https://www.zapimoveis.com.br/imovel/aluguel-apartamento-1-quarto-com-piscina-cambuci-zona-sul-sao-paulo-sp-37m2-id-2733414161/


### Validating Duplicated Listing

In [25]:
# Needed because the same house can be listed by different real estate agencies

columns = list(df_atts.columns)
for i in ['id', 'link']:
    columns.remove(i)

print(f'Number of duplicated listing based on REstate properties: {df_atts.duplicated(subset= columns).sum()}')

Number of duplicated listing based on REstate properties: 256


In [26]:
df_atts = df_atts.drop_duplicates(subset=columns)

df_atts.shape

(21607, 24)

### Estatistical Cleaning

<li> outliers

In [27]:
df_atts_sell = df_atts[~df_atts.price_sale.isna()]
df_atts_rent = df_atts[~df_atts.price_rent.isna()]  # as some adtype listed as "For Sale" may contain the price and be able to rent, we can use .isna()

In [28]:
def plot_box_plot(df, sell_or_rent):

    if sell_or_rent == 'rent':
        box_data = df.price_rent / df.floor_size

    elif sell_or_rent == 'sell':
        box_data = df.price_sale / df.floor_size
    
    else:
        raise NameError

    Q1 = np.percentile(box_data, 25)
    Q3 = np.percentile(box_data, 75)
    IQR = Q3 - Q1

    u_bound = Q3 + 1.5*IQR
    l_bound = Q1 - 1.5*IQR

    return {'u_bound': u_bound, 'l_bound': l_bound}

In [29]:
# IQR to remove outliers based on price/floor_size

rent_bounds = plot_box_plot(df_atts_rent, 'rent')
df_atts_rent_adjusted = df_atts_rent[
    ((df_atts_rent.price_rent / df_atts_rent.floor_size) < rent_bounds.get('u_bound')) &
   ( (df_atts_rent.price_rent / df_atts_rent.floor_size) > rent_bounds.get('l_bound') )
]

sale_bounds = plot_box_plot(df_atts_sell, 'sell')
df_atts_sale_adjusted = df_atts_sell[
    ((df_atts_sell.price_rent / df_atts_sell.floor_size) < sale_bounds.get('u_bound')) &
   ( (df_atts_sell.price_rent / df_atts_sell.floor_size) > sale_bounds.get('l_bound') )
]


In [30]:
# making a new index to slice the original dataframe;
new_index = np.concatenate((df_atts_rent_adjusted.index,df_atts_sale_adjusted.index))
new_index = list(set(new_index))

df_atts = df_atts.loc[new_index, :]

In [31]:
df_atts

Unnamed: 0,id,kind,rent_or_selling,price_rent,price_sale,price_condominium,rent_plus_condo,floor_size,neighborhood,street,...,pool,gym,laundry,sauna,COWORKING,rooms,suites,bathrooms,parking,link
0,2774950239,Apartamentos para Alugar,Aluguel,5000.0,,970.0,5970.0,112,Saúde,Rua Doutor Samuel Porto,...,1,0,0,0,0,2,1,3,3,https://www.zapimoveis.com.br/imovel/aluguel-apartamento-2-quartos-com-piscina-saude-zona-sul-sao-paulo-sp-112m2-id-2774950239/
1,2733414161,Apartamentos para Alugar,Aluguel,2400.0,,980.0,3380.0,37,Cambuci,Rua Backer,...,1,0,0,0,0,1,1,1,1,https://www.zapimoveis.com.br/imovel/aluguel-apartamento-1-quarto-com-piscina-cambuci-zona-sul-sao-paulo-sp-37m2-id-2733414161/
2,2764955055,Flats para Alugar,Aluguel,2880.0,,1920.0,4800.0,30,Vila Olímpia,Rua Gomes de Carvalho,...,0,0,0,0,0,1,1,1,1,https://www.zapimoveis.com.br/imovel/aluguel-flat-1-quarto-mobiliado-vila-olimpia-zona-sul-sao-paulo-sp-30m2-id-2764955055/
3,2776250587,Apartamentos para Alugar,Aluguel,2100.0,,674.0,2774.0,47,Vila Mariana,Rua Bartolomeu de Gusmão,...,0,0,0,0,0,1,0,1,1,https://www.zapimoveis.com.br/imovel/aluguel-apartamento-1-quarto-com-elevador-vila-mariana-sao-paulo-47m2-id-2776250587/
4,2776223814,Apartamentos para Alugar,Aluguel,2100.0,,700.0,2800.0,55,Butantã,Avenida Engenheiro Heitor Antônio Eiras Garcia,...,0,0,0,0,0,2,0,1,1,https://www.zapimoveis.com.br/imovel/aluguel-apartamento-2-quartos-com-churrasqueira-jardim-esmeralda-sao-paulo-55m2-id-2776223814/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21873,2802158550,Casas para Alugar,Aluguel,4990.0,,,,194,Tatuapé,,...,0,0,0,0,0,4,0,1,1,https://www.zapimoveis.com.br/imovel/aluguel-casa-4-quartos-vila-regente-feijo-sao-paulo-194m2-id-2802158550/?source=ranking%2Crp
21874,2578165196,Apartamentos para Alugar,Aluguel,12000.0,,4730.0,16730.0,280,Higienópolis,,...,0,0,0,0,0,3,0,2,2,https://www.zapimoveis.com.br/imovel/aluguel-apartamento-3-quartos-higienopolis-sao-paulo-280m2-id-2578165196/?source=ranking%2Crp
21875,2800645914,Sobrados para Alugar,Aluguel,5400.0,,,,280,Vila Guilherme,,...,0,0,0,0,0,5,0,3,5,https://www.zapimoveis.com.br/imovel/aluguel-sobrados-5-quartos-vila-guilherme-sao-paulo-280m2-id-2800645914/?source=ranking%2Crp
21876,2790500314,Casas para Alugar,Aluguel,7950.0,,,,280,Perdizes,,...,0,0,1,0,0,3,1,5,5,https://www.zapimoveis.com.br/imovel/aluguel-casa-3-quartos-com-churrasqueira-perdizes-sao-paulo-280m2-id-2790500314/?source=ranking%2Crp


### Postgres Ingestion;


In [None]:
# db credentials come from the file app.properties

uri = p.get('uri').data
engine = create_engine(uri)

with engine.connect() as con:
    con.execute(text("CREATE SCHEMA if not exists zap"))
    con.commit()

In [None]:
# drop views to update silver table; 
with engine.connect() as conn:
    views_list = conn.execute(
    text("""select table_name 
            from information_schema.views
            where table_schema = 'zap' """)).fetchall() # getting the view's name that exists;


    for view in views_list:  # dropping them because it's needed;
        conn.execute(text(f"DROP VIEW zap.{view[0]}"))
        conn.commit() 


df_atts.to_sql('silver_zapimoveis', schema='zap', # now its gonna work;
    con = uri, if_exists='replace', index = False, 
    dtype={'price_rent': Numeric, 'price_sale': Numeric, 'price_condominium': Numeric, 'rent_plus_condo': Numeric})


In [None]:
### call run dbt;
subprocess.run("dbt run", shell=True)

### END