In [30]:
# importando bibliotecas
import pandas as pd
import numpy as np

# exportando dados p um dataframe
realtor = pd.read_csv('realtor-data.csv')

# verificando exportação
realtor.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,prev_sold_date,price
0,for_sale,3.0,2.0,0.12,Adjuntas,Puerto Rico,601.0,920.0,,105000.0
1,for_sale,4.0,2.0,0.08,Adjuntas,Puerto Rico,601.0,1527.0,,80000.0
2,for_sale,2.0,1.0,0.15,Juana Diaz,Puerto Rico,795.0,748.0,,67000.0
3,for_sale,4.0,2.0,0.1,Ponce,Puerto Rico,731.0,1800.0,,145000.0
4,for_sale,6.0,2.0,0.05,Mayaguez,Puerto Rico,680.0,,,65000.0


In [31]:
# verificando valores nulos 
realtor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   status          100000 non-null  object 
 1   bed             75050 non-null   float64
 2   bath            75112 non-null   float64
 3   acre_lot        85987 non-null   float64
 4   city            99948 non-null   object 
 5   state           100000 non-null  object 
 6   zip_code        99805 non-null   float64
 7   house_size      75082 non-null   float64
 8   prev_sold_date  28745 non-null   object 
 9   price           100000 non-null  float64
dtypes: float64(6), object(4)
memory usage: 7.6+ MB


In [32]:
#dropando valores nulos nas colunas status, city, state, prev_sold_date, ou seja colunas que o valor não poderia estar nulo
realtor = realtor.dropna(subset=['status', 'city', 'state', 'prev_sold_date'])
realtor.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28745 entries, 684 to 99999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   status          28745 non-null  object 
 1   bed             27108 non-null  float64
 2   bath            27118 non-null  float64
 3   acre_lot        25238 non-null  float64
 4   city            28745 non-null  object 
 5   state           28745 non-null  object 
 6   zip_code        28745 non-null  float64
 7   house_size      27264 non-null  float64
 8   prev_sold_date  28745 non-null  object 
 9   price           28745 non-null  float64
dtypes: float64(6), object(4)
memory usage: 2.4+ MB


In [34]:
# verificando quantos valores ausentes possui as colunas bed e bath
realtor.isna().sum()

status               0
bed               1637
bath              1627
acre_lot          3507
city                 0
state                0
zip_code             0
house_size        1481
prev_sold_date       0
price                0
dtype: int64

In [35]:
# substituindo valores ausentes nas colunas bath e bed por zero
realtor = realtor.fillna({'bed': 0, 'bath': 0 })

# filtrando para ver se foi aplicado
realtor[(realtor['bed'] == 0)]

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,prev_sold_date,price
5163,for_sale,0.0,0.0,78.66,Yabucoa,Puerto Rico,767.0,,2021-03-15,380000.0
5195,for_sale,0.0,0.0,78.66,Yabucoa,Puerto Rico,767.0,,2021-03-15,380000.0
5198,for_sale,0.0,0.0,78.66,Yabucoa,Puerto Rico,767.0,,2021-03-15,380000.0
5215,for_sale,0.0,0.0,78.66,Yabucoa,Puerto Rico,767.0,,2021-03-15,380000.0
7358,for_sale,0.0,0.0,78.66,Yabucoa,Puerto Rico,767.0,,2021-03-15,380000.0
...,...,...,...,...,...,...,...,...,...,...
99315,for_sale,0.0,0.0,2.33,Leicester,Massachusetts,1524.0,,1988-05-05,64900.0
99328,for_sale,0.0,0.0,6.80,Auburn,Massachusetts,1501.0,,1997-03-14,179900.0
99376,for_sale,0.0,3.0,5.55,Woodstock,Connecticut,6281.0,7250.0,1991-01-24,785000.0
99398,for_sale,0.0,0.0,0.17,Worcester,Massachusetts,1604.0,,2008-03-31,84900.0


In [36]:
# verificando quantos missing values possui a coluna house_size
realtor.isna().sum()

status               0
bed                  0
bath                 0
acre_lot          3507
city                 0
state                0
zip_code             0
house_size        1481
prev_sold_date       0
price                0
dtype: int64

In [37]:
# substituindo os missing values das colunas house_size pela mediana
realtor = realtor.fillna({'house_size': realtor['house_size'].median()})

# verificando
realtor.isna().sum()

status               0
bed                  0
bath                 0
acre_lot          3507
city                 0
state                0
zip_code             0
house_size           0
prev_sold_date       0
price                0
dtype: int64

In [44]:
# transformando string para datetime
realtor['prev_sold_date'] = pd.to_datetime(realtor['prev_sold_date'])

# extraindo somente o ano de cada data
realtor['prev_sold_year'] = realtor['prev_sold_date'].dt.year

# agrupando por estado a partir dos valores de cada ano 
realtor_final = realtor.pivot_table(index='state', columns='prev_sold_year', values='price', aggfunc='sum')
realtor_final

prev_sold_year,1901,1910,1961,1966,1968,1971,1972,1973,1974,1975,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Connecticut,,,,653800.0,739600.0,,,3579900.0,394900.0,,...,124022290.0,101067952.0,101077835.0,111999589.0,154613095.0,108130999.0,126171670.0,127437737.0,190742406.0,27440776.0
Massachusetts,3051000.0,2421000.0,3395000.0,1700000.0,4822300.0,10778300.0,599900.0,740000.0,3999000.0,10180000.0,...,56510300.0,65877272.0,57430500.0,52369500.0,76126700.0,104585600.0,211786000.0,143509854.0,164632708.0,58369600.0
New Hampshire,,,,,,,,,,,...,3697600.0,9684000.0,22584645.0,22499200.0,20282989.0,28527700.0,24910500.0,17102900.0,27641295.0,
New York,,,,,,,,719600.0,50000.0,,...,1033000.0,8853500.0,44547000.0,33414900.0,9817600.0,17454247.0,28245900.0,12544200.0,13484600.0,4654700.0
Puerto Rico,,,,,,,,,,,...,,,,,,,2200000.0,999700.0,6786000.0,2273400.0
Rhode Island,,,,,,,,,,,...,19174272.0,13077599.0,20523495.0,31810100.0,35881975.0,38024300.0,42829395.0,43555256.0,65702277.0,10502198.0
Vermont,,,,,,,,,,,...,3863300.0,3293000.0,885500.0,5297000.0,5014000.0,3029800.0,9099000.0,9704000.0,,
Virgin Islands,,,,,,,,,,,...,2850000.0,,,768000.0,,20697000.0,,,,
