In [24]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Folder that holds the raw input CSV (relative to this notebook).
data_directory = '../../Data/Raw/Usa_housing/'
# Folder that receives the processed dataset (relative to this notebook).
output_directory = '../../Data/Processed/Usa_housing/'


# Carregando dados 

In [38]:
# Build the input path with os.path.join: data_directory already ends in '/',
# so concatenating '/USA_Housing.csv' produced a doubled separator.
housing_file = os.path.join(data_directory, 'USA_Housing.csv')

# Read the housing table and report its (rows, columns) shape
housing = pd.read_csv(housing_file)
print('Housing: ', housing.shape)

Housing:  (5000, 7)


In [39]:
# Basic information: index range, column names, non-null counts, dtypes and memory usage
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
 6   Address                       5000 non-null   object 
dtypes: float64(6), object(1)
memory usage: 273.6+ KB


In [28]:
# Preview the first rows of the table
housing.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386


In [29]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
housing.describe()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,68583.108984,5.977222,6.987792,3.98133,36163.516039,1232073.0
std,10657.991214,0.991456,1.005833,1.234137,9925.650114,353117.6
min,17796.63119,2.644304,3.236194,2.0,172.610686,15938.66
25%,61480.562388,5.322283,6.29925,3.14,29403.928702,997577.1
50%,68804.286404,5.970429,7.002902,4.05,36199.406689,1232669.0
75%,75783.338666,6.650808,7.665871,4.49,42861.290769,1471210.0
max,107701.748378,9.519088,10.759588,6.5,69621.713378,2469066.0


In [30]:
# (rows, columns) of the table
housing.shape

(5000, 7)

# Compreendendo, ajustando e limpando os dados 

In [31]:
# Count the null values in each column
housing.isna().sum()

Avg. Area Income                0
Avg. Area House Age             0
Avg. Area Number of Rooms       0
Avg. Area Number of Bedrooms    0
Area Population                 0
Price                           0
Address                         0
dtype: int64

In [32]:
# Removing duplicated rows.
# Report how many fully duplicated rows exist first: a bare expression in the
# middle of a cell is evaluated but never displayed, so the count was lost.
n_duplicates = housing.duplicated().sum()
print('Duplicated rows: ', n_duplicates)

# Rebind instead of mutating in place so the step is explicit.
housing = housing.drop_duplicates()

In [33]:
# (rows, columns) of the table at this point of the notebook
housing.shape

(5000, 7)

In [34]:
# Fraction of missing values per column (NaN count divided by the row count)
missing_share = housing.isna().mean()
print(missing_share)

Avg. Area Income                0.0
Avg. Area House Age             0.0
Avg. Area Number of Rooms       0.0
Avg. Area Number of Bedrooms    0.0
Area Population                 0.0
Price                           0.0
Address                         0.0
dtype: float64


In [35]:
# Show the data type of each column
housing.dtypes

Avg. Area Income                float64
Avg. Area House Age             float64
Avg. Area Number of Rooms       float64
Avg. Area Number of Bedrooms    float64
Area Population                 float64
Price                           float64
Address                          object
dtype: object

In [40]:
# Dropping the address column.
# errors='ignore' keeps the cell safe to re-run: without it a second execution
# raises KeyError because 'Address' has already been removed.
housing = housing.drop(columns=['Address'], errors='ignore')

In [41]:
# List the current column labels of the table
housing.columns

Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price'],
      dtype='object')

In [43]:
# Assess the dynamic range (min, max and their spread) of each variable.
# Only numeric columns are considered so the Max - Min subtraction stays valid
# even if a text column is still present in `housing`.
num_variables = housing.select_dtypes(include='number').columns

# Vectorized min/max give numeric (not object) columns, indexed by variable name.
df = pd.DataFrame({
    'Min': housing[num_variables].min(),
    'Max': housing[num_variables].max(),
})

df['Distancia'] = df['Max'] - df['Min']
df

Unnamed: 0,Min,Max,Distancia
Avg. Area Income,17796.6,107702.0,89905.1
Avg. Area House Age,2.6443,9.51909,6.87478
Avg. Area Number of Rooms,3.23619,10.7596,7.52339
Avg. Area Number of Bedrooms,2.0,6.5,4.5
Area Population,172.611,69621.7,69449.1
Price,15938.7,2469070.0,2453130.0


## Dados preparados 

In [44]:
# Preview the first rows of the prepared table
housing.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5


# Exportar dados

In [46]:
# Look up the os.makedirs signature and the meaning of its exist_ok flag
help(os.makedirs)

Help on function makedirs in module os:

makedirs(name, mode=511, exist_ok=False)
    makedirs(name [, mode=0o777][, exist_ok=False])
    
    Super-mkdir; create a leaf directory and all intermediate ones.  Works like
    mkdir, except that any intermediate path segment (not just the rightmost)
    will be created if it does not exist. If the target directory already
    exists, raise an OSError if exist_ok is False. Otherwise no exception is
    raised.  This is recursive.



In [47]:
# Create the output folder and any missing parents; exist_ok=True means no
# error is raised when the folder already exists.
os.makedirs(output_directory, exist_ok=True)

# os.path.join avoids the doubled separator produced by concatenating a folder
# that already ends in '/' with '/housing.parquet'.
housing_file = os.path.join(output_directory, 'housing.parquet')

# Write the prepared table to a Parquet file
housing.to_parquet(housing_file)