In [1]:
import pandas
import os


# Folder the raw Airbnb CSV is read from.
data_directory = '../../Data/Raw/airbnb/'
# Folder the processed dataset is written to.
output_directory = '../../Data/Processed/airbnb/'

# Load Data 

In [15]:
# Build the CSV path with os.path.join: data_directory already ends with a
# slash, so plain concatenation with '/...' would double the separator.
airbnb_file = os.path.join(data_directory, 'airbnb_queens_2019.csv')

# Load the CSV into a DataFrame.
df_airbnb = pandas.read_csv(airbnb_file)

# Drop the columns that are not needed for the analysis. The result is
# reassigned instead of mutating in place.
df_airbnb = df_airbnb.drop(
    columns=['name', 'id', 'host_name', 'last_review', 'neighbourhood_group']
)

# Report the shape of the trimmed DataFrame.
print('Airbnb: ', df_airbnb.shape)

Airbnb:  (5666, 11)


In [16]:
# Count the missing values in each column.
print(df_airbnb.isnull().sum())

host_id                              0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
reviews_per_month                 1092
calculated_host_listings_count       0
availability_365                     0
dtype: int64


In [17]:
# Percentage (0-100) of missing values per column, rounded before scaling.
print(df_airbnb.isna().mean().round(4).mul(100))

host_id                            0.00
neighbourhood                      0.00
latitude                           0.00
longitude                          0.00
room_type                          0.00
price                              0.00
minimum_nights                     0.00
number_of_reviews                  0.00
reviews_per_month                 19.27
calculated_host_listings_count     0.00
availability_365                   0.00
dtype: float64


In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_airbnb.describe()

Unnamed: 0,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,5666.0,5666.0,5666.0,5666.0,5666.0,5666.0,4574.0,5666.0,5666.0
mean,96156800.0,40.731531,-73.872775,99.517649,5.181433,27.700318,1.9412,4.060184,144.451818
std,84243240.0,0.040368,0.056988,167.102155,15.028725,51.955853,2.213108,12.445003,135.538597
min,3211.0,40.56546,-73.95927,10.0,1.0,0.0,0.01,1.0,0.0
25%,21216010.0,40.70741,-73.91742,50.0,1.0,1.0,0.37,1.0,2.0
50%,68271460.0,40.74479,-73.895045,75.0,2.0,7.0,1.21,1.0,98.0
75%,158031200.0,40.75978,-73.829602,110.0,3.0,32.0,2.79,3.0,286.0
max,274225600.0,40.79721,-73.71299,10000.0,500.0,629.0,20.94,103.0,365.0


# AIRBNB

In [19]:
# Inspect the dtype pandas assigned to each remaining column.
df_airbnb.dtypes

host_id                             int64
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

# Fill NaN Values 

In [28]:
# Replacement value for each column that contains NaN.
nan_rule = {'reviews_per_month': 0}

# fillna accepts a column -> value mapping, so every rule is applied in a
# single call; columns not listed in nan_rule are left untouched.
df_airbnb = df_airbnb.fillna(value=nan_rule)

# Filling values must not change the number of rows or columns.
print(df_airbnb.shape)

(5666, 11)


In [29]:
# Re-count the missing values per column after the fill.
print(df_airbnb.isnull().sum())

host_id                           0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64


In [30]:
# Percentage (0-100) of missing values per column, rounded to two decimals.
print(df_airbnb.isna().mean().round(4)*100)

host_id                           0.0
neighbourhood                     0.0
latitude                          0.0
longitude                         0.0
room_type                         0.0
price                             0.0
minimum_nights                    0.0
number_of_reviews                 0.0
reviews_per_month                 0.0
calculated_host_listings_count    0.0
availability_365                  0.0
dtype: float64


In [27]:
# Columns treated as numeric when computing the min/max summary.
num_variables = [
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]


In [33]:

# Min and max of every numeric column in one vectorized call. agg returns one
# row per statistic, so the result is transposed to get one row per variable,
# and the columns are renamed to the labels used in the summary table.
df = (
    df_airbnb[num_variables]
    .agg(['min', 'max'])
    .T
    .rename(columns={'min': 'Min', 'max': 'Max'})
)

# Range covered by each variable (max - min).
df['Distancia'] = df['Max'] - df['Min']

df


Unnamed: 0,Min,Max,Distancia
latitude,40.5655,40.7972,0.23175
longitude,-73.9593,-73.713,0.24628
price,10.0,10000.0,9990.0
minimum_nights,1.0,500.0,499.0
number_of_reviews,0.0,629.0,629.0
reviews_per_month,0.0,20.94,20.94
calculated_host_listings_count,1.0,103.0,102.0
availability_365,0.0,365.0,365.0


In [34]:
# Column-wise minimum over every column; the string columns are included too.
df_airbnb.min()

host_id                                      3211
neighbourhood                             Arverne
latitude                                  40.5655
longitude                                -73.9593
room_type                         Entire home/apt
price                                          10
minimum_nights                                  1
number_of_reviews                               0
reviews_per_month                               0
calculated_host_listings_count                  1
availability_365                                0
dtype: object

# Numeric Variables 

In [35]:
# Columns that are stored as floats in the prepared dataset.
num_variables = [
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

# Cast column by column. errors='ignore' suppresses a failed cast instead of
# raising, and any NaN still present afterwards is replaced with 0.
for cname in num_variables:
    df_airbnb[cname] = (
        df_airbnb[cname]
        .astype(float, errors='ignore')
        .fillna(0)
    )

# Categorical Variables 

In [36]:
# Columns handled as categorical text.
cat_variables = ['neighbourhood', 'room_type']

# Convert all categorical columns to strings in one assignment.
df_airbnb[cat_variables] = df_airbnb[cat_variables].astype(str)

# Prepared Data 

## Features 

In [37]:
# Preview the first rows of the prepared DataFrame.
df_airbnb.head()

Unnamed: 0,host_id,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,50124,Long Island City,40.74771,-73.9474,Private room,130.0,3.0,248.0,2.25,1.0,215.0
1,70091,Woodside,40.75038,-73.90334,Private room,70.0,30.0,25.0,0.22,1.0,324.0
2,140025,Flushing,40.74028,-73.83168,Private room,140.0,2.0,1.0,0.01,1.0,1.0
3,110506,Sunnyside,40.74559,-73.92313,Private room,79.0,30.0,28.0,0.26,1.0,126.0
4,204539,Ridgewood,40.70382,-73.89797,Entire home/apt,350.0,8.0,10.0,0.11,5.0,365.0


# Export Data

In [38]:
# NOTE(review): exploratory lookup of the os.makedirs signature (exist_ok);
# it does not affect the data — consider removing it from the final notebook.
help(os.makedirs)

Help on function makedirs in module os:

makedirs(name, mode=511, exist_ok=False)
    makedirs(name [, mode=0o777][, exist_ok=False])
    
    Super-mkdir; create a leaf directory and all intermediate ones.  Works like
    mkdir, except that any intermediate path segment (not just the rightmost)
    will be created if it does not exist. If the target directory already
    exists, raise an OSError if exist_ok is False. Otherwise no exception is
    raised.  This is recursive.



In [39]:
# Create the output folder (and any missing parents); exist_ok=True keeps
# re-runs from failing when the folder is already there.
os.makedirs(output_directory, exist_ok=True)

# os.path.join avoids the doubled separator that plain concatenation with
# '/...' produces, since output_directory already ends with a slash.
airbnb_file = os.path.join(output_directory, 'airbnb.parquet')

# Write the prepared features to parquet.
df_airbnb.to_parquet(airbnb_file)