In [62]:
import pandas
import os


# Input and output locations, relative to the notebook's working directory.
# Both values end with a path separator; later cells build file paths from them.
data_directory = '../../Data/Raw/airbnb/'
output_directory = '../../Data/Processed/airbnb/'


# Loading Data 

In [63]:
# Build the CSV path with os.path.join: data_directory already ends with a
# separator, so plain string concatenation with '/...' yields a doubled '//'.
airbnb_file = os.path.join(data_directory, 'airbnb_queens_2019.csv')

# Read the Airbnb listings and report the (rows, columns) of the loaded frame.
df_airbnb = pandas.read_csv(airbnb_file)
print('Airbnb: ', df_airbnb.shape)

Airbnb:  (5666, 16)


In [64]:
# Display the dataset dimensions as a (rows, columns) tuple
df_airbnb.shape

(5666, 16)

In [65]:
# Number of rows in the loaded dataset, to get a sense of the size we work with.
df_airbnb.shape[0]

5666

In [66]:
# Report how many fully duplicated rows exist before removing them.
# (A bare expression that is not the last line of a cell is never displayed,
# so the count has to be printed explicitly.)
print('Duplicated rows: ', df_airbnb.duplicated().sum())

# Drop the duplicated rows, if any, keeping the first occurrence of each.
df_airbnb.drop_duplicates(inplace=True)

In [67]:
# Dimensions after the duplicate-removal step above
df_airbnb.shape

(5666, 16)

# Understanding, Wrangling and Cleaning Data

In [69]:
# Count the missing values in every column
df_airbnb.isna().sum()

id                                   0
name                                 0
host_id                              0
host_name                            2
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                       1092
reviews_per_month                 1092
calculated_host_listings_count       0
availability_365                     0
dtype: int64

In [70]:
# Share of missing values per column, expressed as a fraction between 0 and 1
print(df_airbnb.isna().mean())

id                                0.000000
name                              0.000000
host_id                           0.000000
host_name                         0.000353
neighbourhood_group               0.000000
neighbourhood                     0.000000
latitude                          0.000000
longitude                         0.000000
room_type                         0.000000
price                             0.000000
minimum_nights                    0.000000
number_of_reviews                 0.000000
last_review                       0.192729
reviews_per_month                 0.192729
calculated_host_listings_count    0.000000
availability_365                  0.000000
dtype: float64


In [71]:
# Replace missing 'reviews_per_month' values with 0.
# 'last_review' is NOT filled by this cell: its missing entries stay null,
# which the second count printed below confirms.
df_airbnb['reviews_per_month'] = df_airbnb['reviews_per_month'].fillna(0)

# Examine the result: remaining null counts for the two review columns
print(df_airbnb['reviews_per_month'].isnull().sum())
print(df_airbnb['last_review'].isnull().sum())

0
1092


In [72]:
# List the distinct neighbourhood values; this column is expected to be useful for later analysis
df_airbnb['neighbourhood'].unique()

array(['Long Island City', 'Woodside', 'Flushing', 'Sunnyside',
       'Ridgewood', 'Jamaica', 'Middle Village', 'Ditmars Steinway',
       'Astoria', 'Queens Village', 'Rockaway Beach', 'Forest Hills',
       'Elmhurst', 'Jackson Heights', 'St. Albans', 'Rego Park',
       'Briarwood', 'Ozone Park', 'East Elmhurst', 'Arverne',
       'Cambria Heights', 'Bayside', 'Kew Gardens', 'College Point',
       'Glendale', 'Richmond Hill', 'Bellerose', 'Maspeth', 'Woodhaven',
       'Kew Gardens Hills', 'Bay Terrace', 'Whitestone', 'Bayswater',
       'Fresh Meadows', 'Springfield Gardens', 'Howard Beach',
       'Belle Harbor', 'Jamaica Estates', 'Far Rockaway',
       'South Ozone Park', 'Corona', 'Neponsit', 'Laurelton',
       'Holliswood', 'Rosedale', 'Edgemere', 'Jamaica Hills', 'Hollis',
       'Douglaston', 'Little Neck', 'Breezy Point'], dtype=object)

In [73]:
# List the distinct room_type values; this column is expected to be useful for later analysis
df_airbnb['room_type'].unique()

array(['Private room', 'Entire home/apt', 'Shared room'], dtype=object)

In [74]:
# Display the data type of each column in the dataset
df_airbnb.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [76]:
# Numeric features: cast each one to float and replace any missing value with 0.
num_variables = [
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

for column in num_variables:
    # errors='ignore' leaves a column unchanged when it cannot be cast to float
    as_float = df_airbnb[column].astype(float, errors='ignore')
    df_airbnb[column] = as_float.fillna(0)

In [77]:
# Categorical features: store them explicitly as Python strings.
cat_variables = ['neighbourhood', 'room_type']
df_airbnb[cat_variables] = df_airbnb[cat_variables].astype(str)

In [78]:
# Re-check the column data types after the numeric and string casts above
df_airbnb.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                             float64
minimum_nights                    float64
number_of_reviews                 float64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count    float64
availability_365                  float64
dtype: object

In [79]:
# Parse 'last_review' (day-month-two-digit-year strings) into datetimes,
# then report the earliest and latest review dates in the dataset.
parsed_reviews = pandas.to_datetime(df_airbnb['last_review'].to_numpy(), format='%d-%m-%y')
df_airbnb['last_review'] = parsed_reviews

oldest_review = df_airbnb['last_review'].min()
newest_review = df_airbnb['last_review'].max()
print('reviews: from ', oldest_review.date(), 'to', newest_review.date())

reviews: from  2011-09-19 to 2019-07-08


In [84]:
# Evaluate the dynamic range (min, max and their distance) of each numeric variable
num_variables = ['latitude', 'longitude', 'price','minimum_nights','number_of_reviews','reviews_per_month',
                 'calculated_host_listings_count','availability_365']

# Compute min/max for all columns in one vectorised call. Filling an empty
# frame cell by cell would leave 'Min'/'Max' (and their difference) as object
# dtype; aggregating keeps the summary columns numeric.
df = (
    df_airbnb[num_variables]
    .agg(['min', 'max'])
    .T
    .rename(columns={'min': 'Min', 'max': 'Max'})
)

# Width of the value range of each variable
df['Distancia'] = df['Max'] - df['Min']
df

Unnamed: 0,Min,Max,Distancia
latitude,40.5655,40.7972,0.23175
longitude,-73.9593,-73.713,0.24628
price,10.0,10000.0,9990.0
minimum_nights,1.0,500.0,499.0
number_of_reviews,0.0,629.0,629.0
reviews_per_month,0.0,20.94,20.94
calculated_host_listings_count,1.0,103.0,102.0
availability_365,0.0,365.0,365.0


## Prepared data 

In [80]:
# Preview the first five rows of the prepared dataset
df_airbnb.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,12937,"1 Stop fr. Manhattan! Private Suite,Landmark B...",50124,Orestes,Queens,Long Island City,40.74771,-73.9474,Private room,130.0,3.0,248.0,2019-07-01,2.25,1.0,215.0
1,18198,Little King of Queens,70091,Justin,Queens,Woodside,40.75038,-73.90334,Private room,70.0,30.0,25.0,2019-05-31,0.22,1.0,324.0
2,32363,Fully Furnished Basement Apartment,140025,Fredah,Queens,Flushing,40.74028,-73.83168,Private room,140.0,2.0,1.0,2011-09-19,0.01,1.0,1.0
3,39593,"A room w/ a Manhattan view, longer stay",110506,Myung,Queens,Sunnyside,40.74559,-73.92313,Private room,79.0,30.0,28.0,2019-04-12,0.26,1.0,126.0
4,45910,Beautiful Queens Brownstone! - 5BR,204539,Mark,Queens,Ridgewood,40.70382,-73.89797,Entire home/apt,350.0,8.0,10.0,2019-05-12,0.11,5.0,365.0


# Export Data

In [20]:
# Look up the os.makedirs signature (notably the exist_ok flag used in the export cell).
# NOTE(review): exploratory lookup; it does not affect the data and can be dropped from the final notebook.
help(os.makedirs)

Help on function makedirs in module os:

makedirs(name, mode=511, exist_ok=False)
    makedirs(name [, mode=0o777][, exist_ok=False])
    
    Super-mkdir; create a leaf directory and all intermediate ones.  Works like
    mkdir, except that any intermediate path segment (not just the rightmost)
    will be created if it does not exist. If the target directory already
    exists, raise an OSError if exist_ok is False. Otherwise no exception is
    raised.  This is recursive.



In [81]:
# Create the output directory (and any missing parents); exist_ok=True means
# no error is raised when the directory already exists, so re-runs are safe.
os.makedirs(output_directory, exist_ok=True)

# Build the target path with os.path.join: output_directory already ends with
# a separator, so string concatenation with '/...' yields a doubled '//'.
airbnb_file = os.path.join(output_directory, 'airbnb.parquet')

# Write the prepared dataset to a Parquet file
df_airbnb.to_parquet(airbnb_file)