In [1]:
import wandb
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import tempfile
import os
from sklearn.model_selection import train_test_split
from pandas_profiling import ProfileReport

In [2]:
# Start a W&B run in the "airbnb" project. save_code=True asks W&B to save
# this notebook's code together with the run.
run = wandb.init(project="airbnb", save_code=True)

In [3]:
# Download the latest version of the raw_data.csv artifact
artifact = run.use_artifact("airbnb/raw_data.csv:latest")

# Resolve the local path of the artifact's file, then load it as a dataframe
raw_csv_path = artifact.file()
df = pd.read_csv(raw_csv_path)

In [4]:
# Preview the first rows of the raw dataframe loaded from the artifact
df.head()

      id                         listing_url       scrape_id last_scraped  \
0  17878  https://www.airbnb.com/rooms/17878  20211224070558   2021-12-25   
1  24480  https://www.airbnb.com/rooms/24480  20211224070558   2021-12-25   
2  25026  https://www.airbnb.com/rooms/25026  20211224070558   2021-12-25   
3  35636  https://www.airbnb.com/rooms/35636  20211224070558   2021-12-26   
4  35764  https://www.airbnb.com/rooms/35764  20211224070558   2021-12-25   

                                                name  \
0  Very Nice 2Br in Copacabana w. balcony, fast WiFi   
1   Nice and cozy near Ipanema Beach, w/ home office   
2          Beautiful Modern Decorated Studio in Copa   
3                   Cosy flat close to Ipanema beach   
4     COPACABANA SEA BREEZE  -  RIO - 20 X Superhost   

                                         description  \
0  Discounts for long term stays. <br />- Large b...   
1  My studio is located in the best of Ipanema, t...   
2  Our apartment is a little gem

In [5]:
# Columns used in the analysis
columns = ['latitude', 'longitude', 'room_type', 'accommodates','neighbourhood_cleansed',
          'bedrooms','beds','price','review_scores_rating']

# `airbnb` has not been created yet at this point of the notebook, so the
# selection is made from the raw dataframe `df`; otherwise a fresh
# "Restart & Run All" stops here with a NameError. .copy() yields an
# independent frame rather than a slice of `df`.
airbnb = df[columns].copy()
airbnb.head()

In [6]:
# Download the latest version of the raw_data.csv artifact
artifact = run.use_artifact("airbnb/raw_data.csv:latest")

# Read the artifact's CSV file into the working dataframe `airbnb`
airbnb_csv_path = artifact.file()
airbnb = pd.read_csv(airbnb_csv_path)

In [7]:
# Preview the first rows of the working dataframe
airbnb.head()

      id                         listing_url       scrape_id last_scraped  \
0  17878  https://www.airbnb.com/rooms/17878  20211224070558   2021-12-25   
1  24480  https://www.airbnb.com/rooms/24480  20211224070558   2021-12-25   
2  25026  https://www.airbnb.com/rooms/25026  20211224070558   2021-12-25   
3  35636  https://www.airbnb.com/rooms/35636  20211224070558   2021-12-26   
4  35764  https://www.airbnb.com/rooms/35764  20211224070558   2021-12-25   

                                                name  \
0  Very Nice 2Br in Copacabana w. balcony, fast WiFi   
1   Nice and cozy near Ipanema Beach, w/ home office   
2          Beautiful Modern Decorated Studio in Copa   
3                   Cosy flat close to Ipanema beach   
4     COPACABANA SEA BREEZE  -  RIO - 20 X Superhost   

                                         description  \
0  Discounts for long term stays. <br />- Large b...   
1  My studio is located in the best of Ipanema, t...   
2  Our apartment is a little gem

In [8]:
# Columns used in the analysis
columns = ['latitude', 'longitude', 'room_type', 'accommodates','neighbourhood_cleansed',
          'bedrooms','beds','price','review_scores_rating']

# .copy() makes the subset an independent frame, so the column assignments
# performed on `airbnb` afterwards do not raise SettingWithCopyWarning.
airbnb = airbnb[columns].copy()
airbnb.head()

   latitude  longitude        room_type  accommodates neighbourhood_cleansed  \
0 -22.96599  -43.17940  Entire home/apt             5             Copacabana   
1 -22.98405  -43.20189  Entire home/apt             2                Ipanema   
2 -22.97735  -43.19105  Entire home/apt             3             Copacabana   
3 -22.98839  -43.19232  Entire home/apt             2                Ipanema   
4 -22.98107  -43.19136  Entire home/apt             2             Copacabana   

   bedrooms  beds    price  review_scores_rating  
0       2.0   2.0  $350.00                  4.68  
1       1.0   1.0  $296.00                  4.73  
2       1.0   1.0  $387.00                  4.68  
3       1.0   1.0  $172.00                  4.72  
4       1.0   1.0  $260.00                  4.89  

In [9]:
# Turn the price strings (e.g. "$1,350.00") into floats.
# regex=False makes "$" and "," literal characters; as a regular expression
# "$" is the end-of-string anchor, and the default of `regex` differs between
# pandas versions.
airbnb = airbnb.assign(
    price=airbnb['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype('float')
)

# Keep only listings priced between 1 and 5000 (both bounds included).
# Rows whose price is NaN fail the test and are removed as well.
airbnb = airbnb[airbnb['price'].between(1, 5000)]

# Remove every remaining row that has a missing value in any column
airbnb = airbnb.dropna(axis=0)

In [10]:
# Count the rows that are exact duplicates of an earlier row
airbnb.duplicated().sum()

2

In [11]:
# Remove duplicated rows (the first occurrence of each is kept), then
# confirm that no duplicates are left
airbnb = airbnb.drop_duplicates()
airbnb.duplicated().sum()

0

In [12]:
# Narrow the frame down to the columns kept for the splits
columns = ['room_type','accommodates','neighbourhood_cleansed','bedrooms','beds','price']

# .copy() makes the subset an independent frame, so the column assignments
# performed on `airbnb` afterwards do not raise SettingWithCopyWarning.
airbnb = airbnb[columns].copy()

In [13]:
# Print a summary of the frame: columns, dtypes and non-null counts
airbnb.info()

In [14]:
# Descriptive statistics of the numeric columns
airbnb.describe()

       accommodates      bedrooms          beds         price
count  14219.000000  14219.000000  14219.000000  14219.000000
mean       4.080385      1.609115      2.592376    631.172938
std        2.306899      0.946355      2.048014    662.722128
min        1.000000      1.000000      1.000000     33.000000
25%        2.000000      1.000000      1.000000    231.000000
50%        4.000000      1.000000      2.000000    425.000000
75%        5.000000      2.000000      3.000000    786.000000
max       16.000000     20.000000     50.000000   5000.000000

In [15]:
# Replace values outside the accepted range of each column with NaN
# (bounds are inclusive); the affected rows can then be removed with dropna().
for column_name, lowest, highest in (('accommodates', 1, 10),
                                     ('bedrooms', 1, 10),
                                     ('beds', 1, 15)):
    within_range = airbnb[column_name].between(lowest, highest)
    airbnb[column_name] = airbnb[column_name].where(within_range)

airbnb.describe()

       accommodates      bedrooms          beds         price
count  13962.000000  14212.000000  14186.000000  14219.000000
mean       3.901733      1.602308      2.546031    631.172938
std        1.894639      0.892279      1.755809    662.722128
min        1.000000      1.000000      1.000000     33.000000
25%        2.000000      1.000000      1.000000    231.000000
50%        4.000000      1.000000      2.000000    425.000000
75%        5.000000      2.000000      3.000000    786.000000
max       10.000000     10.000000     15.000000   5000.000000

In [16]:
# Drop every row that has a missing value in any column
airbnb = airbnb.dropna()

In [17]:
# Number of missing values per column
airbnb.isnull().sum()

room_type                 0
accommodates              0
neighbourhood_cleansed    0
bedrooms                  0
beds                      0
price                     0
dtype: int64

In [18]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs
train_part, test_part = train_test_split(airbnb, test_size=0.30, random_state=41)
splits = {"train": train_part, "test": test_part}

In [19]:
# Save each split as a CSV file and log it to W&B as an artifact. We use a
# temporary directory so we do not leave any trace behind on disk.

with tempfile.TemporaryDirectory() as tmp_dir:

    # The loop variable is deliberately not named `df`: that name holds the
    # raw dataframe loaded at the top of the notebook and must not be
    # overwritten by the last split.
    for split, split_df in splits.items():

        # Build the artifact (and file) name from the name of the split
        artifact_name = f"data_{split}.csv"

        # Get the path on disk within the temp directory
        temp_path = os.path.join(tmp_dir, artifact_name)

        # Save then upload to W&B
        split_df.to_csv(temp_path, index=False)

        # NOTE(review): the splits are built from the cleaned frame, yet the
        # artifact type is still "raw_data" - confirm this is intended.
        artifact = wandb.Artifact(
            name=artifact_name,
            type="raw_data",
            description=f"{split} split of dataset airbnb/raw_data.csv:latest",
        )
        artifact.add_file(temp_path)

        run.log_artifact(artifact)

        # This waits for the artifact to be uploaded to W&B. If you
        # do not add this, the temp directory might be removed before
        # W&B had a chance to upload the datasets, and the upload
        # might fail
        artifact.wait()

In [20]:
# Download the latest version of the data_train.csv artifact and load it
artifact_train = run.use_artifact("airbnb/data_train.csv:latest")
df_train = pd.read_csv(artifact_train.file())

# Download the latest version of the data_test.csv artifact and load it
artifact_test = run.use_artifact("airbnb/data_test.csv:latest")
df_test = pd.read_csv(artifact_test.file())

In [21]:
# Report the shape of each reloaded split
for split_label, split_frame in (("Train", df_train), ("Test", df_test)):
    print(f"{split_label}: {split_frame.shape}")

In [22]:
# Mark the W&B run as finished
run.finish()