In [1]:
%%capture
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import patchworklib as pw

from py_helper_functions import *

from mizani.transforms import log_trans
from mizani.formatters import percent_format
from mizani.formatters import log_format

from stargazer import stargazer

In [12]:
def get_cleaned_data(src: str) -> pd.DataFrame:
    '''
    Load a listings csv file and keep only the rows usable as a working sample.

    A row is kept when both its ``price`` and its ``beds`` value are present.
    No other column is inspected and no values are converted.

    :param src: path to file (passed unchanged to ``pd.read_csv``)
    :return: independent dataframe with the retained rows; the original row
        labels are preserved, so the index may contain gaps
    :raises KeyError: if the file has no ``price`` or ``beds`` column
    '''
    # Columns that must be populated for a listing to stay in the sample.
    required_columns = ['price', 'beds']

    df = pd.read_csv(src)

    # Filter the data for: listings with a non-missing price AND a non-missing
    # number of beds.
    # The explicit copy detaches the result from the raw frame, so callers can
    # add or overwrite columns without triggering pandas' chained-assignment
    # (SettingWithCopy) warning.
    working_sample = df.dropna(subset=required_columns).copy()

    return working_sample

In [13]:
# Run the cleaning step on the listings file and report (rows, columns)
# of the resulting working sample.
test_df = get_cleaned_data('listings.csv')
test_df.shape

(53655, 75)

In [14]:
# Per-column non-null counts and dtypes of the cleaned frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53655 entries, 0 to 56684
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            53655 non-null  int64  
 1   listing_url                                   53655 non-null  object 
 2   scrape_id                                     53655 non-null  int64  
 3   last_scraped                                  53655 non-null  object 
 4   source                                        53655 non-null  object 
 5   name                                          53655 non-null  object 
 6   description                                   0 non-null      float64
 7   neighborhood_overview                         25616 non-null  object 
 8   picture_url                                   53655 non-null  object 
 9   host_id                                       53655 non-null  int6

In [15]:
# Preview the first rows of the cleaned frame.
test_df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,7011,https://www.airbnb.com/rooms/7011,20231229030255,2023-12-31,city scrape,Home in Marsala · ★4.68 · 3 bedrooms · 4 beds ...,,,https://a0.muscache.com/pictures/miso/Hosting-...,16888,...,4.71,4.57,4.57,,t,10,10,0,0,0.18
1,22948,https://www.airbnb.com/rooms/22948,20231229030255,2023-12-31,city scrape,Bed and breakfast in Agrigento · ★4.80 · 1 bed...,,"The neighborhood is in a residential area, cen...",https://a0.muscache.com/pictures/miso/Hosting-...,88837,...,4.8,4.89,4.85,19084001C101881,f,3,0,3,0,2.46
2,35264,https://www.airbnb.com/rooms/35264,20231229030255,2023-12-30,city scrape,Farm stay in Piedimonte Etneo · ★4.79 · 4 bedr...,,,https://a0.muscache.com/pictures/62402822/1cc2...,151644,...,4.81,4.85,4.76,,f,5,5,0,0,0.94
4,43992,https://www.airbnb.com/rooms/43992,20231229030255,2023-12-31,city scrape,Vacation home in Province of Trapani · ★4.65 ·...,,,https://a0.muscache.com/pictures/9495798/e9cfb...,192525,...,4.94,4.5,4.65,19081008C203276,f,2,2,0,0,0.48
5,43995,https://www.airbnb.com/rooms/43995,20231229030255,2023-12-29,city scrape,Home in San Vito Lo Capo · ★4.82 · 3 bedrooms ...,,,https://a0.muscache.com/pictures/9498289/eef27...,192525,...,4.91,4.7,4.68,19081020C204689,f,2,2,0,0,0.43


In [16]:
# Number of missing values in each column of the cleaned frame.
test_df.isna().sum()

id                                                  0
listing_url                                         0
scrape_id                                           0
last_scraped                                        0
source                                              0
                                                ...  
calculated_host_listings_count                      0
calculated_host_listings_count_entire_homes         0
calculated_host_listings_count_private_rooms        0
calculated_host_listings_count_shared_rooms         0
reviews_per_month                               15986
Length: 75, dtype: int64

In [8]:
# Look at the price values next to the listing ids; prices are stored as
# strings with a leading '$' (e.g. '$230.00').
# NOTE(review): the recorded output of this cell still contains a row with a
# missing price, and the cell's execution count is out of sequence, so it
# looks like it was last run before the price/beds filter existed. Re-run
# with Restart & Run All to refresh it.
test_df[['id', 'price']]

Unnamed: 0,id,price
0,7011,$230.00
1,22948,$217.00
2,35264,$136.00
3,38974,
4,43992,$70.00
...,...,...
56680,1051420622489213865,$715.00
56681,1056340324556886825,$75.00
56682,1051422045186465160,$64.00
56683,1051462180284404035,$31.00
