In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

from src.normalisers import NormaliseListings, NormaliseReviews

In [2]:
reviews = pd.read_csv('data/reviews.csv.gz')
listings = pd.read_csv('data/listings.csv.gz')
calendar = pd.read_csv('data/calendar.csv.gz')
wards = pd.read_csv('data/wards.csv')

In [3]:
# Normalise data
# Reviews
normalise_reviews = NormaliseReviews(reviews)

# Reviews tables
review_comments = normalise_reviews.normalise_reviews()
reviewers = normalise_reviews.normalise_reviewers()

#Listings
normalise_listings = NormaliseListings(listings)

# Listings tables

unique_hosts = normalise_listings.normalise_hosts()
unique_listings = normalise_listings.normalise_listings()
neighbourhoods = normalise_listings.normalise_neighbourhoods()
listing_reviews = normalise_listings.normalise_listing_reviews()
neighbourhood_overviews = normalise_listings.normalise_neighbourhood_overview()

# AirBnB Dashboard Plan

1. Divide the data according to ward (dropdown)
2. Show count of unique lisings (card)
3. Show average rating (card)
4. Show avarage price (card)
5. Total Listings (card)
6. Total Hosts (card) 
7. Percent Superhosts (card)
8. Average Response Rate (card)
9. Average Acceptance Rate (card)



In [4]:

listings_x_ratings = pd.merge(
    unique_listings, listing_reviews, on='listing_id', how='left'
)  
listings_x_ratings.head()

Unnamed: 0,listing_id,scrape_id,name,description,host_id,latitude,longitude,property_type,room_type,accommodates,...,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
0,15007,20250928034929,Blaauwberg House on the beach in Bloubergstrand,Welcome to our self-catering beach-front famil...,59072,-33.80001,18.46063,Entire home,Entire home/apt,6,...,2013-12-15,2024-10-19,4.81,4.91,4.83,4.98,4.94,4.94,4.85,0.33
1,15068,20250928034929,Grande Bay,Modern spacious apartment. Three bedrooms two...,59318,-33.78826,18.4594,Entire rental unit,Entire home/apt,6,...,,,,,,,,,,
2,15077,20250928034929,Relaxed beach living in style,Our Superior Rooms (approx. 25 m²) are located...,59342,-33.858356,18.490376,Private room in rental unit,Private room,2,...,2013-01-06,2022-06-16,5.0,4.86,4.86,4.86,4.71,4.86,5.0,0.05
3,15199,20250928034929,Self catering apartment,,59694,-33.91115,18.41235,Entire rental unit,Entire home/apt,2,...,2014-03-04,2016-04-15,4.5,4.0,4.0,4.5,5.0,5.0,4.0,0.01
4,15354,20250928034929,"Aurora Self Catering Units, Durbanville, Cape ...",Durbanville is centrally located for daytrips ...,60196,-33.83074,18.63142,Entire guest suite,Entire home/apt,3,...,,,,,,,,,,


In [5]:
listings_x_ratings.dtypes

listing_id                       int64
scrape_id                        int64
name                            object
description                     object
host_id                          int64
latitude                       float64
longitude                      float64
property_type                   object
room_type                       object
accommodates                     int64
bathrooms                      float64
bathrooms_text                  object
bedrooms                       float64
beds                           float64
amenities                       object
price_usd                      Float64
estimated_occupancy_l365d        int64
estimated_revenue_l365d        float64
instant_bookable                  bool
neighbourhood_id                 int64
number_of_reviews                int64
number_of_reviews_ltm            int64
number_of_reviews_l30d           int64
number_of_reviews_ly             int64
first_review                    object
last_review              

## Ward Dropdown

In [6]:
wards['neighbourhood_id'] = wards['Name'].str.replace('Ward', '').str.strip().astype(int)   

new_wards = pd.merge(
    neighbourhoods,
    wards,
    on='neighbourhood_id',
    how='left'
)

new_wards.drop(columns=['neighbourhood_cleansed'], inplace=True)
new_wards.columns = [col.lower() for col in new_wards.columns]
new_wards.drop_duplicates(inplace=True)
new_wards['neighbourhood_id'] = new_wards['neighbourhood_id']
new_wards.sort_values('neighbourhood_id')


Unnamed: 0,neighbourhood_id,name,latitude,longitude
28,1,Ward 1,-33.878654,18.570032
6,2,Ward 2,-33.894026,18.602100
31,3,Ward 3,-33.892196,18.656065
1,4,Ward 4,-33.855849,18.514835
9,5,Ward 5,-33.870701,18.542310
...,...,...,...,...
67,111,Ward 111,-33.850344,18.724919
3,112,,,
20,113,,,
2,115,,,


In [7]:
new_wards.loc[new_wards['name'].isna(), 'name'] = new_wards[new_wards['name'].isna()]['neighbourhood_id'].apply(lambda x: f"Ward {x}")

In [8]:
new_wards[new_wards['latitude'].isna()]

Unnamed: 0,neighbourhood_id,name,latitude,longitude
2,115,Ward 115,,
3,112,Ward 112,,
20,113,Ward 113,,
54,116,Ward 116,,


In [9]:
new_wards['neighbourhood_id'].dtype

dtype('int64')

In [10]:
unique_listings['neighbourhood_id'].dtype

dtype('int64')

In [None]:
# Get neighbourhood_ids with NaN latitude/longitude
nan_wards = new_wards[new_wards['latitude'].isna()]['neighbourhood_id'].unique().tolist()   

# For each ward with NaN coordinates, fill with mean from listings
for ward_id in nan_wards:
    #print("Filling coordinates for ward_id:", ward_id)
    mean_lat = unique_listings[unique_listings['neighbourhood_id'] == ward_id]['latitude'].mean()
    mean_lon = unique_listings[unique_listings['neighbourhood_id'] == ward_id]['longitude'].mean()
    #print("lat:", mean_lat, "lon:", mean_lon)
    new_wards.loc[new_wards['neighbourhood_id'] == ward_id, 'latitude'] = mean_lat
    new_wards.loc[new_wards['neighbourhood_id'] == ward_id, 'longitude'] = mean_lon

In [12]:
new_wards[new_wards['latitude'].isna()]

Unnamed: 0,neighbourhood_id,name,latitude,longitude


## Host Metrics

In [13]:
clean_hosts = unique_hosts.drop_duplicates(subset=['host_id'])
clean_hosts.drop(columns=['host_name', 'host_location', 'host_has_profile_pic', 'host_about', 'host_verifications'   ], inplace=True)

In [14]:
unique_hosts.head(10)

Unnamed: 0,host_id,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms
0,59072,Dirk,2009-12-01,"Cape Town, South Africa",Proud to live in the beautiful city of Cape To...,within a few hours,100.0,88.0,,7.0,19.0,"['email', 'phone', 'work_email']",,,1,1,0,0
1,59318,Linda,2009-12-02,,,a few days or more,0.0,33.0,,7.0,20.0,"['email', 'phone']",,,5,5,0,0
2,59342,Georg,2009-12-02,Gibraltar,challenge us please! we love to perform!,within a few hours,100.0,80.0,,7.0,7.0,"['email', 'phone']",,,6,1,5,0
3,59694,Alexa,2009-12-03,"Cape Town, South Africa",Professional businesswoman dedicated to provid...,,,,,2.0,2.0,"['email', 'phone']",,,1,1,0,0
4,60196,Ian,2009-12-05,"Cape Town, South Africa",,,,,,1.0,2.0,"['email', 'phone']",,,1,1,0,0
5,60443,Jean,2009-12-06,"Betty's Bay, South Africa",Like to be as accommodating and informative a...,a few days or more,0.0,0.0,,2.0,2.0,"['email', 'phone']",,,1,1,0,0
6,61441,Kevin,2009-12-09,"Cape Town, South Africa",,within a few hours,100.0,100.0,,7.0,7.0,"['email', 'phone', 'work_email']",,,7,7,0,0
7,63255,Cheryl,2009-12-15,"Cape Town, South Africa",Own Self Catering cottages in Cape Town,within an hour,100.0,83.0,,3.0,3.0,"['email', 'phone']",,,3,2,1,0
8,71221,Abe,2010-01-17,"Cape Town, South Africa",Businessman rom Cape Town,,,,,1.0,1.0,"['email', 'phone']",,,1,0,1,0
9,73439,Beverley,2010-01-25,"Cape Town, South Africa",We live in a beautiful leafy suburb of Cape To...,within a few hours,100.0,84.0,,5.0,5.0,"['email', 'phone']",,,5,5,0,0


In [15]:
unique_hosts.dtypes

host_id                                                  int64
host_name                                       string[python]
host_since                                      string[python]
host_location                                   string[python]
host_about                                      string[python]
host_response_time                              string[python]
host_response_rate                                     Float64
host_acceptance_rate                                   Float64
host_is_superhost                                      boolean
host_listings_count                                    float64
host_total_listings_count                              float64
host_verifications                              string[python]
host_has_profile_pic                                   boolean
host_identity_verified                                 boolean
calculated_host_listings_count                           int64
calculated_host_listings_count_entire_homes            

In [16]:
total_hosts = unique_hosts.shape[0]

In [17]:
percent_superhosts = unique_hosts['host_is_superhost'].sum() 
percent_superhosts 

np.int64(0)

In [18]:
ward = 5
filtered_listings = listings_x_ratings[listings_x_ratings['neighbourhood_id'] == ward]
filtered_listings.shape
filtered_listings.host_id.unique()

array([    74504,  15818381,  21219980,  32393396,  44663213, 107155568,
       124366961,  50194244,  50265230,  28667936, 178211542, 227341981,
       100626271, 253192637, 270224373, 302955257,   3186966, 177637434,
       318675654, 383701876, 413642089, 185511130, 226666918, 450927445,
       220885221, 468527699, 376163056, 484720027, 485384159, 273000713,
        26276773, 149668270, 522528059, 525213321, 478664533, 121883602,
       534282384, 539252657, 297953388, 559485375, 442657176, 119802353,
        97386261, 599471315, 106682925, 403076155, 119120718, 221827196,
         3811244,  74082146, 263182142,  15100439])

In [19]:
unique_hosts[unique_hosts['host_id'].isin(filtered_listings['host_id'].unique())]

Unnamed: 0,host_id,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms
12,74504,Ricardo,2010-01-28,,"Very neat portuguese guy, fun to be with, very...",,,,,1.0,1.0,['phone'],,,1,0,1,0
377,15818381,The Goblin Group,2014-05-21,"Cape Town, South Africa","With our selection of Guest Houses, in Saint H...",within an hour,100.0,100.0,,19.0,19.0,"['email', 'phone', 'work_email']",,,7,0,7,0
469,21219980,Natalie,2014-09-11,"Cape Town, South Africa",,within an hour,90.0,96.0,,6.0,6.0,"['email', 'phone']",,,4,4,0,0
903,32393396,Abiodun,2015-05-02,"Cape Town, South Africa",,,,,,3.0,3.0,"['email', 'phone', 'work_email']",,,3,2,1,0
1244,44663213,Philip & Glenda,2015-09-20,"Cape Town, South Africa",,within an hour,100.0,100.0,,1.0,1.0,"['email', 'phone']",,,1,1,0,0
2970,107155568,Angelo,2016-12-13,"Western Cape, South Africa",,,,100.0,,1.0,1.0,"['email', 'phone']",,,1,1,0,0
3396,124366961,Kobus,2017-04-05,,,,,,,1.0,1.0,"['email', 'phone']",,,1,1,0,0
3498,50194244,Garreth,2015-11-30,"Cape Town, South Africa",,,,,,1.0,1.0,['phone'],,,1,0,1,0
3920,50265230,Judy,2015-12-01,"Cape Town, South Africa",,within an hour,100.0,67.0,,1.0,1.0,"['email', 'phone']",,,1,1,0,0
4433,28667936,Nico,2015-03-03,"Cape Town, South Africa",My wife and I love travelling to new destinati...,within an hour,83.0,100.0,,1.0,1.0,"['email', 'phone', 'work_email']",,,1,1,0,0
