# Airbnb Data Deep Dive – Python + Pandas Challenge

In [28]:
#!pip install pandas
import pandas as pd
# Display settings
pd.set_option('display.max_columns', None)


# Data Loading and Initial Exploration

### Here, we are going to read in the data and store it in a df. We will go further to have basic understanding by exploring the data a little

In [29]:
df = pd.read_csv("listings.csv")  # importing my file


### info() gives a bit of overview

In [30]:
df.info() # getting some basic info on the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96651 entries, 0 to 96650
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              96651 non-null  int64  
 1   name                            96651 non-null  object 
 2   host_id                         96651 non-null  int64  
 3   host_name                       96611 non-null  object 
 4   neighbourhood_group             0 non-null      float64
 5   neighbourhood                   96651 non-null  object 
 6   latitude                        96651 non-null  float64
 7   longitude                       96651 non-null  float64
 8   room_type                       96651 non-null  object 
 9   price                           62684 non-null  float64
 10  minimum_nights                  96651 non-null  int64  
 11  number_of_reviews               96651 non-null  int64  
 12  last_review                     

In [31]:
df.describe()


Unnamed: 0,id,host_id,neighbourhood_group,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
count,96651.0,96651.0,0.0,96651.0,96651.0,62684.0,96651.0,96651.0,71487.0,96651.0,96651.0,96651.0,0.0
mean,6.52602e+17,209179000.0,,51.509818,-0.127087,213.366058,5.429504,20.891734,0.958877,16.38937,139.697365,5.634665,
std,5.708808e+17,214126600.0,,0.048945,0.100853,860.901557,23.315086,49.922266,1.282595,53.299577,137.426817,11.951389,
min,13913.0,2594.0,,51.295937,-0.49676,6.0,1.0,0.0,0.01,1.0,0.0,0.0,
25%,29555180.0,26731760.0,,51.48424,-0.18906,75.0,1.0,0.0,0.15,1.0,0.0,0.0,
50%,8.123206e+17,112868400.0,,51.513791,-0.12699,135.0,2.0,4.0,0.5,2.0,93.0,0.0,
75%,1.197378e+18,406376200.0,,51.539099,-0.06788,225.0,4.0,19.0,1.23,8.0,270.0,6.0,
max,1.439673e+18,700129800.0,,51.68263,0.27896,74100.0,1125.0,1855.0,38.41,495.0,365.0,355.0,


In [32]:
df.head(n=3)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,264776,Huge Four Bedroom Apartment,1389063,Sue,,Lewisham,51.44306,-0.01948,Entire home/apt,297.0,3,68,2025-05-28,0.51,11,293,12,
1,264777,One Bedroom Apartment,1389063,Sue,,Lewisham,51.44284,-0.01997,Entire home/apt,98.0,3,24,2024-12-11,0.22,11,318,4,
2,264778,Two Bedroom Newly Refurbished Apartment,1389063,Sue,,Lewisham,51.44359,-0.02275,Entire home/apt,148.0,3,58,2025-05-01,0.43,11,302,6,


In [33]:
df.isnull().sum()

id                                    0
name                                  0
host_id                               0
host_name                            40
neighbourhood_group               96651
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                             33967
minimum_nights                        0
number_of_reviews                     0
last_review                       25164
reviews_per_month                 25164
calculated_host_listings_count        0
availability_365                      0
number_of_reviews_ltm                 0
license                           96651
dtype: int64

In [34]:
## Lets check for duplicate id so we can potential discard repeated data

In [35]:
# Duplicate IDs
duplicate_ids = df['id'].duplicated().sum()
print(f"Number of Duplicate IDs: {duplicate_ids}")

Number of Duplicate IDs: 0


In [36]:
# Check for duplicate rows across all columns
duplicate_rows = df.duplicated().sum()
print(f"Number of duplicate rows across all columns: {duplicate_rows}")

Number of duplicate rows across all columns: 0


In [37]:
df.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group               float64
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                             float64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
number_of_reviews_ltm               int64
license                           float64
dtype: object

In [38]:
df["price"] = df["price"].astype(float)
df['price'].dtypes

dtype('float64')

In [39]:
print(df.dtypes)

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group               float64
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                             float64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
number_of_reviews_ltm               int64
license                           float64
dtype: object


# Data Cleaning

In [40]:
# Many price are missing? How do we handle missing prices?
missing_prices = df['price'].isnull().sum()

print(f"There are {missing_prices} missing prices")

There are 33967 missing prices


In [41]:
perc_null = missing_prices/len(df) * 100

print(f"The percentage of null values in the df is {perc_null:.2f}%")

The percentage of null values in the df is 35.14%


In [42]:
round(df['price'],2).head(3)

0    297.0
1     98.0
2    148.0
Name: price, dtype: float64

## Convert price fields (e.g., "$2,100.00") to float.

In [43]:
# Converting price from string with $ symbol to float even though the data looked clean already
df['price'] = (
    df['price']
    .astype(str)                
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

print(f"The data type of price is now {df['price'].dtypes}")

The data type of price is now float64


In [44]:
#Parse dates (e.g., last_review) into datetime objects.

df['last_review'] = pd.to_datetime(df['last_review'], errors ='coerce')

print(f"The data type of last-review is now {df['last_review'].dtypes}")

The data type of last-review is now datetime64[ns]


In [45]:
# Handle missing values in critical fields like reviews_per_month, host_name, and neighbourhood_group.

print(df['reviews_per_month'].isnull().sum())

print(df['host_name'].isnull().sum())

print(df['neighbourhood_group'].isnull().sum())

print(df['license'].isnull().sum())

print(len(df))

25164
40
96651
96651
96651


In [None]:
# I will be replacing missing reviews_per_month with 0 as it makes sense that no reviews means 0 reviews per month
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

# For host_name, I will replace missing values with 'Unknown'
df['host_name'] = df['host_name'].fillna('Unknown')

# I will drop the license and neighbourhood_group as they have missing value throughout the data - these columns are not useful for analysis
df = df.drop(columns=['license', 'neighbourhood_group'])

# I will check the last time a review was made and fill missing last_review with a date before the earliest review date in the dataset
earliest_review_date = df['last_review'].min()
df['last_review'] = df['last_review'].fillna(earliest_review_date - pd.Timedelta(days=1))

# I will check the price column again and fill missing prices with the median price of the dataset
median_price = df['price'].median()
df['price'] = df['price'].fillna(median_price)

# I will drop any price with 0 as it does not make sense to have a listing with 0 price
df = df[df['price'] > 0]

# I will also check for availability_365 and fill missing values with 0 as it makes sense that no availability means 0 days available
df['availability_365'] = df['availability_365'].fillna(0)   
# I will drop any availabilty_365 with zero as it does not make sense to have a listing with 0 days available
df = df[df['availability_365'] > 0] 

# Final check for any remaining missing values
print(df.isnull().sum())

# Final check for duplicates
df.duplicated().value_counts()

# Final shape of the cleaned dataframe
print(f"The final shape of the cleaned dataframe is {df.shape}")

id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
number_of_reviews_ltm             0
dtype: int64


# Data Enrichment 
Create a price_per_booking column using price and minimum_nights.
Bucket availability into categories:
Full-time (availability > 300)
Part-time (100–300)
Rare (<100)

In [50]:
# Create a price_per_booking column using price and minimum_nights
df['price_per_booking'] = df['price'] * df['minimum_nights']

# lets check the new column
df[['price', 'minimum_nights', 'price_per_booking']].head(3)

Unnamed: 0,price,minimum_nights,price_per_booking
0,297.0,3,891.0
1,98.0,3,294.0
2,148.0,3,444.0


In [53]:
# Bucket availability into categories:
# Full-time (availability > 300)    
def categorize_availability(days):
    if days > 300:
        return 'Full-time'
    elif 100 <= days <= 300:
        return 'Part-time'
    else:
        return 'Rare'

df['availability_category'] = df['availability_365'].apply(categorize_availability)

# Check the new availability_category column
df[['availability_365', 'availability_category']].head(10)

Unnamed: 0,availability_365,availability_category
0,293,Part-time
1,318,Full-time
2,302,Full-time
3,328,Full-time
4,255,Part-time
5,274,Part-time
6,323,Full-time
7,295,Part-time
8,322,Full-time
9,27,Rare


In [55]:
# lets do a quick count of the new availability_category
availability_counts = df['availability_category'].value_counts()
print(availability_counts)

availability_category
Part-time    27572
Full-time    20099
Rare         18817
Name: count, dtype: int64


In [56]:
# Quick checks of the new columns
print(df.columns)

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood', 'latitude',
       'longitude', 'room_type', 'price', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365',
       'number_of_reviews_ltm', 'price_per_booking', 'availability_category'],
      dtype='object')


In [57]:
# Top 10 most expensive neighborhoods by average price

top10_expensive_neighborhoods = df.groupby('neighbourhood')['price'].mean().sort_values(ascending=False).head(10)
top10_expensive_neighborhoods

neighbourhood
City of London            367.467532
Lambeth                   352.092564
Kensington and Chelsea    350.425536
Westminster               329.347052
Camden                    223.209488
Islington                 211.186133
Hammersmith and Fulham    190.147409
Wandsworth                185.280902
Richmond upon Thames      182.666264
Brent                     167.645805
Name: price, dtype: float64

In [59]:
# Average availability and price by room type
avg_availability_price_by_room = df.groupby('room_type').agg({'availability_365': 'mean', 'price': 'mean'})
print(avg_availability_price_by_room.round(2))

availability_price_by_room = (
    df.groupby('room_type')[['availability_365', 'price']]
    .mean()
    .round(2)
)

print(availability_price_by_room)


                 availability_365   price
room_type                                
Entire home/apt            197.79  248.98
Hotel room                 249.55  286.15
Private room               213.48  121.19
Shared room                261.08   83.64
                 availability_365   price
room_type                                
Entire home/apt            197.79  248.98
Hotel room                 249.55  286.15
Private room               213.48  121.19
Shared room                261.08   83.64


In [62]:
# Quick checks of the new columns
print(df.columns)

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood', 'latitude',
       'longitude', 'room_type', 'price', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365',
       'number_of_reviews_ltm', 'price_per_booking', 'availability_category'],
      dtype='object')


In [69]:
# Which host has the most listings
top_host = df['host_name'].value_counts().idxmax()
top_host_count = df['host_name'].value_counts().max()   
print(f"The host with the most listings is {top_host} with {top_host_count} listings.")


top_hosts = (
    df.groupby('host_name')
    ['host_id']
    .value_counts()
    .head(10)
)

print(top_hosts)



The host with the most listings is James with 639 listings.
host_name            host_id  
'Connor              77121287      1
15 Basil Street      431540930     4
1Michelle            178269703     3
53 Degrees Property  309252376     3
56 Welbeck Street    217061287     1
A                    577866564    57
                     550934628     6
                     29779984      2
                     93789548      2
                     528138716     2
Name: count, dtype: int64


In [75]:
# Average price by borough or district

avg_price_by_neighbourhood = df.groupby('neighbourhood')['price'].mean().round(2).sort_values(ascending=False).head(10)
print(avg_price_by_neighbourhood)

avg_price_borough = (
    df.groupby('neighbourhood')['price']
    .mean()
    .round(2)
    .sort_values(ascending=False)
    .head(10)
)

print(avg_price_borough)

neighbourhood
City of London            367.47
Lambeth                   352.09
Kensington and Chelsea    350.43
Westminster               329.35
Camden                    223.21
Islington                 211.19
Hammersmith and Fulham    190.15
Wandsworth                185.28
Richmond upon Thames      182.67
Brent                     167.65
Name: price, dtype: float64
neighbourhood
City of London            367.47
Lambeth                   352.09
Kensington and Chelsea    350.43
Westminster               329.35
Camden                    223.21
Islington                 211.19
Hammersmith and Fulham    190.15
Wandsworth                185.28
Richmond upon Thames      182.67
Brent                     167.65
Name: price, dtype: float64


In [None]:
# Visualize the top 10 most expensive neighborhoods
#top10_expensive_neighborhoods.plot(kind='bar', title='Top 10 Most Expensive Neighborhoods', figsize=(8,4))


In [72]:
# How many listings have never been reviewed
never_reviewed_count = df['number_of_reviews'].value_counts().get(0, 0)
print(f"Number of listings that have never been reviewed: {never_reviewed_count}")

Number of listings that have never been reviewed: 15970


In [74]:
never_reviewed = df[df['number_of_reviews'] == 0]
count_never_reviewed = len(never_reviewed)

print(f"Listings never reviewed: {count_never_reviewed}")

perc_never_reviewed = (count_never_reviewed / len(df)) * 100
print(f"That's about {perc_never_reviewed:.2f}% of all listings.")

Listings never reviewed: 15970
That's about 24.02% of all listings.
