# Exploring the availability of Airbnb listings and their characteristics
## How can we promote earlier booking of listings and also discourage hosts from placing listings that are actually not available for rent?

In [260]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

## Load dataset

In [261]:
# Read the calendar and listings tables from the local data directory
source = "./airbnb_data/"
calendar, listings = (
    pd.read_csv(source + file_name)
    for file_name in ("calendar.csv", "listings.csv")
)

In [262]:
# Preview the first rows of the calendar table as loaded from CSV
calendar.head()

Unnamed: 0,listing_id,date,available,price,metro_area
0,2515,2018-03-05,t,69.0,NYC
1,2515,2018-03-04,t,69.0,NYC
2,2515,2018-03-03,t,69.0,NYC
3,2515,2018-03-02,t,69.0,NYC
4,2515,2018-03-01,t,69.0,NYC


## Clean dataset
### 1) Filter and clean data of interest

In [263]:
# List the distinct metro areas present in the calendar data
metro_areas = calendar['metro_area'].unique().tolist()
print(metro_areas)

['NYC', 'denver', 'chicago', 'boston', 'dc']


In [264]:
# Keep only the NYC rows. The explicit .copy() makes the result an independent
# frame, so the column assignments below do not trigger SettingWithCopyWarning.
calendar = calendar[calendar['metro_area'] == 'NYC'].copy()

# Encode 'available' as 1 (value 't') or 0 (anything else). Accepting an
# already-encoded 1 alongside the raw 't' keeps this cell safe to re-run;
# without it a second run would turn every value into 0.
calendar['available'] = calendar['available'].isin(['t', 1]).astype('int64')

# Parse the date strings into datetime values so they can be grouped by month
calendar['date'] = pd.to_datetime(calendar['date'])

calendar.head()

Unnamed: 0,listing_id,date,available,price,metro_area
0,2515,2018-03-05,1,69.0,NYC
1,2515,2018-03-04,1,69.0,NYC
2,2515,2018-03-03,1,69.0,NYC
3,2515,2018-03-02,1,69.0,NYC
4,2515,2018-03-01,1,69.0,NYC


### 2) Create table of available days by month and listing

For each listing, the availability for the next 365 calendar days is stored in the calendar dataset. Availability means that the listing has been made available for rental by the host but has not been booked. We also recognise that there may be some hosts who have made the listing available for rental but are not actually accepting bookings, for various reasons. Non-availability could mean that the listing has already been booked or that it was not made available for rental by the host. 

In [265]:
# Find start_date for dataset of each listing
# start_date = calendar.groupby('listing_id')['date'].min().to_frame()
# start_date = start_date.reset_index()
# start_date.head()

# We can check that all the start dates are in May 2017
# np.mean(start_date['date'].dt.month)

In [None]:
# For every (month, listing) pair: days marked available ('sum') and days with data ('count')
monthly_groups = calendar.groupby([pd.Grouper(key='date', freq='M'), 'listing_id'])
availability = (
    monthly_groups['available']
    .agg(['sum', 'count'])
    .reset_index()
    .rename(columns={'sum': 'available_days', 'count': 'total_days'})
    # Days with data on which the listing was not marked available
    .assign(not_available_days=lambda month: month['total_days'] - month['available_days'])
)
availability.head()

In [None]:
# Number of (month, listing) rows in the availability table
len(availability)

### 3) Focus on listings & months with low booking rate
We find that a significant percentage of listing-months (one listing in one calendar month) are available on every day of the month. That is, the listings are shown as available on the Airbnb calendar but are not booked at all during that month. We choose to focus on examining why by exploring their characteristics and identifying patterns.

In [None]:
# Listing-months in which no day was recorded as not available
no_booking = availability[availability['not_available_days'] == 0]

# Distinct listings overall, and distinct listings with at least one such month
n_listings = availability['listing_id'].nunique()
n_no_booking_listings = no_booking['listing_id'].nunique()

print("Number of unique listings: {}".format(n_listings))
print("% of listing-months with zero bookings: {}".format(len(no_booking)/len(availability)*100))
print("Ave # months/listing with zero bookings: {}".format(len(no_booking)/n_no_booking_listings))

In [None]:
# Preview the zero-booking listing-months
no_booking.head()

In [None]:
# Number of distinct listings that have at least one zero-booking month
no_booking['listing_id'].nunique()

### 4) Create dataset with unique listings and adjustment factor
Since the data starts in May 2017 and ends sometime in May 2018, each listing can have two partial May months, so counting the months with zero booked days (`not_available_days == 0`) may count one month too many. We calculate an adjustment factor, which is 1 if a month needs to be deducted from the frequency, and 0 if no deduction needs to be made. The adjustment factor is 1 when there are zero booked days in BOTH May 2017 and May 2018.

In [None]:
# Zero-booking rows that fall in May (of any year), summed per listing.
# Every row of no_booking has not_available_days == 0, so each sum is 0;
# the table effectively lists the listings with at least one zero-booking May.
is_may = no_booking['date'].dt.month == 5
may_data = (
    no_booking.loc[is_may]
    .groupby('listing_id', as_index=False)['not_available_days']
    .sum()
)
may_data.head()

In [None]:
# Adjustment factor per listing: 1 when the listing has zero-booking rows in
# two distinct May months (the "BOTH May 2017 and May 2018" case described in
# the section text), otherwise 0.
may_rows = no_booking[no_booking['date'].dt.month == 5]
may_month_counts = may_rows.groupby('listing_id')['date'].nunique()

# One row per listing that has at least one zero-booking month
adjustment = pd.DataFrame({'listing_id': no_booking['listing_id'].unique()})

# Look the May count up by listing_id. Assigning a Series taken from another
# frame would align on the positional row index instead, attaching factors to
# unrelated listings. Listings without any zero-booking May get a count of 0.
adjustment['factor'] = (
    adjustment['listing_id']
    .map(may_month_counts)
    .fillna(0)
    .ge(2)
    .astype(int)
)
adjustment.shape

### 5) Create dataset of unique high-availability listings with count of months

In [None]:
# Total zero-booking listing-months, then the number of distinct listings among them
print(no_booking.shape[0])
print(no_booking['listing_id'].nunique())

In [None]:
# Strip the month-level columns from the zero-booking rows (still one row per
# listing-month, so listings can repeat)
month_level_columns = ['available_days', 'not_available_days', 'total_days', 'date']
no_booking_mth = no_booking.drop(columns=month_level_columns)
no_booking_mth.head()

In [None]:
# Number of zero-booking months per unique listing. The counts are taken from
# `no_booking` rather than from `no_booking_mth` itself, so re-running this cell
# gives the same result instead of counting the already-aggregated rows.
no_booking_mth = (
    no_booking['listing_id']
    .value_counts()
    .reset_index(name='no_months')
    # Only has an effect when reset_index labels the former index 'index'
    .rename(columns={'index': 'listing_id'})
)

# Attach the per-listing adjustment factor
no_booking_mth = no_booking_mth.merge(adjustment, on='listing_id', how='inner')

# Deduct the adjustment from the month count, then drop the helper column
no_booking_mth['no_months'] = (
    no_booking_mth['no_months'] - no_booking_mth['factor']
).astype(int)
no_booking_mth = no_booking_mth.drop(columns='factor')

In [None]:
# Preview the per-listing counts of zero-booking months
no_booking_mth.head()

In [None]:
# Sanity check: this row count should equal the number of unique listings in no_booking
print(no_booking_mth.shape[0])

We plot a histogram of the zero-booking months and see that a significant number of unique listings have a full year of zero-booking months. There could be various reasons why these listings are experiencing a full year of zero bookings. Possible reasons:  
1) The quality of the listing could be poor such that no one books them, in which case we can explore reasons for this and characterise such listings to make recommendations to hosts.  
2) The listings are outdated and no longer maintained by hosts, in which case it would be helpful to remove them from the website so that potential customers do not have to sift through unnecessarily large numbers of listings.  
3) Related to point 2, the hosts are currently staying in their listing and indicate the listing as being available even though it is not. They reject requests to rent. It would be helpful to motivate hosts to update accurate availabilities to enhance customer experience and efficiency of matching customers to listings.

In [None]:
# Distribution of the number of fully-available months per listing
month_counts = no_booking_mth['no_months']

# The counts are whole numbers, so use one bin per integer (edges at k - 0.5
# and k + 0.5). The default 10 equal-width bins would merge or split integer
# values unevenly and distort the shape of the distribution.
bin_edges = np.arange(month_counts.min() - 0.5, month_counts.max() + 1.5, 1)

fig, ax = plt.subplots()
ax.hist(month_counts, bins=bin_edges)
ax.set_xlabel('Number of months with all days available')
ax.set_ylabel('Count of listings')
ax.set_title('Zero-booking months per listing');

In [None]:
# Save the per-listing zero-booking month counts as CSV, without the row index
output_file = 'no_booking_listings.csv'
no_booking_mth.to_csv(output_file, index=False)

## Merge datasets
Merge listings and no_booking_mth to characterise the listings with low booking rates  

In [None]:
# Preview the first rows of the listings table
listings.head()

In [None]:
# Show the column names available in the listings table
print(listings.columns)

In [None]:
# Restrict to listings whose state is 'NY', then discard the columns not needed here
unneeded_columns = ['has_availability', 'availability_30', 'host_id', 'name', 'weekly_price', 'state']
listings_new = listings.loc[listings['state'] == 'NY'].drop(columns=unneeded_columns)

In [None]:
# (rows, columns) of the filtered listings table
listings_new.shape

In [None]:
# Attach listing attributes to each zero-booking listing. The join key 'id'
# carries the same values as 'listing_id' after the inner join, so it is dropped.
no_booking_list = (
    no_booking_mth
    .merge(listings_new, how='inner', left_on='listing_id', right_on='id')
    .drop(columns='id')
)

In [None]:
# Preview the merged table of zero-booking listings and their attributes
no_booking_list.head()

The number of high-availability months is not highly correlated to any particular feature of the listing, as shown below.

In [None]:
# Pairwise correlations between the numeric/boolean columns of the merged table.
# Selecting those columns explicitly keeps the result the same across pandas
# versions: newer releases raise when DataFrame.corr meets a non-numeric column
# instead of silently skipping it.
numeric_features = no_booking_list.select_dtypes(include=['number', 'bool'])
corr = numeric_features.corr()

# Diverging palette centred on 0 and fixed to [-1, 1], so the sign and the
# strength of each correlation can be read directly from the colour
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)

# Rotate the column labels so that long feature names do not overlap
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);