In [115]:
# TODO
# Use Toronto data: http://insideairbnb.com/get-the-data/

Airbnb Seattle

- #1: Does higher price mean higher rating? What if we bin listings according to price: Highest, High, Medium, Low, Lowest. And then review the correlations to ratings in each group?
- #2: What if we bin listings according to date? What are the price trends through the seasons?
- What if we bin listings according to neighborhood? Are there any trends in each neighborhood? What is the proportion of superhosts in each neighborhood?
- Aside from reviews, what characteristics can be helpful in predicting a host's rating?
- Can those same factors be used to classify whether a host is a superhost?

In [116]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style('whitegrid')

In [117]:
# Raise pandas' display limits so wide and long frames are shown without truncation.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 500

In [None]:
# Load the three raw tables (calendar, listings, reviews) from the local
# seattle_data directory; these frames are kept unmodified as the raw inputs.
calendar_df, listings_df, reviews_df = map(
    pd.read_csv,
    (
        "./seattle_data/calendar.csv",
        "./seattle_data/listings.csv",
        "./seattle_data/reviews.csv",
    ),
)

In [119]:
def explore_data(df):
    """Print a quick structural overview of a DataFrame.

    Prints, in order and separated by a row of asterisks: the first rows,
    the shape, summary statistics, column dtypes, column labels, the index,
    and the per-column count of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print(">>> New DataFrame <<< \n")
    # Each section is computed lazily so that, if one of them raises, the
    # sections before it have already been printed.
    sections = (
        ("df.head(): \n", lambda: df.head()),
        ("df.shape: \n", lambda: df.shape),
        ("df.describe(): \n", lambda: df.describe()),
        ("df.dtypes: \n", lambda: df.dtypes),
        ("df.columns: \n", lambda: df.columns),
        ("df.index: \n", lambda: df.index),
        # Label previously read "df.insa().sum()", which is not the call made.
        ("df.isna().sum(): \n", lambda: df.isna().sum()),
    )
    for position, (label, compute) in enumerate(sections):
        if position:
            # Separator goes between sections only, not after the last one.
            print("*" * 20)
        print(label, compute())

In [120]:
# df_list = [calendar_df,listings_df,reviews_df]
# for frame in df_list:
#     explore_data(frame)

    

Prepare Data For Analysis

In [121]:
# Work on copies so the raw frames loaded from disk are not altered by the
# cleaning steps that follow.
cal_df, lst_df, rev_df = (
    raw_frame.copy() for raw_frame in (calendar_df, listings_df, reviews_df)
)


Calendar DataFrame

- Duplicate listing_id as a listing may be available on various dates.
- The price of a listing_id may differ depending on date.

In [122]:
# Inspect column dtypes before cleaning: date, available and price are all
# loaded as object columns.
cal_df.dtypes

listing_id     int64
date          object
available     object
price         object
dtype: object

In [123]:
# Count missing values per column; price is the only column with gaps.
cal_df.isna().sum()

listing_id         0
date               0
available          0
price         459028
dtype: int64

In [124]:
# Drop every row that has a missing value (only price has any). Reassigning
# instead of mutating in place keeps the step explicit and chainable.
# NOTE(review): rows without a price are presumably dates on which the listing
# was not available -- confirm against the `available` column.
cal_df = cal_df.dropna()

In [125]:
# Parse the date strings of the working copy. The source must be cal_df, not
# the raw calendar_df: reading from the raw frame would parse the rows that
# were just dropped and rely on index alignment to discard them again.
cal_df['date'] = pd.to_datetime(cal_df['date'])

In [126]:
# Convert price strings (presumably of the form "$1,234.00" -- TODO confirm)
# to floats. Only literal "$" and "," characters are removed, rather than
# blindly slicing off the first character, so a value without a leading "$"
# does not lose its first digit. Anything still unparseable becomes NaN.
cal_df['price'] = pd.to_numeric(
    cal_df['price'].str.replace(r'[$,]', '', regex=True),
    errors='coerce',
)

In [127]:
# Row and column count remaining after dropping rows with missing values.
cal_df.shape

(934542, 4)

In [128]:
# Number of distinct values left in `available`; a result of 1 means the
# column is constant after cleaning and carries no information.
cal_df['available'].nunique()

1

#1: Does higher price mean higher rating? What if we bin listings according to price: Highest, High, Medium, Low, Lowest. And then review the correlations to ratings in each group?

In [129]:
# Keep only the listing id and price for the price-vs-rating question.
# NOTE(review): this frame still has one row per listing per date, so a listing
# with more rows carries more weight in the statistics computed from it --
# consider aggregating to one row per listing if that is not intended.
calendar = cal_df.drop(columns=['available', 'date'])
calendar.head()

Unnamed: 0,listing_id,price
0,241032,85.0
1,241032,85.0
9,241032,85.0
10,241032,85.0
14,241032,85.0


In [130]:
# Split prices into five equal-frequency bins (quintiles) and label them from
# cheapest to most expensive.
calendar = calendar.assign(
    price_category=pd.qcut(
        calendar['price'],
        q=5,
        labels=['Lowest', 'Low', 'Medium', 'High', 'Highest'],
    )
)

In [131]:
# Preview the frame to confirm the price_category column was added.
calendar.head()

Unnamed: 0,listing_id,price,price_category
0,241032,85.0,Low
1,241032,85.0,Low
9,241032,85.0,Low
10,241032,85.0,Low
14,241032,85.0,Low


In [132]:
# Listing id and overall review score, keeping only rows where both are present.
ratings = lst_df.loc[:, ['id', 'review_scores_rating']].dropna()

In [141]:
# Inner-join calendar prices to listing ratings on the listing identifier
# (named `listing_id` in the calendar data and `id` in the listings data).
calendar_join_ratings = calendar.merge(
    ratings,
    left_on="listing_id",
    right_on="id",
)

In [143]:
# Preview the merged frame; both key columns (listing_id and id) are retained.
calendar_join_ratings.head()

Unnamed: 0,listing_id,price,price_category,id,review_scores_rating
0,241032,85.0,Low,241032,95.0
1,241032,85.0,Low,241032,95.0
2,241032,85.0,Low,241032,95.0
3,241032,85.0,Low,241032,95.0
4,241032,85.0,Low,241032,95.0


In [145]:
# The identifier columns are no longer needed once price and rating sit on the
# same row, so drop them before grouping.
joined = calendar_join_ratings.drop(columns=['listing_id', 'id'])
joined.head()

Unnamed: 0,price,price_category,review_scores_rating
0,85.0,Low,95.0
1,85.0,Low,95.0
2,85.0,Low,95.0
3,85.0,Low,95.0
4,85.0,Low,95.0


In [146]:
# Group rows by price bin. `price_category` is categorical, so `observed` is
# passed explicitly: this keeps the current behaviour and avoids the pandas
# FutureWarning about the default changing. The key stays a one-element list,
# so iterating still yields one-element tuples as group names.
price_category_grouped = joined.groupby(
    ['price_category'],
    as_index=False,
    observed=False,
)

In [148]:
# For each price bin, print the bin name, the price/rating correlation matrix
# within that bin, and a separator line.
for name, grp in price_category_grouped:
    print(
        name,
        grp[['price', 'review_scores_rating']].corr(),
        "*"*20,
        sep="\n",
    )

('Lowest',)
                         price  review_scores_rating
price                 1.000000              0.083507
review_scores_rating  0.083507              1.000000
********************
('Low',)
                         price  review_scores_rating
price                 1.000000              0.018733
review_scores_rating  0.018733              1.000000
********************
('Medium',)
                         price  review_scores_rating
price                 1.000000              0.052364
review_scores_rating  0.052364              1.000000
********************
('High',)
                         price  review_scores_rating
price                 1.000000             -0.003879
review_scores_rating -0.003879              1.000000
********************
('Highest',)
                         price  review_scores_rating
price                 1.000000              0.084067
review_scores_rating  0.084067              1.000000
********************


No significant correlations in the different price groups. As a sanity check, let's review the correlation across all listings.

In [151]:
# Sanity check: correlation between price and rating over all merged rows,
# without splitting by price bin.
calendar_join_ratings[['price','review_scores_rating']].corr()

Unnamed: 0,price,review_scores_rating
price,1.0,0.079414
review_scores_rating,0.079414,1.0


- #2: What if we bin listings according to date? What are the price trends through the seasons?