In [20]:
import pandas as pd 
import numpy as np
#make pandas display all columns
pd.set_option('display.max_columns', None)

san_diego = pd.read_csv('san_diego_listing_cleaned.csv')
san_diego.head()

Unnamed: 0,id,name,host_id,host_name,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
0,6,North Park Craftsmen House * Great for Families,29,Sara,2008-03-03,within an hour,100%,100%,f,t,North Hills,32.75522,-117.12873,Entire home,Entire home/apt,6,2.0,2 baths,3.0,3.0,"[""Hair dryer"", ""Children\u2019s bikes"", ""GE ga...",199.0,1,30,1,14,14,275,153,0,0,4.81,4.82,4.71,4.99,4.97,4.77,4.8,0.78
1,12447,Cozy Getaway Pacific Beach Oasis!,48669,Jennifer,2009-10-27,within an hour,100%,90%,t,t,Pacific Beach,32.80533,-117.234,Private room in condo,Private room,2,1.0,1 private bath,1.0,0.0,"[""Hair dryer"", ""Microwave"", ""Wifi"", ""Carbon mo...",96.0,6,365,5,17,17,162,2,1,1,5.0,4.0,5.0,5.0,5.0,5.0,5.0,0.01
2,29967,"Great home, 10 min walk to Beach",129123,Michael,2010-05-21,within an hour,100%,77%,t,t,Pacific Beach,32.80751,-117.2576,Entire bungalow,Entire home/apt,6,2.0,2 baths,2.0,3.0,"[""Free parking on premises"", ""Hair dryer"", ""Sh...",227.0,4,365,3,22,47,322,107,14,0,4.77,4.74,4.57,4.93,4.93,4.82,4.77,0.63
3,54001,"La Jolla Cottage, 3Blks 2 Ocean; 2bd1bth, Stei...",252692,Marsha,2010-10-04,within a few hours,100%,78%,t,t,La Jolla,32.81301,-117.26856,Entire guesthouse,Entire home/apt,2,1.0,1 bath,2.0,3.0,"[""Ceiling fan"", ""Hair dryer"", ""Portable heater...",191.0,4,30,1,7,20,49,330,21,2,4.93,4.92,4.94,4.97,4.96,4.97,4.88,2.01
4,62949,Pacific Beach Cozy Private Hideaway Free Bikes,21532,Lisa,2009-06-14,within an hour,100%,97%,t,t,Pacific Beach,32.80621,-117.23372,Entire guest suite,Entire home/apt,2,1.0,1 bath,0.0,0.0,"[""Hair dryer"", ""Portable heater"", ""Self check-...",115.0,1,4,6,20,41,41,1082,120,7,4.86,4.91,4.85,4.96,4.95,4.83,4.81,7.51


In [21]:
numeric_df = san_diego.drop(columns=["id", "name", "host_name", "amenities", "host_since", "bathrooms_text"], axis=1)

numeric_df["host_response_rate"] = numeric_df["host_response_rate"].str.replace("%", "").astype(float)
numeric_df["host_acceptance_rate"] = numeric_df["host_acceptance_rate"].str.replace("%", "").astype(float)

dummy_cols = ["host_response_time", "host_is_superhost", "host_identity_verified", "neighbourhood_cleansed", "property_type", "room_type"]
numeric_df = pd.get_dummies(numeric_df, columns=dummy_cols)

#print categorical columns
print(numeric_df.select_dtypes(include=['object']).columns)
for col in numeric_df.select_dtypes(include=['object']).columns:
    print(numeric_df[col].value_counts())


Index([], dtype='object')


In [33]:
#co
#Find correlation matrix
corr_matrix = numeric_df.corr()
print(corr_matrix.shape)

#Find the most correlated features with price
corr_price = corr_matrix["price"].sort_values(ascending=False)
print("10 most positively correlated features with price")
print(corr_price[:10])
print("10 most negatively correlated features with price")
print(corr_price[-10:])

(190, 190)
10 most positively correlated features with price
price                                 1.000000
bathrooms                             0.646012
bedrooms                              0.610507
accommodates                          0.595608
beds                                  0.561226
property_type_Entire home             0.306324
neighbourhood_cleansed_Mission Bay    0.232610
neighbourhood_cleansed_La Jolla       0.213011
room_type_Entire home/apt             0.208830
property_type_Entire villa            0.176031
Name: price, dtype: float64
10 most negatively correlated features with price
minimum_nights                       -0.099234
property_type_Entire guesthouse      -0.103768
number_of_reviews_l30d               -0.111351
number_of_reviews_ltm                -0.127882
property_type_Entire rental unit     -0.133304
number_of_reviews                    -0.141201
reviews_per_month                    -0.148438
property_type_Private room in home   -0.164061
room_type_Priva

# Correlations with price analysis

## Positively correlated

1. bathrooms and bedrooms are very positively correlated. Bigger stay = more expensive.
2. The neighborhoods of Mission Bay and La Jolla are the most expensive neighborhoods.
3. The room_type / property type eiother being an entire home or entire villa also increases price.

## Negatively correlated

1. Longitude in this dataset is negative, this negative correlation means house closer to the coast are more expensive, which makes sense. 
2. Private rooms are negatively correlated, indicating they are significantly cheaper than other types of stays. 
3. Multiple columns indicating the number of reviews in a certain time period are negatively correlated. Indicating that the more reviews a place gets the cheaper it will be. This could be because cheaper places have more number of stays in general, therefore more reviews.
4. Property types of "Entire guesthouse" and "Entire rental unit" also seem to be cheaper. 

In [35]:
corr_rating = corr_matrix["review_scores_rating"].sort_values(ascending=False)
print("10 most positively correlated features with review_scores_rating")
print(corr_rating[:10])
print("10 most negatively correlated features with review_scores_rating")
print(corr_rating[-10:])


#drop review_scores_* and host_is_superhost_* columns
corr_rating = corr_rating.drop(index=corr_rating.index[corr_rating.index.str.contains("review_scores")])
corr_rating = corr_rating.drop(index=corr_rating.index[corr_rating.index.str.contains("host_is_superhost")])
print("10 most positively correlated features with review_scores_rating without review_scores_* and host_is_superhost_* columns")
print(corr_rating[:10])
print("10 most negatively correlated features with review_scores_rating without review_scores_* and host_is_superhost_* columns")
print(corr_rating[-10:])

10 most positively correlated features with review_scores_rating
review_scores_rating           1.000000
review_scores_accuracy         0.847966
review_scores_value            0.833266
review_scores_cleanliness      0.781260
review_scores_communication    0.736617
review_scores_checkin          0.688059
review_scores_location         0.520114
host_is_superhost_t            0.311527
number_of_reviews_l30d         0.133770
number_of_reviews_ltm          0.113518
Name: review_scores_rating, dtype: float64
10 most negatively correlated features with review_scores_rating
property_type_Room in hotel         -0.082378
property_type_Entire rental unit    -0.084673
property_type_Shared room in home   -0.085829
room_type_Hotel room                -0.088814
maximum_nights                      -0.103015
availability_365                    -0.117738
availability_90                     -0.159873
availability_60                     -0.180590
availability_30                     -0.187202
host_is_super

# Rating Correlations

The columns review_scores_accuracy review_scores_value review_scores_cleanliness review_scores_communication  review_scores_checkin review_scores_location are all reviews for specific "categories", so it makes sense these would all be highly correlated with the overall review. Also host_is_superhost is purely based off of review value (>4.8 = superhost), so it makes sense the host_is_superhost_t and host_is_superhost_f is positively / negatively correlated. 

So I removed these columns are reprinted correlation coefficients

## Positive Correlations

1. Multiple columns indicating number of reviews means more positive overall review. This could be because better places get more people, that leave better reviews that attracts more people.
2. Property types of "home", "guest house", and "guest suite" get better reviews. Not sure why
3. Longitude is positively correlated, meaning houses closer to coast have better ratings
4. host_response_rate is positively correlated, makes sense a more attentive host is reviewed better

## Negative Correlations
1. Multiple availability columns are highly negatively correlated. Means worse reviews with more availability. It makes sense a place with bad reviews does not get booked often. 
2. Maximum nights is negatively correlated. Not sure why.
3. "Hotel room", "Shared room", "Entire rental unit" and "Room in hotel" are all highly negatively correlated. Maybe people do not like smaller stays. 