# Simplifying AirBNB Dataset

This wil be my notebook connecting my past project about predicting AirBNB price to my next project about using spatial information from Foursquare API

Since previous model is too heavy on data understanding and cleaning, I will simplify the dataset using my insight from my previous project

In [177]:
import pandas as pd

In [196]:
df = pd.read_csv('air_clean.csv')

In [197]:
df.columns

Index(['id', 'host_is_superhost', 'host_identity_verified',
       'neighbourhood_group_cleansed', 'property_type', 'room_type', 'price',
       'guests_included', 'extra_people', 'minimum_nights',
       ...
       'Iron', 'Hair dryer', 'Kitchen', 'Hangers', 'Washer', 'Essentials',
       'total_amenities', 'listing_duration', 'hosting_duration',
       'is_singaporean'],
      dtype='object', length=101)

In [220]:
selected_features = ['id', 'price', 'property_type', 'room_type', 'listing_duration', 'hosting_duration', 'calculated_host_listings_count', 'guests_included', 'extra_people', 'availability_365', 'availability_60', 'total_amenities', 'Pool', 'Family/kid friendly', 'Dryer', 'TV', 'Breakfast', 'Smoke detector', 'Free parking on premises', 'Paid parking on premises', 'Essentials', 'Pets allowed', 'Shampoo', 'Gym', 'Suitable for events', 'Free street parking', 'Laptop friendly workspace', 'minimum_nights', 'maximum_nights', 'neighbourhood_group_cleansed', 'number_of_reviews_ltm', 'reviews_per_month', 'review_scores_location', 'review_scores_cleanliness', 'review_scores_accuracy', 'review_scores_value']
df_simplified = df[selected_features]
df_simplified.head()

Unnamed: 0,id,price,property_type,room_type,listing_duration,hosting_duration,calculated_host_listings_count,guests_included,extra_people,availability_365,...,Laptop friendly workspace,minimum_nights,maximum_nights,neighbourhood_group_cleansed,number_of_reviews_ltm,reviews_per_month,review_scores_location,review_scores_cleanliness,review_scores_accuracy,review_scores_value
0,49091,81,Apartment,Private room,0.0,1097.0,2,1,14,365,...,0,180,360,North Region,0,0.01,8.0,10.0,10.0,8.0
1,50646,80,Apartment,Private room,252.0,1570.0,1,2,20,365,...,0,90,730,Central Region,0,0.28,9.0,10.0,9.0,9.0
2,56334,68,Apartment,Private room,1559.0,1807.0,2,1,14,365,...,0,6,14,North Region,0,0.21,8.0,10.0,10.0,9.0
3,71609,200,Tourism,Private room,2575.0,2899.0,9,4,27,353,...,1,1,1125,East Region,2,0.13,8.0,8.0,8.0,8.0
4,71896,92,House,Private room,2206.0,2388.0,9,1,20,353,...,0,1,1125,East Region,0,0.21,8.0,8.0,8.0,8.0


# Let's Check the new dataframe scores

In [213]:
dummy_variables = pd.get_dummies(df_simplified[['neighbourhood_group_cleansed', 'property_type', 'room_type']])
one_hot_df = pd.concat([df_simplified, dummy_variables], axis=1)
one_hot_df.drop(['neighbourhood_group_cleansed', 'property_type', 'room_type'], axis=1, inplace = True)
one_hot_df.set_index('id', drop=True, inplace=True)
one_hot_df.head()

Unnamed: 0_level_0,price,listing_duration,hosting_duration,calculated_host_listings_count,guests_included,extra_people,availability_365,availability_60,total_amenities,Pool,...,property_type_Hostel,property_type_Hotel,property_type_House,property_type_Other,property_type_Serviced apartment,property_type_Tourism,property_type_Townhouse,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
49091,81,0.0,1097.0,2,1,14,365,60,10,0,...,0,0,0,0,0,0,0,0,1,0
50646,80,252.0,1570.0,1,2,20,365,60,14,1,...,0,0,0,0,0,0,0,0,1,0
56334,68,1559.0,1807.0,2,1,14,365,60,11,0,...,0,0,0,0,0,0,0,0,1,0
71609,200,2575.0,2899.0,9,4,27,353,51,27,0,...,0,0,0,0,0,1,0,0,1,0
71896,92,2206.0,2388.0,9,1,20,353,51,23,0,...,0,0,1,0,0,0,0,0,1,0


In [214]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

y = one_hot_df['price'] # Target
X = one_hot_df.drop(['price'], axis = 1) # Predictors

# Standardize
scaler = StandardScaler()

X_train_o, X_test_o, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 47)

X_train = scaler.fit_transform(X_train_o)
X_test = scaler.transform(X_test_o)

In [215]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
print(mean_squared_error(rf.predict(X_train), y_train))
print(mean_squared_error(rf.predict(X_test), y_test))
print(rf.score(X_test, y_test))

204.8154934642528
1502.9378123836793
0.7602444822453325


# Adding latitude and longitude data

In [221]:
latlng_df = pd.read_csv('Airbnb listing.csv', encoding='latin')

  interactivity=interactivity, compiler=compiler, result=result)


In [222]:
latlng_df = latlng_df[['id', 'latitude', 'longitude']]
latlng_df.head()

Unnamed: 0,id,latitude,longitude
0,49091,1.44255,103.7958
1,50646,1.33235,103.78521
2,56334,1.44246,103.79667
3,71609,1.34541,103.95712
4,71896,1.34567,103.95963


In [223]:
df_simplified_merged = df_simplified.merge(latlng_df, how = 'left', on='id')

In [224]:
df_simplified_merged.head()

Unnamed: 0,id,price,property_type,room_type,listing_duration,hosting_duration,calculated_host_listings_count,guests_included,extra_people,availability_365,...,maximum_nights,neighbourhood_group_cleansed,number_of_reviews_ltm,reviews_per_month,review_scores_location,review_scores_cleanliness,review_scores_accuracy,review_scores_value,latitude,longitude
0,49091,81,Apartment,Private room,0.0,1097.0,2,1,14,365,...,360,North Region,0,0.01,8.0,10.0,10.0,8.0,1.44255,103.7958
1,50646,80,Apartment,Private room,252.0,1570.0,1,2,20,365,...,730,Central Region,0,0.28,9.0,10.0,9.0,9.0,1.33235,103.78521
2,56334,68,Apartment,Private room,1559.0,1807.0,2,1,14,365,...,14,North Region,0,0.21,8.0,10.0,10.0,9.0,1.44246,103.79667
3,71609,200,Tourism,Private room,2575.0,2899.0,9,4,27,353,...,1125,East Region,2,0.13,8.0,8.0,8.0,8.0,1.34541,103.95712
4,71896,92,House,Private room,2206.0,2388.0,9,1,20,353,...,1125,East Region,0,0.21,8.0,8.0,8.0,8.0,1.34567,103.95963


In [225]:
df_simplified_merged.to_csv('airbnb_simplified.csv')

# Conclusion

It's good enough! Simplifying the features only reduce the accuracy about 0.4%. I will use the simplified version so that my next project is easier to understand