In [29]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
%matplotlib inline
df = pd.read_csv('Airbnb_data/listings_SanFan.csv')
print(list(df))
# u'zipcode',u'location_price',,u'instant_bookable', u'host_is_superhost',u'host_response_rate',
selected_features = [u'price',u'accommodates',u'host_response_time',
       u'bathrooms', u'bedrooms', u'beds',u'security_deposit', u'cleaning_fee', u'guests_included',
       u'extra_people', u'minimum_nights', u'maximum_nights',u'guests_included',  
       u'availability_365','latitude','longitude', 
       u'number_of_reviews', u'review_scores_rating',u'review_scores_cleanliness', u'review_scores_checkin',
       u'review_scores_communication', u'review_scores_location',
       u'review_scores_value', u'house_rules',u'amenities','bed_type', 'room_type', 'cancellation_policy', 'property_type']
df = df.loc[:, selected_features]

['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'monthly_price', '

## Preprocess
- Handle Missing Values
- Clean Zipcode
- Clean and convert String fields to Number: "Security_deposit", "extra_people", "availability_365"
- To derive "House Rule" attributes, we parse description to unique house rule such as "smoking", "pet",.., then convert the scale (True to False)

In [26]:
## fill NaN values with values that appear most often in the corresponding attributes
df.apply(lambda x:x.fillna(x.value_counts().index[0], inplace=True))
# df.fillna(method = 'backfill', inplace = True)
# df.dropna(inplace=True)

price                          None
accommodates                   None
host_response_time             None
bathrooms                      None
bedrooms                       None
beds                           None
security_deposit               None
cleaning_fee                   None
guests_included                None
extra_people                   None
minimum_nights                 None
maximum_nights                 None
guests_included                None
availability_365               None
latitude                       None
longitude                      None
number_of_reviews              None
review_scores_rating           None
review_scores_cleanliness      None
review_scores_checkin          None
review_scores_communication    None
review_scores_location         None
review_scores_value            None
house_rules                    None
amenities                      None
bed_type                       None
room_type                      None
cancellation_policy         

In [3]:
df['review_scores_rating'].describe()
df['zipcode'].unique()
def clean_zipcode(row):
    return row[:7]
df['zipcode'] = df.apply(clean_zipcode)
df['zipcode'].unique()

pd.to_numeric(df['zipcode'], errors = 'coerce')
df['zipcode'][df['zipcode'] == '60660-1448']
df['zipcode'] = df['zipcode'].apply(lambda x: str(x)[:6])
df['zipcode'].unique()
df['zipcode'].astype(float)
sns.distplot(df['zipcode'])
# len(df)

In [15]:
# FEES and PRICES
df['price'] = df['price'].str.replace("\$|,", "").astype(float)
df['security_deposit'] = df['security_deposit'].str.replace("\$|,", "").astype(float)
df['cleaning_fee'] = df['cleaning_fee'].str.replace("\$|,", "").astype(float)
df['extra_people'] = df['extra_people'].str.replace("\$|,", "").astype(float)
df['availability'] = df['availability_365'] / 365

In [16]:
# remove rows that have 'NaN'in key features
# solve NaN cells in unimportant attributes

remove_criteria = df['price'].isnull() | df['zipcode'].isnull()
df = df[-remove_criteria]

In [30]:
# HOUSE RULES
house_rules = df['house_rules'].str.lower()

smoking = house_rules.str.contains("smoke|smoking", na= False)
df.loc[:, 'smoking'] = - smoking # False: No smoking allowed

pet = house_rules.str.contains("pet", na=False)
df.loc[:, 'pet'] = - pet

party = house_rules.str.contains("party|parties", na=False)
df.loc[:, 'party'] = - party

guest = house_rules.str.contains("guest|guests", na=False)
df.loc[:, 'guest'] = - guest

# df = df.drop(['house_rules'], axis = 1)


In [18]:
import re
# AMENITIES
amenities = list(df['amenities'])
total = ','.join(amenities)
total = total.replace("{", "").replace("}","").replace("\"", "").split(",")
amenity_items = list(set(total))
amenity_items = list(filter(None, amenity_items))
for item in amenity_items:
    if re.match(r'translation',item):
        amenity_items.remove(item)

## Find nearby Airbnb in 5-mile radius 

In [None]:
import pysal
from pysal.cg.kdtree import KDTree    

locations = df[['latitude','longitude']].values.tolist()
tree = KDTree(locations, distance_metric='Arc', radius=pysal.cg.RADIUS_EARTH_MILES)
print(tree)
current_point = (41.97654639192563, -87.68493430737107)
# get all points within 1 mile of 'current_point'
indices = tree.query_ball_point(current_point, 5)
print(indices)
# for i in indices:
#     print(locations[i])


In [19]:
# Turn Amenities into OneHotEncoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

amenities = list(df['amenities'])
# print(amenity_items)
# amenities
# df['amenities'].value_counts()
new_table = pd.DataFrame(index = df.reset_index().values[:,0], columns = amenity_items).fillna(0)
# new_table.head()
for i in range(len(amenities)):
    for item in amenity_items:
        if item in amenities[i]:
            new_table.set_value(i, item, 1)
sum_table = np.array(new_table.sum())
ind = (-sum_table).argsort()[:60]
sort_table = new_table.sum().iloc[ind]
df = df.drop(['amenities'], axis = 1)
df = pd.concat([df, sort_table], axis = 1)

  del sys.path[0]


In [20]:
# reduce list of amenities

# fill NaN with most frequent values
df.head()

Unnamed: 0,price,accommodates,host_response_time,bathrooms,bedrooms,beds,security_deposit,cleaning_fee,guests_included,extra_people,...,bed_type,room_type,cancellation_policy,property_type,availability,smoking,pet,party,guest,0
2,80.0,6.0,within an hour,1.5,2.0,3.0,150.0,65.0,4.0,15.0,...,Real Bed,Private room,moderate,Townhouse,0.432877,True,True,True,True,
11,119.0,6.0,within an hour,2.0,3.0,4.0,250.0,79.0,4.0,25.0,...,Real Bed,Entire home/apt,strict,House,0.879452,False,False,False,False,
12,52.0,2.0,within a few hours,1.0,0.0,1.0,95.0,15.0,1.0,32.0,...,Futon,Entire home/apt,strict,Apartment,0.010959,True,True,True,True,
17,102.0,2.0,within a day,2.0,1.0,1.0,300.0,75.0,1.0,0.0,...,Real Bed,Entire home/apt,moderate,Apartment,0.052055,False,False,False,False,
22,139.0,4.0,within a few hours,2.0,2.0,2.0,300.0,80.0,3.0,20.0,...,Real Bed,Entire home/apt,moderate,Apartment,0.767123,False,False,False,False,


In [33]:
# pd.isnull(df['host_response_time'].iloc[5204])
# column_list = list(df['host_response_time'])
# print(column_list)

In [23]:
unique_values =list(df['host_response_time'].unique())[:-1]
unique_values
column_list = list(df['host_response_time'])
new_table = pd.DataFrame(index = df.reset_index().values[:,0], columns = unique_values).fillna(0)
new_table.head()
for i in range(len( df )):
        for item in unique_values:
            print((item,i))
            if item in column_list[i] and pd.isnull(column_list[i])==False:
                new_table.set_value(i, item, 1) 

('within an hour', 0)
('within a few hours', 0)
('within a day', 0)
('a few days or more', 0)
('within an hour', 1)
('within a few hours', 1)
('within a day', 1)
('a few days or more', 1)
('within an hour', 2)
('within a few hours', 2)
('within a day', 2)
('a few days or more', 2)
('within an hour', 3)
('within a few hours', 3)
('within a day', 3)
('a few days or more', 3)
('within an hour', 4)
('within a few hours', 4)
('within a day', 4)
('a few days or more', 4)
('within an hour', 5)
('within a few hours', 5)
('within a day', 5)
('a few days or more', 5)
('within an hour', 6)
('within a few hours', 6)
('within a day', 6)
('a few days or more', 6)
('within an hour', 7)
('within a few hours', 7)
('within a day', 7)
('a few days or more', 7)
('within an hour', 8)
('within a few hours', 8)
('within a day', 8)
('a few days or more', 8)
('within an hour', 9)
('within a few hours', 9)
('within a day', 9)
('a few days or more', 9)
('within an hour', 10)
('within a few hours', 10)
('within a

  # Remove the CWD from sys.path while we load stuff.


('within an hour', 83)
('within a few hours', 83)
('within a day', 83)
('a few days or more', 83)
('within an hour', 84)
('within a few hours', 84)
('within a day', 84)
('a few days or more', 84)
('within an hour', 85)
('within a few hours', 85)
('within a day', 85)
('a few days or more', 85)
('within an hour', 86)
('within a few hours', 86)
('within a day', 86)
('a few days or more', 86)
('within an hour', 87)
('within a few hours', 87)
('within a day', 87)
('a few days or more', 87)
('within an hour', 88)
('within a few hours', 88)
('within a day', 88)
('a few days or more', 88)
('within an hour', 89)
('within a few hours', 89)
('within a day', 89)
('a few days or more', 89)
('within an hour', 90)
('within a few hours', 90)
('within a day', 90)
('a few days or more', 90)
('within an hour', 91)
('within a few hours', 91)
('within a day', 91)
('a few days or more', 91)
('within an hour', 92)
('within a few hours', 92)
('within a day', 92)
('a few days or more', 92)
('within an hour', 9

('within a few hours', 216)
('within a day', 216)
('a few days or more', 216)
('within an hour', 217)
('within a few hours', 217)
('within a day', 217)
('a few days or more', 217)
('within an hour', 218)
('within a few hours', 218)
('within a day', 218)
('a few days or more', 218)
('within an hour', 219)
('within a few hours', 219)
('within a day', 219)
('a few days or more', 219)
('within an hour', 220)
('within a few hours', 220)
('within a day', 220)
('a few days or more', 220)
('within an hour', 221)
('within a few hours', 221)
('within a day', 221)
('a few days or more', 221)
('within an hour', 222)
('within a few hours', 222)
('within a day', 222)
('a few days or more', 222)
('within an hour', 223)
('within a few hours', 223)
('within a day', 223)
('a few days or more', 223)
('within an hour', 224)
('within a few hours', 224)
('within a day', 224)
('a few days or more', 224)
('within an hour', 225)
('within a few hours', 225)
('within a day', 225)
('a few days or more', 225)
('wi

('a few days or more', 305)
('within an hour', 306)
('within a few hours', 306)
('within a day', 306)
('a few days or more', 306)
('within an hour', 307)
('within a few hours', 307)
('within a day', 307)
('a few days or more', 307)
('within an hour', 308)
('within a few hours', 308)
('within a day', 308)
('a few days or more', 308)
('within an hour', 309)
('within a few hours', 309)
('within a day', 309)
('a few days or more', 309)
('within an hour', 310)
('within a few hours', 310)
('within a day', 310)
('a few days or more', 310)
('within an hour', 311)
('within a few hours', 311)
('within a day', 311)
('a few days or more', 311)
('within an hour', 312)
('within a few hours', 312)
('within a day', 312)
('a few days or more', 312)
('within an hour', 313)
('within a few hours', 313)
('within a day', 313)
('a few days or more', 313)
('within an hour', 314)
('within a few hours', 314)
('within a day', 314)
('a few days or more', 314)
('within an hour', 315)
('within a few hours', 315)
('

('a few days or more', 421)
('within an hour', 422)
('within a few hours', 422)
('within a day', 422)
('a few days or more', 422)
('within an hour', 423)
('within a few hours', 423)
('within a day', 423)
('a few days or more', 423)
('within an hour', 424)
('within a few hours', 424)
('within a day', 424)
('a few days or more', 424)
('within an hour', 425)
('within a few hours', 425)
('within a day', 425)
('a few days or more', 425)
('within an hour', 426)
('within a few hours', 426)
('within a day', 426)
('a few days or more', 426)
('within an hour', 427)
('within a few hours', 427)
('within a day', 427)
('a few days or more', 427)
('within an hour', 428)
('within a few hours', 428)
('within a day', 428)
('a few days or more', 428)
('within an hour', 429)
('within a few hours', 429)
('within a day', 429)
('a few days or more', 429)
('within an hour', 430)
('within a few hours', 430)
('within a day', 430)
('a few days or more', 430)
('within an hour', 431)
('within a few hours', 431)
('

('within a few hours', 530)
('within a day', 530)
('a few days or more', 530)
('within an hour', 531)
('within a few hours', 531)
('within a day', 531)
('a few days or more', 531)
('within an hour', 532)
('within a few hours', 532)
('within a day', 532)
('a few days or more', 532)
('within an hour', 533)
('within a few hours', 533)
('within a day', 533)
('a few days or more', 533)
('within an hour', 534)
('within a few hours', 534)
('within a day', 534)
('a few days or more', 534)
('within an hour', 535)
('within a few hours', 535)
('within a day', 535)
('a few days or more', 535)
('within an hour', 536)
('within a few hours', 536)
('within a day', 536)
('a few days or more', 536)
('within an hour', 537)
('within a few hours', 537)
('within a day', 537)
('a few days or more', 537)
('within an hour', 538)
('within a few hours', 538)
('within a day', 538)
('a few days or more', 538)
('within an hour', 539)
('within a few hours', 539)
('within a day', 539)
('a few days or more', 539)
('wi

('a few days or more', 681)
('within an hour', 682)
('within a few hours', 682)
('within a day', 682)
('a few days or more', 682)
('within an hour', 683)
('within a few hours', 683)
('within a day', 683)
('a few days or more', 683)
('within an hour', 684)
('within a few hours', 684)
('within a day', 684)
('a few days or more', 684)
('within an hour', 685)
('within a few hours', 685)
('within a day', 685)
('a few days or more', 685)
('within an hour', 686)
('within a few hours', 686)
('within a day', 686)
('a few days or more', 686)
('within an hour', 687)
('within a few hours', 687)
('within a day', 687)
('a few days or more', 687)
('within an hour', 688)
('within a few hours', 688)
('within a day', 688)
('a few days or more', 688)
('within an hour', 689)
('within a few hours', 689)
('within a day', 689)
('a few days or more', 689)
('within an hour', 690)
('within a few hours', 690)
('within a day', 690)
('a few days or more', 690)
('within an hour', 691)
('within a few hours', 691)
('

('within a few hours', 809)
('within a day', 809)
('a few days or more', 809)
('within an hour', 810)
('within a few hours', 810)
('within a day', 810)
('a few days or more', 810)
('within an hour', 811)
('within a few hours', 811)
('within a day', 811)
('a few days or more', 811)
('within an hour', 812)
('within a few hours', 812)
('within a day', 812)
('a few days or more', 812)
('within an hour', 813)
('within a few hours', 813)
('within a day', 813)
('a few days or more', 813)
('within an hour', 814)
('within a few hours', 814)
('within a day', 814)
('a few days or more', 814)
('within an hour', 815)
('within a few hours', 815)
('within a day', 815)
('a few days or more', 815)
('within an hour', 816)
('within a few hours', 816)
('within a day', 816)
('a few days or more', 816)
('within an hour', 817)
('within a few hours', 817)
('within a day', 817)
('a few days or more', 817)
('within an hour', 818)
('within a few hours', 818)
('within a day', 818)
('a few days or more', 818)
('wi

('within a few hours', 969)
('within a day', 969)
('a few days or more', 969)
('within an hour', 970)
('within a few hours', 970)
('within a day', 970)
('a few days or more', 970)
('within an hour', 971)
('within a few hours', 971)
('within a day', 971)
('a few days or more', 971)
('within an hour', 972)
('within a few hours', 972)
('within a day', 972)
('a few days or more', 972)
('within an hour', 973)
('within a few hours', 973)
('within a day', 973)
('a few days or more', 973)
('within an hour', 974)
('within a few hours', 974)
('within a day', 974)
('a few days or more', 974)
('within an hour', 975)
('within a few hours', 975)
('within a day', 975)
('a few days or more', 975)
('within an hour', 976)
('within a few hours', 976)
('within a day', 976)
('a few days or more', 976)
('within an hour', 977)
('within a few hours', 977)
('within a day', 977)
('a few days or more', 977)
('within an hour', 978)
('within a few hours', 978)
('within a day', 978)
('a few days or more', 978)
('wi

('within a few hours', 1093)
('within a day', 1093)
('a few days or more', 1093)
('within an hour', 1094)
('within a few hours', 1094)
('within a day', 1094)
('a few days or more', 1094)
('within an hour', 1095)
('within a few hours', 1095)
('within a day', 1095)
('a few days or more', 1095)
('within an hour', 1096)
('within a few hours', 1096)
('within a day', 1096)
('a few days or more', 1096)
('within an hour', 1097)
('within a few hours', 1097)
('within a day', 1097)
('a few days or more', 1097)
('within an hour', 1098)
('within a few hours', 1098)
('within a day', 1098)
('a few days or more', 1098)
('within an hour', 1099)
('within a few hours', 1099)
('within a day', 1099)
('a few days or more', 1099)
('within an hour', 1100)
('within a few hours', 1100)
('within a day', 1100)
('a few days or more', 1100)
('within an hour', 1101)
('within a few hours', 1101)
('within a day', 1101)
('a few days or more', 1101)
('within an hour', 1102)
('within a few hours', 1102)
('within a day', 

('within a few hours', 1220)
('within a day', 1220)
('a few days or more', 1220)
('within an hour', 1221)
('within a few hours', 1221)
('within a day', 1221)
('a few days or more', 1221)
('within an hour', 1222)
('within a few hours', 1222)
('within a day', 1222)
('a few days or more', 1222)
('within an hour', 1223)
('within a few hours', 1223)
('within a day', 1223)
('a few days or more', 1223)
('within an hour', 1224)
('within a few hours', 1224)
('within a day', 1224)
('a few days or more', 1224)
('within an hour', 1225)
('within a few hours', 1225)
('within a day', 1225)
('a few days or more', 1225)
('within an hour', 1226)
('within a few hours', 1226)
('within a day', 1226)
('a few days or more', 1226)
('within an hour', 1227)
('within a few hours', 1227)
('within a day', 1227)
('a few days or more', 1227)
('within an hour', 1228)
('within a few hours', 1228)
('within a day', 1228)
('a few days or more', 1228)
('within an hour', 1229)
('within a few hours', 1229)
('within a day', 

TypeError: argument of type 'float' is not iterable

In [None]:
columns = ['bed_type', 'room_type', 'cancellation_policy', 'property_type', 'host_response_time']

for column in columns:  
    unique_values = list(df[column].unique())
    column_list = list(df[column])
    new_table = pd.DataFrame(index = df.reset_index().values[:,0], columns = unique_values).fillna(0)
    
    for i in range(len( column_list )):
        print(column_list)
        for item in unique_values:
            print(item)
            if item in column_list[i]:
                new_table.set_value(i, item, 1)  
    df = pd.concat( [df, new_table], axis = 1)
    df = df.drop([column], axis = 1)         
# df.columns.values

In [None]:
# 'date'	neighborhood	Safety of neighborhood	Variable cost	Initial Investment	Accomodation	Capacity	Price per day

In [None]:
df.dtypes
# df['cancellation_policy'].unique()

In [None]:

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

cluster_data=df.ix[:,2:]
X=cluster_data
y=df.ix[:,'price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

tunedParameters=[{'n_estimators':range(50,100,10)}]
clf=GridSearchCV(RandomForestRegressor(n_jobs = -1, criterion='mse'), param_grid = tunedParameters,cv=10)

clf.fit(X, y)

means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [None]:
X=cluster_data
y=df.ix[:,'price']
from collections import OrderedDict
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
ensemble_clfs = [("RandomForestClassifier, max_features=None",
        RandomForestClassifier(warm_start=True, max_features=None,
                               oob_score=True,
                               random_state=0))]

# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

# Range of `n_estimators` values to explore.
min_estimators = 10
max_estimators = 100

for label, clf in ensemble_clfs:
    for i in range(min_estimators, max_estimators + 1):
        clf.set_params(n_estimators=i)
        clf.fit(X, y)

        # Record the OOB error for each `n_estimators=i` setting.
        oob_error = 1 - clf.oob_score_
        error_rate[label].append((i, oob_error))

# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
    xs, ys = zip(*clf_err)
    plt.plot(xs, ys, label=label)

plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
plt.show()

from sklearn.ensemble import RandomForestRegressor

X=cluster_data
y=df.ix[:,'price']
tunedParameters = [{'n_estimators':100}]

clf2 = RandomForestRegressor(n_jobs = 1, criterion='mse', n_estimators=100)
#Fit Model
clf2.fit(X, y)

In [None]:
FeatImp = pd.DataFrame({'feature': list(X.columns), 'importance': list(clf2.feature_importances_)})
FeatImp = FeatImp.sort_values('importance', ascending = False)
#Set Index To Field You want to Sort Bar Chart By
FeatImp = FeatImp.set_index('feature')
FeatImp.head(100)
FeatImp.to_csv('feature_imp.csv')

In [None]:
FeatImp.index[0:30]

In [None]:
FeatImp['importance'].values[0:30]

In [None]:
import matplotlib.pyplot as plt
plt.figure()

plt.title("Feature Importance")
y_pos = np.arange(len(FeatImp.index[0:30]))
plt.bar(y_pos,FeatImp['importance'].values[0:30])
plt.xticks(y_pos, FeatImp.index[0:30],rotation='vertical')
plt.show()

In [None]:
import matplotlib.pyplot as plt

fig=plt.figure(figsize=(17,10))
# df['price'] = df.price.str.replace("\$|,", "").astype(float)
# df['price'].hist()
sns.distplot(df['price'])
# plt.show()
mean_price = df.price.iloc[:5].mean()
mean_price
# df['host_acceptance_rate'].head()

In [None]:
# df['review_scores_rating'].fillna(0, inplace=True)
df['review_scores_rating'].dropna(axis=0, inplace=True)
sns.distplot(df['review_scores_rating'])


In [None]:
# Correlation Plot
var = 'review_scores_rating'
data = pd.concat([df['price'], df[var]], axis=1)
data.plot.scatter(x=var, y='price')
data.plot.scatter(x=var, y='price', ylim=(0,1500))

In [None]:
# cluster by locations
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans

def plot_3D_clusters(X, k):
    """Plot 3 attributes in dataset to explore natural clusters within data"""
    estimators = {'k_means_3': KMeans(n_clusters=k)}
    fignum = 1
    for name, est in estimators.items():
        fig = plt.figure(fignum, figsize=(4, 3))
        plt.clf()
        ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
        plt.cla()
        est.fit(X)
        labels = est.labels_
        # Change the 2nd column in X[:,_] to choose attributes for plotting
        ax.scatter(X[:,0], X[:,1], X[:,2], c=labels.astype(np.float),edgecolor='k')
        ax.w_xaxis.set_ticklabels([])
        ax.w_yaxis.set_ticklabels([])
        ax.w_zaxis.set_ticklabels([])
        ax.set_xlabel('latitude')
        ax.set_ylabel('longitude')
        ax.set_zlabel('price')
        fignum = fignum + 1
    plt.show()
# change the n
df = pd.read_csv('Airbnb_data/listings_SanFan.csv')
df['price'] = df['price'].str.replace("\$|,", "").astype(float)
X = np.array(df[['latitude', 'longitude', 'price']])
plot_3D_clusters(X, 4)

In [None]:
# Regression model
import statsmodels.api as sm # import statsmodels 

In [None]:
df['price'].describe()
df['host_acceptance_rate'].describe()
df['host_response_rate']

In [None]:
# Transformation
# right skewed: log transform
# left skewed: power transform


In [None]:
# Create train and test data
train_df, test_df = 
norm_
norm_train_df

In [None]:
from sklearn.neighbors import KNeighborRegressor
# Create KNN model: 5 closest neighbors
knn = KNeighborRegressor(algorithm='brute', n_neighbors = 5)
cols = ['accommodates', 'bedrooms', 'bathrooms', 'beds']
knn.fit(norm_train_df[col], norm_train_df['price'])
features_predictions = knn.predict(norm_test_df[cols])
features_mse = mean_squared_error(norm_test_df['price'], features_predictions)
features_rmse = features_mse ** (1/2)
features_rmse