In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import sklearn
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import explained_variance_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

import pickle
import xgboost as xgb
from xgboost import plot_importance

from sklearn.neighbors import NearestNeighbors
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [48]:
# Load the listings table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files produced by a trusted step.
df = pd.read_pickle('data_similarlisting.pkl')

In [49]:
# Columns kept for the similar-listing search: the listing URL, the price,
# and the descriptive attributes of each listing (order is preserved).
keep_cols = [
    'listing_url', 'price',
    'borough', 'instant_bookable', 'room_type', 'property_type',
    'host_is_superhost', 'host_total_listings_count',
    'accommodates', 'bedrooms', 'bathrooms',
    'minimum_nights', 'maximum_nights',
    'security_deposit', 'cleaning_fee',
    'coffee_machine', 'outdoor_space',
]
df = df[keep_cols]

In [42]:
# Persist the trimmed listings table under static/.
# NOTE(review): presumably read back wherever the fitted pipeline is served, to map
# neighbour positions to listing rows — confirm against the consumer.
df.to_pickle('static/data_nn.pkl')

In [50]:
# Preview the first rows of the trimmed table.
df.head()

Unnamed: 0,listing_url,price,borough,instant_bookable,room_type,property_type,host_is_superhost,host_total_listings_count,accommodates,bedrooms,bathrooms,minimum_nights,maximum_nights,security_deposit,cleaning_fee,coffee_machine,outdoor_space
0,https://www.airbnb.com/rooms/2318,296.0,Madrona,1,Entire home/apt,House,1,2.0,9,4.0,2.5,1,1125,500.0,250.0,1.0,1.0
1,https://www.airbnb.com/rooms/9419,62.0,Georgetown,0,Private room,Apartment,1,8.0,2,1.0,3.0,2,180,100.0,20.0,1.0,0.0
2,https://www.airbnb.com/rooms/9531,165.0,Fairmount Park,0,Entire home/apt,House,1,2.0,4,2.0,1.0,3,730,400.0,120.0,1.0,1.0
3,https://www.airbnb.com/rooms/9534,125.0,Fairmount Park,0,Entire home/apt,Other,1,2.0,3,2.0,1.0,2,365,400.0,95.0,1.0,1.0
4,https://www.airbnb.com/rooms/9596,120.0,Wallingford,0,Entire home/apt,Apartment,0,5.0,4,1.0,1.0,30,60,200.0,85.0,1.0,1.0


In [11]:
# Keep the listing URLs on the side, then build the feature table by removing
# the URL and price columns from the listings.
# NOTE(review): outputs saved further down still show a `price` column in X, so
# they predate this cell — restart the kernel and re-run to refresh them.
url = df['listing_url']
X = df.drop(columns=['listing_url', 'price'])

In [54]:
# NOTE(review): scratch check of list concatenation; nothing else in the visible
# notebook reads `a` or `b` — candidate for removal.
a = ['name','age','year']
b = 'good'
[b]+a

['good', 'name', 'age', 'year']

In [13]:
# Inspect a single row of the feature table by index label.
# NOTE(review): the saved output lists `price`, which the cell that builds X drops —
# the output is stale; re-run.
X.loc[2578]

price                                    115
borough                             Belltown
instant_bookable                           0
room_type                    Entire home/apt
property_type                      Apartment
host_is_superhost                          1
host_total_listings_count                  1
accommodates                               2
bedrooms                                   0
bathrooms                                  1
minimum_nights                             5
maximum_nights                            29
security_deposit                         140
cleaning_fee                              80
coffee_machine                             1
outdoor_space                              0
Name: 2578, dtype: object

#### Apply the NearestNeighbors algorithm to find similar listings

In [17]:
# Preprocess
# Feature columns are split by dtype. Only int64/float64 and object columns are
# selected; a column of any other dtype ends up in neither list and is then
# dropped by the ColumnTransformer (its default is remainder='drop').
num_col = X.select_dtypes(include=['int64', 'float64']).columns
cat_col = X.select_dtypes(include=['object']).columns

# Numeric features: fill gaps with the column median, then standardise so that
# every feature enters the neighbour distance on a comparable scale.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scalar', StandardScaler())
])

# Categorical features: fill gaps with the most frequent category, then one-hot encode.
# fill_value is deliberately not passed: SimpleImputer only uses it when
# strategy='constant', so combining it with 'most_frequent' has no effect and
# wrongly suggests that missing categories become 'unknown'.
# handle_unknown='ignore' encodes a category not seen during fit as all zeros
# instead of raising.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_col),
    ('cat', cat_transformer, cat_col)
])

In [18]:
# Knn Pipeline
# Chain the preprocessing with an unsupervised nearest-neighbour index so that a
# raw listing row can be transformed and queried through one fitted object.
nn_steps = [
    ('preprocessor', preprocessor),
    ('knn', NearestNeighbors()),
]
nn = Pipeline(steps=nn_steps)

In [19]:
# Fit the preprocessing and build the neighbour index on all listings.
# NOTE(review): the saved repr below lists `price` among the numeric columns, so it
# comes from a fit on an X that still contained price — re-run to refresh.
nn.fit(X)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scalar',
                                                                   StandardScaler())]),
                                                  Index(['price', 'instant_bookable', 'host_is_superhost',
       'host_total_listings_count', 'accommodates', 'bedrooms', 'bathrooms',
       'minimum_nights', 'maximum_nights', 'security_deposit', 'cleaning_fee',
       'coffee_machine', 'outdoor_space'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='unknown

In [38]:
# Serialise the fitted pipeline (preprocessing + neighbour index) to disk.
with open('static/similar_listing.pickle', 'wb') as model_file:
    pickle.dump(nn, model_file)

In [46]:
# Inspect the listing used as the query example below (selected by index label).
# NOTE(review): the saved output lists `price`, which the cell that builds X drops —
# the output is stale; re-run.
X.loc[1578]

price                                    145
borough                              Fremont
instant_bookable                           1
room_type                    Entire home/apt
property_type                          House
host_is_superhost                          1
host_total_listings_count                  4
accommodates                               4
bedrooms                                   1
bathrooms                                  2
minimum_nights                             2
maximum_nights                          1125
security_deposit                         300
cleaning_fee                              70
coffee_machine                             1
outdoor_space                              1
Name: 1578, dtype: object

In [29]:
# Transform one listing; the double brackets keep the selection a one-row DataFrame.
# NOTE(review): `test_T` is not read again in the visible notebook; the following cell
# computes the same transform through the pipeline as `test` — candidate for removal.
test_T = preprocessor.transform(X.loc[[1578]])

In [35]:
# Encode listing 1578 with the pipeline's preprocessing step; the double brackets
# keep the selection a one-row DataFrame, as the transformer expects 2-D input.
test = nn.named_steps['preprocessor'].transform(X.loc[[1578]])

In [36]:
# Query the neighbour index with the encoded listing; with return_distance=False
# only the neighbours' indices are returned, not the distances.
recs = nn.named_steps['knn'].kneighbors(test, return_distance=False)

In [37]:
# Show the neighbour indices: a 2-D array with one row per queried listing.
recs

array([[1548, 1973, 1525, 4139,  847]])

In [45]:
# Collect the recommended listings and show their URLs.
# `recs` is a 2-D array of row positions (one row per query), hence `.iloc`;
# flattening it keeps each query's neighbours in their returned order.
# A single positional selection is used instead of growing the frame with
# DataFrame.append in a loop: append was removed in pandas 2.0, and the one-shot
# selection also keeps the original column dtypes.
# NOTE(review): the query listing was part of the fitted data, so it may appear
# among its own neighbours — confirm whether it should be filtered out.
rec_df = df.iloc[recs.ravel()].reset_index(drop=True)

rec_df['listing_url'].tolist()

['https://www.airbnb.com/rooms/12473960',
 'https://www.airbnb.com/rooms/14917821',
 'https://www.airbnb.com/rooms/12249990',
 'https://www.airbnb.com/rooms/28515804',
 'https://www.airbnb.com/rooms/6937492']

In [26]:
# Show the query listing as a one-row DataFrame (double brackets keep it 2-D).
# NOTE(review): the saved output lists `price`, which the cell that builds X drops —
# the output is stale; re-run.
X.loc[[1578]]

Unnamed: 0,price,borough,instant_bookable,room_type,property_type,host_is_superhost,host_total_listings_count,accommodates,bedrooms,bathrooms,minimum_nights,maximum_nights,security_deposit,cleaning_fee,coffee_machine,outdoor_space
1578,145.0,Fremont,1,Entire home/apt,House,1,4.0,4,1.0,2.0,2,1125,300.0,70.0,1.0,1.0
