### Data Prep

In [4]:
# Load the data from the Apporto machine to the Colab environment

from google.colab import files
uploaded = files.upload()

Saving listings.csv to listings.csv


In [5]:
# Pandas is the Python package for data frames

import pandas as pd

In [6]:
# Part 1 Data Acquisition

# Read data from a CSV file into a data frame

df = pd.read_csv('listings.csv')

# Display the first ten rows of the data frame to examine if it is an individual-level data set
print(df.head(10))

# Display the variable list
print(df.columns.values)

# Display the number of rows and the number of columns in the data set to confirm the portrait shape
# The first element of the output is the number of rows and the second is the number of columns 
print(df.shape)

     id  ... reviews_per_month
0  2595  ...              0.37
1  3831  ...              4.82
2  5121  ...              0.36
3  5136  ...              0.01
4  5178  ...              3.42
5  5203  ...              0.88
6  5238  ...              1.20
7  5552  ...              0.49
8  5803  ...              1.30
9  6021  ...              0.89

[10 rows x 74 columns]
['id' 'listing_url' 'scrape_id' 'last_scraped' 'name' 'description'
 'neighborhood_overview' 'picture_url' 'host_id' 'host_url' 'host_name'
 'host_since' 'host_location' 'host_about' 'host_response_time'
 'host_response_rate' 'host_acceptance_rate' 'host_is_superhost'
 'host_thumbnail_url' 'host_picture_url' 'host_neighbourhood'
 'host_listings_count' 'host_total_listings_count' 'host_verifications'
 'host_has_profile_pic' 'host_identity_verified' 'neighbourhood'
 'neighbourhood_cleansed' 'neighbourhood_group_cleansed' 'latitude'
 'longitude' 'property_type' 'room_type' 'accommodates' 'bathrooms'
 'bathrooms_text' 'bedrooms' 'b

In [7]:
# Dropping all redundant variables. Variables we will not be considering in our model
rvar_list = ['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
 'neighborhood_overview', 'picture_url', 'host_id', 'host_url', 'host_name',
 'host_since', 'host_location', 'host_about', 'host_response_time',
 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
  'host_total_listings_count', 'host_verifications',
 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
 'neighbourhood_cleansed',  'latitude',
 'longitude', 'property_type', 'accommodates', 'bathrooms',
 'bathrooms_text',  'amenities', 'minimum_nights',
 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
 'minimum_maximum_nights', 'maximum_maximum_nights',
 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated',
 'has_availability', 'availability_60', 'availability_90',
 'availability_365', 'calendar_last_scraped', 
 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review',
 'last_review', 'review_scores_accuracy',
 'review_scores_cleanliness', 'review_scores_checkin',
 'review_scores_communication', 'review_scores_location',
 'review_scores_value', 'license', 
 'calculated_host_listings_count',
 'calculated_host_listings_count_entire_homes',
 'calculated_host_listings_count_private_rooms',
 'calculated_host_listings_count_shared_rooms']
df1 = df.copy()
df1 = df.drop(columns=rvar_list)

# Setting up variable types
cat_vars = ['neighbourhood_group_cleansed', 'room_type', 'instant_bookable']
num_vars = ['bedrooms', 'review_scores_rating', 'beds', 'host_listings_count', 'number_of_reviews', 'availability_30', 'reviews_per_month', 'price']

# Checking for and dealing with nulls
print(df1.isnull().sum())

df2 = df1.copy()
df2['host_listings_count'] = df1['host_listings_count'].fillna(value = df1['host_listings_count'].median())
df2['bedrooms'] = df1['bedrooms'].fillna(value = df1['bedrooms'].median())
df2['beds'] = df1['beds'].fillna(value = df1['beds'].median())
df2['review_scores_rating'] = df1['review_scores_rating'].fillna(value = df1['review_scores_rating'].median())
df2['reviews_per_month'] = df1['reviews_per_month'].fillna(value = df1['reviews_per_month'].median())

print(df2.isnull().sum())

host_listings_count                14
neighbourhood_group_cleansed        0
room_type                           0
bedrooms                         4407
beds                              505
price                               0
availability_30                     0
number_of_reviews                   0
review_scores_rating            11702
instant_bookable                    0
reviews_per_month               10633
dtype: int64
host_listings_count             0
neighbourhood_group_cleansed    0
room_type                       0
bedrooms                        0
beds                            0
price                           0
availability_30                 0
number_of_reviews               0
review_scores_rating            0
instant_bookable                0
reviews_per_month               0
dtype: int64


In [25]:
# Standardization
df3 = df2.copy()
df3[num_vars] = (df2[num_vars] - df2[num_vars].mean())/df2[num_vars].std()

# There's some error with availability_30, where it won't properly standardize unless it gets its own line for some reason
df3['availability_30'] = (df2['availability_30'] - df2['availability_30'].mean())/df2['availability_30'].std()

# Setting data types
df4 = df3.copy()
df4[cat_vars] = df3[cat_vars].astype('category')
df4[num_vars] = df3[num_vars].astype('float64')

# Making dummies
df5 = df4.copy()
df5 = pd.get_dummies(df4, prefix_sep='_')

print(df5.columns.values)

['host_listings_count' 'bedrooms' 'beds' 'price' 'availability_30'
 'number_of_reviews' 'review_scores_rating' 'reviews_per_month'
 'neighbourhood_group_cleansed_Bronx'
 'neighbourhood_group_cleansed_Brooklyn'
 'neighbourhood_group_cleansed_Manhattan'
 'neighbourhood_group_cleansed_Queens'
 'neighbourhood_group_cleansed_Staten Island' 'room_type_Entire home/apt'
 'room_type_Hotel room' 'room_type_Private room' 'room_type_Shared room'
 'instant_bookable_f' 'instant_bookable_t']


In [26]:
# Checking most common dummies
df5.mode()

Unnamed: 0,host_listings_count,bedrooms,beds,price,availability_30,number_of_reviews,review_scores_rating,reviews_per_month,neighbourhood_group_cleansed_Bronx,neighbourhood_group_cleansed_Brooklyn,neighbourhood_group_cleansed_Manhattan,neighbourhood_group_cleansed_Queens,neighbourhood_group_cleansed_Staten Island,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,instant_bookable_f,instant_bookable_t
0,-0.122368,-0.774621,-0.461279,-0.354295,-0.774621,-0.485252,-0.403734,0.284756,0,0,0,0,0,1,0,0,0,1,0


In [27]:
# Removing redundant dummies
rdummies = ['instant_bookable_f', 'room_type_Entire home/apt']
df6 = df5.copy()
df6 = df5.drop(columns=rdummies)

print(df6.columns.values)
print(df6)
## df6.to_csv('AirbnbCleaned.csv',index=False) #I added this since we need to generate the csv as well. Feel free to fix it if it's coded wrongly lol -Gunita

['host_listings_count' 'bedrooms' 'beds' 'price' 'availability_30'
 'number_of_reviews' 'review_scores_rating' 'reviews_per_month'
 'neighbourhood_group_cleansed_Bronx'
 'neighbourhood_group_cleansed_Brooklyn'
 'neighbourhood_group_cleansed_Manhattan'
 'neighbourhood_group_cleansed_Queens'
 'neighbourhood_group_cleansed_Staten Island' 'room_type_Hotel room'
 'room_type_Private room' 'room_type_Shared room' 'instant_bookable_t']
       host_listings_count  bedrooms  ...  room_type_Shared room  instant_bookable_t
0                -0.080487  1.583922  ...                      0                   0
1                -0.122368  0.247414  ...                      0                   0
2                -0.122368  1.583922  ...                      0                   0
3                -0.122368  0.561887  ...                      0                   0
4                -0.122368  1.190831  ...                      0                   0
...                    ...       ...  ...                 

### Data Modeling with Nearest Neighbor

In [28]:
# Part 5 Data Partiton

# Required package: scikit-learn. Package name in Python: sklearn
# Required subpackage: model_selection. Required function name: train_test_split
from sklearn.model_selection import train_test_split

# Placeholder variables: df4partition, testpart_size
# test_size specifies the percentage for the test partition

testpart_size = 0.2

# random_state specifies the seed for random number generator. 
# random_state = 1 unless otherwised noted
df_nontestData, df_testData = train_test_split(df6, test_size=testpart_size, random_state=1)

print(df_nontestData)

       host_listings_count  bedrooms  ...  room_type_Shared room  instant_bookable_t
10979            -0.122368 -0.774621  ...                      0                   0
3243             -0.072111  1.505304  ...                      0                   0
20881            -0.113992 -0.774621  ...                      0                   0
43152            -0.130744 -0.774621  ...                      0                   0
31224            -0.122368  1.348067  ...                      0                   0
...                    ...       ...  ...                    ...                 ...
43723            -0.130744  1.583922  ...                      0                   1
32511            -0.130744  1.583922  ...                      0                   1
5192             -0.113992 -0.774621  ...                      0                   1
12172            -0.122368 -0.774621  ...                      0                   0
33003            -0.113992 -0.145676  ...                      0 

In [29]:
# Part 6 Nearest neighbor 

# Separate the predictor values and the DV values into X and y respectively
# Placeholder variable: DV
DV = 'price'
y = df_nontestData[DV]
X = df_nontestData.drop(columns=[DV])

# First we build a kNN model with pre-specified k
k = 5


In [30]:
from sklearn.neighbors import KNeighborsRegressor

clf = KNeighborsRegressor(metric='euclidean', n_neighbors=k).fit(X,y)

# Report the performance over the test partition

X_test = df_testData.drop(columns=DV)
y_test = df_testData[DV]

from sklearn import metrics

model_object = clf

# obtain the explained variance of the k-NN model with k=5
print(metrics.explained_variance_score(y_test, model_object.predict(X_test)))

0.5742006729508411


In [34]:
# Run Nearest neighbor with k-fold cross validation
# Placeholder variable: kfolds
kfolds = 5 #for cross validation

# We first define the search scope. k (k-th nearest neighbor) will be increased from 1 to max_k
max_k = 200

param_grid = {'n_neighbors': list(range(1, max_k+1))}

from sklearn.model_selection import GridSearchCV
import time

gridsearch = GridSearchCV(KNeighborsRegressor(metric='euclidean'), param_grid, scoring='explained_variance', cv=kfolds, n_jobs=-1)

In [35]:
#calculate the time it takes to get the results
start = time.time()
gridsearch.fit(X,y)
end = time.time() - start

In [36]:
end #spent 3660 seconds to get the result

3346.9287893772125

In [37]:
#use the function to calculate the optimal value
clf_bestkNN = gridsearch.best_estimator_

In [40]:
# Display the optimal k (k-th nearest neighbor)
print(clf_bestkNN.n_neighbors)

# y_test_actual is the actual values of the DV in the test partition
y_test_actual = df_testData[DV]

# X_test is the predictor values in the test partition
X_test = df_testData.drop(columns=[DV])


# Get the explained variance of the final selected k-NN model
from sklearn.metrics import mean_squared_error
print(metrics.explained_variance_score(y_test_actual, clf_bestkNN.predict(X_test)))

16
0.6068180306049795
