In [1]:
import numpy as np
import pandas as pd
import itertools as it
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import scipy as sp
import statsmodels.api as sm
from itertools import combinations
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from collections import Counter
from sklearn import ensemble
from sklearn.cross_validation import KFold
from sklearn import svm
%matplotlib inline

In [2]:
#read the data
df = pd.read_csv('listings_clean.csv')
print(df.columns.values)
df.head()

['id' 'host_since' 'zipcode' 'latitude' 'longitude' 'property_type'
 'room_type' 'accommodates' 'bathrooms' 'bedrooms' 'beds' 'bed_type'
 'guests_included' 'minimum_nights' 'maximum_nights' 'availability_30'
 'availability_60' 'availability_90' 'availability_365' 'number_of_reviews'
 'first_review' 'last_review' 'review_scores_rating'
 'review_scores_accuracy' 'review_scores_cleanliness'
 'review_scores_checkin' 'review_scores_communication'
 'review_scores_location' 'review_scores_value' 'host_listing_count'
 'price']


Unnamed: 0,id,host_since,zipcode,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_listing_count,price
0,1069266,1926,10022.0,40.756852,-73.964754,0,0,2,1.0,1.0,...,2542.0,86.0,9.0,7.0,9.0,9.0,10.0,9.0,1,160
1,1846722,1625,,40.830599,-73.941014,0,0,10,1.0,3.0,...,2554.0,85.0,8.0,8.0,9.0,8.0,7.0,8.0,2,105
2,2061725,1831,11221.0,40.692189,-73.92412,0,1,2,1.0,1.0,...,2554.0,98.0,10.0,10.0,10.0,10.0,9.0,10.0,4,58
3,44974,953,10011.0,40.734751,-74.002592,0,0,2,1.0,1.0,...,2494.0,96.0,10.0,9.0,10.0,10.0,10.0,9.0,1,185
4,4701675,2479,10011.0,40.745282,-73.997836,0,0,2,1.0,1.0,...,2533.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,1,195


In [3]:
df.shape

(27392, 31)

### Remove 5% samples from smallest zipcode clusters ###

In [4]:
# remove small zipcode clusters, with 95% data left
# count each zipcode entries
nb_counts = Counter(df['zipcode'])
tdf = pd.DataFrame.from_dict(nb_counts, orient='index').sort_values(by=0)
# select clusters with size >= 50
tdf1 = tdf[tdf.values>=50]
print 'Remaining data proportion: ' + str(float(sum(tdf1.values))/df.shape[0])
# get zipcode clusters
zipcode_included = tdf1.index

df1 = df.loc[df['zipcode'].isin(zipcode_included)]
print df1.shape

Remaining data proportion: 0.94976635514
(26016, 31)


In [5]:
y = df1['price']

In [6]:
#Feature Extraction:
#Use the features we decided last time
features = ['zipcode', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 
            'bathrooms', 'bedrooms', 'beds', 'bed_type', 'number_of_reviews', 'review_scores_rating',
            'review_scores_accuracy', 'review_scores_cleanliness', 
            'review_scores_checkin', 'review_scores_communication', 
            'review_scores_location', 'review_scores_value', 'host_listing_count']

df_x = df1[features]
df_x.head()

Unnamed: 0,zipcode,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_listing_count
0,10022.0,40.756852,-73.964754,0,0,2,1.0,1.0,1.0,0,62,86.0,9.0,7.0,9.0,9.0,10.0,9.0,1
2,11221.0,40.692189,-73.92412,0,1,2,1.0,1.0,2.0,0,35,98.0,10.0,10.0,10.0,10.0,9.0,10.0,4
3,10011.0,40.734751,-74.002592,0,0,2,1.0,1.0,1.0,0,26,96.0,10.0,9.0,10.0,10.0,10.0,9.0,1
4,10011.0,40.745282,-73.997836,0,0,2,1.0,1.0,2.0,0,1,100.0,10.0,10.0,10.0,10.0,10.0,10.0,1
5,11231.0,40.67906,-73.99473,0,0,6,1.0,2.0,3.0,0,16,96.0,10.0,9.0,10.0,9.0,10.0,9.0,2


Now we start imputing missing values. We fill in numerical values with the mean of its columns, fill in categorical values with most common value of its column.

In [7]:
#categorical and numerical column names
cate = ['zipcode', 'property_type', 'room_type', 'bed_type']
nume = [c for c in df_x.columns.values if c not in cate]

#fill categorical with mode, numerical with mean
df_x_cate = df_x[cate].apply(lambda x:x.fillna(x.value_counts().index[0]))
df_x_nume = df_x[nume].apply(lambda x:x.fillna(x.mean()))

df_x = pd.concat([df_x_cate, df_x_nume], axis=1)
df_x.head()

Unnamed: 0,zipcode,property_type,room_type,bed_type,latitude,longitude,accommodates,bathrooms,bedrooms,beds,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_listing_count
0,10022.0,0,0,0,40.756852,-73.964754,2,1.0,1.0,1.0,62,86.0,9.0,7.0,9.0,9.0,10.0,9.0,1
2,11221.0,0,1,0,40.692189,-73.92412,2,1.0,1.0,2.0,35,98.0,10.0,10.0,10.0,10.0,9.0,10.0,4
3,10011.0,0,0,0,40.734751,-74.002592,2,1.0,1.0,1.0,26,96.0,10.0,9.0,10.0,10.0,10.0,9.0,1
4,10011.0,0,0,0,40.745282,-73.997836,2,1.0,1.0,2.0,1,100.0,10.0,10.0,10.0,10.0,10.0,10.0,1
5,11231.0,0,0,0,40.67906,-73.99473,6,1.0,2.0,3.0,16,96.0,10.0,9.0,10.0,9.0,10.0,9.0,2


Now we encode categorical variables.

In [8]:
# Get numpy array from data
x = df_x.values

# Apply one hot endcoing
categorical = [(c in cate) for c in df_x.columns]
encoder = preprocessing.OneHotEncoder(categorical_features=categorical, sparse=False)  

x = encoder.fit_transform(x)
x

array([[  0.,   0.,   0., ...,  10.,   9.,   1.],
       [  0.,   0.,   0., ...,   9.,  10.,   4.],
       [  0.,   0.,   0., ...,  10.,   9.,   1.],
       ..., 
       [  0.,   0.,   0., ...,   9.,   8.,  15.],
       [  0.,   0.,   0., ...,   9.,  10.,   7.],
       [  0.,   0.,   0., ...,  10.,  10.,   2.]])

In [9]:
print x.shape
print y.shape

(26016, 103)
(26016,)


Now we've already preprocessed data in both files. We could start build models. We split our training dataset into train data and test data.

Now let's try random forest model.
Three important parameters of `sklearn`'s Random Forest module that influence the model fit are the *number of trees*, `n_estimators`, the number of predictors to consider for each split, `max_features`, and the maximum depth of the trees, `max_depth`. Below, we'll tune all the these parameters, using 5-fold cross-validation.

In [33]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
y_train = y_train.values
y_test = y_test.values

In [None]:
import sys
# Parameters for tuning
n_trees = np.arange(10, 100, 10) # Trees and depth are explored on an exponentially growing space,
depths = np.arange(2, 10)   # since it is assumed that trees and depth will add accuracy in a decaying fashion.
n_features = np.arange(10, 55, 5)
# To keep track of the best model
best_score = 0

# Run grid search for model with 5-fold cross validation
#print '5-fold cross validation:'

num_loops = len(n_trees) * len(depths) * len(n_features)
count=0

for trees in n_trees:
    for depth in depths:
        for features in n_features:
        
            # Cross validation for every experiment
            k_folds = KFold(x_train.shape[0], n_folds=5, shuffle=True)
            scores = []
            for train_indices, validation_indices in k_folds:

                # Generate training data
                x_train_cv = x_train[train_indices]
                y_train_cv = y_train[train_indices]
                # Generate validation data
                x_validate = x_train[validation_indices]
                y_validate = y_train[validation_indices]

                # Fit random forest on training data
                model = ensemble.RandomForestRegressor(n_estimators=trees, max_depth=depth, max_features=features)
                model.fit(x_train_cv, y_train_cv)
                # Score on validation data
                scores += [model.score(x_validate, y_validate)]

            # Record and report accuracy
            average_score = np.mean(scores)
            #print "Trees:", trees, "Depth:", depth, "max_features", features, "Score:", average_score
            
            #Print progress
            count=count+1
            sys.stdout.write("\r{0}% completed".format((float(count)/num_loops)*100))
            sys.stdout.flush()
            # Update our record of the best parameters see so far
            if average_score > best_score:
                best_score = average_score
                best_trees = trees
                best_depth = depth
                best_features = features


(Note: Sorry that I accidently deleted the output showing the progress. It takes around an hour to tune in total.)

In [34]:
# Fit model on entire train set using chosen number of trees and depth
model = ensemble.RandomForestRegressor(n_estimators=best_trees, max_depth=best_depth, max_features=best_features )
model.fit(x_train, y_train)
print 'Chosen number of trees, depth:', best_trees, ',', best_depth, ',', best_features
print 'Test accuracy:', model.score(x_test, y_test)

Chosen number of trees, depth: 70 , 8 , 35
Test accuracy: 0.375879343643


In [28]:
for r in range(10):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=r)
    y_train = y_train.values
    y_test = y_test.values
    model = ensemble.RandomForestRegressor(n_estimators=best_trees, max_depth=best_depth, max_features=best_features)
    model.fit(x_train, y_train)
    print "Train {:.3f} Test {:.3f}".format(model.score(x_train, y_train), model.score(x_test, y_test))

Train 0.602 Test 0.376
Train 0.599 Test 0.364
Train 0.619 Test 0.363
Train 0.610 Test 0.308
Train 0.625 Test 0.379
Train 0.634 Test 0.329
Train 0.616 Test 0.346
Train 0.601 Test 0.379
Train 0.605 Test 0.396
Train 0.613 Test 0.336


It seems that random forest model can help to improve our prediction accuracy. Recall that the score of our previous OLS model is 0.30. Using tuned random forest model, the score could be improved to around 0.37.

In [10]:
#SVM
from math import pow

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)


model = svm.SVC()
model.fit(x_train, y_train)
print "Train {:.3f} Test {:.3f}".format(model.score(x_train, y_train), model.score(x_test, y_test))

Train 0.093 Test 0.078


I also tried SVM model. The above SVM model(without tuning) has already takes several minutes. I tried tuing C with "linear" kernel, and the result is just as poor as above. Therefore, it seems that running SVM models would take very long time, and the result is not as good as Random Forest Model. So I decided not to go ahead with SVM model.