In [2]:
import numpy as np
import pandas as pd
import itertools as it
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import scipy as sp
from itertools import combinations
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
%matplotlib inline

In [3]:
# Read the cleaned listings table from CSV into the notebook-level frame `df`
df = pd.read_csv('listings_clean.csv')
# Print the column names, then preview the first rows as the cell output
print(df.columns.values)
df.head()

['id' 'host_since' 'zipcode' 'latitude' 'longitude' 'property_type'
 'room_type' 'accommodates' 'bathrooms' 'bedrooms' 'beds' 'bed_type'
 'guests_included' 'minimum_nights' 'maximum_nights' 'availability_30'
 'availability_60' 'availability_90' 'availability_365' 'number_of_reviews'
 'first_review' 'last_review' 'review_scores_rating'
 'review_scores_accuracy' 'review_scores_cleanliness'
 'review_scores_checkin' 'review_scores_communication'
 'review_scores_location' 'review_scores_value' 'host_listing_count'
 'price']


Unnamed: 0,id,host_since,zipcode,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_listing_count,price
0,1069266,1926,10022.0,40.756852,-73.964754,0,0,2,1.0,1.0,...,2542.0,86.0,9.0,7.0,9.0,9.0,10.0,9.0,1,160
1,1846722,1625,,40.830599,-73.941014,0,0,10,1.0,3.0,...,2554.0,85.0,8.0,8.0,9.0,8.0,7.0,8.0,2,105
2,2061725,1831,11221.0,40.692189,-73.92412,0,1,2,1.0,1.0,...,2554.0,98.0,10.0,10.0,10.0,10.0,9.0,10.0,4,58
3,44974,953,10011.0,40.734751,-74.002592,0,0,2,1.0,1.0,...,2494.0,96.0,10.0,9.0,10.0,10.0,10.0,9.0,1,185
4,4701675,2479,10011.0,40.745282,-73.997836,0,0,2,1.0,1.0,...,2533.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,1,195


In [4]:
# Size of the raw table as (rows, columns)
df.shape

(27392, 31)

### Remove the ~5% of samples that fall in the smallest zipcode clusters ###

In [5]:
# Remove small zipcode clusters, keeping roughly 95% of the data.
from collections import Counter

# Minimum number of listings a zipcode needs in order to be kept.
MIN_ZIPCODE_COUNT = 50

# Count the listings in each zipcode. Missing zipcodes are dropped up front:
# NaN never compares equal to itself, so Counter would otherwise typically
# store every missing value under its own key.
nb_counts = Counter(df['zipcode'].dropna())
tdf = pd.DataFrame.from_dict(nb_counts, orient='index').sort_values(by=0)

# Select clusters with at least MIN_ZIPCODE_COUNT listings, using a 1-D mask
# on the count column (column 0) instead of a 2-D mask over the whole frame.
tdf1 = tdf[tdf[0] >= MIN_ZIPCODE_COUNT]
print('Remaing data proportion: ' + str(float(tdf1[0].sum())/df.shape[0]))

zipcode_included = tdf1.index

# Keep only rows whose zipcode survived the filter; rows with a missing
# zipcode are not in zipcode_included, so they are removed as well.
df1 = df.loc[df['zipcode'].isin(zipcode_included)]
print(df1.shape)

Remaing data proportion: 0.94976635514
(26016, 31)


In [6]:
# Regression target: the price column of the zipcode-filtered frame
y = df1['price']

In [7]:
# Feature extraction: keep only the columns we settled on earlier.
# The order of this list fixes the column order of df_x.
features = [
    # location
    'zipcode', 'latitude', 'longitude',
    # listing description
    'property_type', 'room_type', 'accommodates',
    'bathrooms', 'bedrooms', 'beds', 'bed_type',
    # reviews
    'number_of_reviews', 'review_scores_rating',
    'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication',
    'review_scores_location', 'review_scores_value',
    # host
    'host_listing_count',
]

df_x = df1[features]
df_x.head()

Unnamed: 0,zipcode,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_listing_count
0,10022.0,40.756852,-73.964754,0,0,2,1.0,1.0,1.0,0,62,86.0,9.0,7.0,9.0,9.0,10.0,9.0,1
2,11221.0,40.692189,-73.92412,0,1,2,1.0,1.0,2.0,0,35,98.0,10.0,10.0,10.0,10.0,9.0,10.0,4
3,10011.0,40.734751,-74.002592,0,0,2,1.0,1.0,1.0,0,26,96.0,10.0,9.0,10.0,10.0,10.0,9.0,1
4,10011.0,40.745282,-73.997836,0,0,2,1.0,1.0,2.0,0,1,100.0,10.0,10.0,10.0,10.0,10.0,10.0,1
5,11231.0,40.67906,-73.99473,0,0,6,1.0,2.0,3.0,0,16,96.0,10.0,9.0,10.0,9.0,10.0,9.0,2


Now we start imputing missing values. We fill in missing numerical values with the mean of their column, and missing categorical values with the most common value of their column.

In [8]:
# Split the feature columns into categorical and numerical names.
cate = ['zipcode', 'property_type', 'room_type', 'bed_type']
nume = [c for c in df_x.columns.values if c not in cate]


def _fill_with_most_common(col):
    """Return `col` with missing entries replaced by its most frequent value."""
    most_common = col.value_counts().index[0]
    return col.fillna(most_common)


def _fill_with_mean(col):
    """Return `col` with missing entries replaced by the column mean."""
    return col.fillna(col.mean())


# Impute column by column: mode for categorical, mean for numerical.
df_x_cate = df_x[cate].apply(_fill_with_most_common)
df_x_nume = df_x[nume].apply(_fill_with_mean)

# Reassemble the table with the categorical columns first, numerical after.
df_x = pd.concat([df_x_cate, df_x_nume], axis=1)
df_x.head()

Unnamed: 0,zipcode,property_type,room_type,bed_type,latitude,longitude,accommodates,bathrooms,bedrooms,beds,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_listing_count
0,10022.0,0,0,0,40.756852,-73.964754,2,1.0,1.0,1.0,62,86.0,9.0,7.0,9.0,9.0,10.0,9.0,1
2,11221.0,0,1,0,40.692189,-73.92412,2,1.0,1.0,2.0,35,98.0,10.0,10.0,10.0,10.0,9.0,10.0,4
3,10011.0,0,0,0,40.734751,-74.002592,2,1.0,1.0,1.0,26,96.0,10.0,9.0,10.0,10.0,10.0,9.0,1
4,10011.0,0,0,0,40.745282,-73.997836,2,1.0,1.0,2.0,1,100.0,10.0,10.0,10.0,10.0,10.0,10.0,1
5,11231.0,0,0,0,40.67906,-73.99473,6,1.0,2.0,3.0,16,96.0,10.0,9.0,10.0,9.0,10.0,9.0,2


Now we encode categorical variables.

In [9]:
# Apply one-hot encoding to the categorical columns.
# Boolean mask with one entry per column of df_x: True where the column is categorical.
categorical = [(c in cate) for c in df_x.columns]
# NOTE(review): `categorical_features` looks specific to older scikit-learn
# releases -- confirm against the version this notebook is pinned to.
encoder = preprocessing.OneHotEncoder(
    categorical_features=categorical,
    sparse=False,
)

# Encode the plain numpy values of the feature table in one fit/transform step
x = encoder.fit_transform(df_x.values)
x

array([[  0.,   0.,   0., ...,  10.,   9.,   1.],
       [  0.,   0.,   0., ...,   9.,  10.,   4.],
       [  0.,   0.,   0., ...,  10.,   9.,   1.],
       ..., 
       [  0.,   0.,   0., ...,   9.,   8.,  15.],
       [  0.,   0.,   0., ...,   9.,  10.,   7.],
       [  0.,   0.,   0., ...,  10.,  10.,   2.]])

In [10]:
# Sanity check: the encoded feature matrix and the target must line up row for row.
assert x.shape[0] == y.shape[0], "feature matrix and target have different row counts"
print(x.shape)
print(y.shape)

(26016, 103)
(26016,)


Now that we have preprocessed the data in both files, we can start building models. We split our training dataset into train data and test data.

In [11]:
# Hold out 30% of the rows as the test set; random_state=0 pins the split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [12]:
# Report the size of each split. Building the line with str.format keeps the
# printed text identical whether print is a statement (Python 2) or a
# function (Python 3).
print('{} {}'.format('train data: ', x_train.shape))
print('{} {}'.format('test data: ', x_test.shape))

train data:  (18211, 103)
test data:  (7805, 103)


In [14]:
# OLS with statsmodels, which gives a detailed regression summary
import statsmodels.api as sm
# Add an intercept column to the training design matrix
X = sm.add_constant(x_train)
model = sm.OLS(y_train,X)
results = model.fit()
# Single-argument print call: valid under both Python 2 and Python 3
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.284
Model:                            OLS   Adj. R-squared:                  0.280
Method:                 Least Squares   F-statistic:                     72.55
Date:                Sun, 27 Nov 2016   Prob (F-statistic):               0.00
Time:                        22:46:42   Log-Likelihood:            -1.2227e+05
No. Observations:               18211   AIC:                         2.447e+05
Df Residuals:                   18111   BIC:                         2.455e+05
Df Model:                          99                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const      -9.753e+04   1.55e+04     -6.298      0.0

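The R-squared is 0.284. There are strong multicollinearity problems: the design matrix has 103 feature columns plus a constant, but the summary reports only 99 model degrees of freedom, which is what we expect when every level of each of the four one-hot-encoded categorical variables is kept alongside an intercept.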

In [15]:
# Same OLS model with sklearn (easier to score but less detailed)
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train, y_train)
# Report the score on each split. Building the line with str.format keeps the
# printed text identical whether print is a statement (Python 2) or a
# function (Python 3).
print('{} {}'.format("Train Score", model.score(x_train, y_train)))
print('{} {}'.format("Test Score", model.score(x_test, y_test)))

Train Score 0.283973128565
Test Score 0.312829854813


Interestingly, for this split the test set does better than the training set.

In [18]:
# Try several random seeds to check how stable the scores are across splits.
# NOTE(review): the loop rebinds the notebook-level x_train/x_test/y_train/y_test
# and model, so after this cell they hold the split and fit for the last seed
# (random_state=9) rather than the random_state=0 split -- confirm that later
# cells are meant to use that state.
for r in range(10):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=r)
    model = LinearRegression()
    model.fit(x_train, y_train)
    # str.format keeps the printed text identical under Python 2 and Python 3
    print('{} {}'.format("Train Score", model.score(x_train, y_train)))
    print('{} {}'.format("Test Score", model.score(x_test, y_test)))

Train Score 0.283973128565
Test Score 0.312829854813
Train Score 0.281662524074
Test Score 0.316130954042
Train Score 0.287655016733
Test Score 0.302602414551
Train Score 0.288089444533
Test Score 0.301531110812
Train Score 0.283100731778
Test Score 0.315763502854
Train Score 0.302153298049
Test Score 0.251528628196
Train Score 0.298675493312
Test Score 0.272780227842
Train Score 0.279662892491
Test Score 0.315626690708
Train Score 0.287012841804
Test Score 0.298952906367
Train Score 0.298008258421
Test Score 0.272040638302


In [19]:
# Display the fitted coefficients of whichever `model` was fitted most recently
model.coef_

array([  5.39738681e+01,   5.35075285e+00,   3.36541787e+01,
        -3.49029070e+01,   2.95913768e+01,   2.48515860e+01,
         2.56489842e+00,   5.88636554e+01,   3.25314651e+01,
         3.47834719e+01,   7.47707757e+01,   5.58380882e+01,
         8.03189467e+01,   3.59852522e+01,  -1.90212978e+01,
        -2.85968708e+00,   3.90742803e+01,   7.45623442e+01,
        -2.06912957e+01,  -2.74185000e+01,  -6.03512235e+01,
        -8.74780246e+01,  -1.00920572e+02,   5.96497348e+00,
        -6.96207419e+01,  -1.01828481e+02,  -1.23478135e+02,
        -1.22904760e+02,  -1.23417003e+02,  -1.48551276e+02,
        -6.90626552e+01,   2.36918025e+01,  -6.80137758e+01,
         2.50283521e+01,  -1.06078616e+02,  -1.30710888e+02,
         2.53771418e+01,   2.42304776e+00,  -5.67575263e-01,
         1.84465174e+02,  -7.86580617e+01,  -8.75917713e+00,
        -1.79879989e+01,   3.57063175e+00,   3.11305752e+00,
        -2.19057101e+01,  -1.40534587e+01,   3.00617791e+01,
         3.79003915e+01,

We check that the coefficients are reasonable (similar in magnitude).  Before we discarded the zipcodes with only a small number of data points, we observed high variance and numerical instability in the coefficients.