In [2]:
import numpy as np
import pandas as pd
import itertools as it
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import scipy as sp
from itertools import combinations
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
%matplotlib inline

In [3]:
# Read listings_clean.csv (path is relative to the notebook's working directory)
df = pd.read_csv('listings_clean.csv')
# Print every column name, then preview the first rows of the frame
print(df.columns.values)
df.head()

['id' 'host_since' 'zipcode' 'latitude' 'longitude' 'property_type'
 'room_type' 'accommodates' 'bathrooms' 'bedrooms' 'beds' 'bed_type'
 'guests_included' 'minimum_nights' 'maximum_nights' 'availability_30'
 'availability_60' 'availability_90' 'availability_365' 'number_of_reviews'
 'first_review' 'last_review' 'review_scores_rating'
 'review_scores_accuracy' 'review_scores_cleanliness'
 'review_scores_checkin' 'review_scores_communication'
 'review_scores_location' 'review_scores_value' 'host_listing_count'
 'price']


Unnamed: 0,id,host_since,zipcode,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_listing_count,price
0,1069266,1926,10022.0,40.756852,-73.964754,0,0,2,1.0,1.0,...,2542.0,86.0,9.0,7.0,9.0,9.0,10.0,9.0,1,160
1,1846722,1625,,40.830599,-73.941014,0,0,10,1.0,3.0,...,2554.0,85.0,8.0,8.0,9.0,8.0,7.0,8.0,2,105
2,2061725,1831,11221.0,40.692189,-73.92412,0,1,2,1.0,1.0,...,2554.0,98.0,10.0,10.0,10.0,10.0,9.0,10.0,4,58
3,44974,953,10011.0,40.734751,-74.002592,0,0,2,1.0,1.0,...,2494.0,96.0,10.0,9.0,10.0,10.0,10.0,9.0,1,185
4,4701675,2479,10011.0,40.745282,-73.997836,0,0,2,1.0,1.0,...,2533.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,1,195


In [4]:
# Regression target: the 'price' column of the listings frame
y = df['price']

In [5]:
# Feature extraction: keep only the predictor columns we settled on last time.
features = [
    'zipcode',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'bed_type',
    'number_of_reviews',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'host_listing_count',
]

# All rows, restricted to the selected feature columns (in the order listed above)
df_x = df.loc[:, features]
df_x.head()

Unnamed: 0,zipcode,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_listing_count
0,10022.0,40.756852,-73.964754,0,0,2,1.0,1.0,1.0,0,62,86.0,9.0,7.0,9.0,9.0,10.0,9.0,1
1,,40.830599,-73.941014,0,0,10,1.0,3.0,3.0,0,22,85.0,8.0,8.0,9.0,8.0,7.0,8.0,2
2,11221.0,40.692189,-73.92412,0,1,2,1.0,1.0,2.0,0,35,98.0,10.0,10.0,10.0,10.0,9.0,10.0,4
3,10011.0,40.734751,-74.002592,0,0,2,1.0,1.0,1.0,0,26,96.0,10.0,9.0,10.0,10.0,10.0,9.0,1
4,10011.0,40.745282,-73.997836,0,0,2,1.0,1.0,2.0,0,1,100.0,10.0,10.0,10.0,10.0,10.0,10.0,1


Now we start imputing missing values. We fill in missing numerical values with the mean of their column, and missing categorical values with the most common value of their column.

In [6]:
# Partition the feature columns: these four are treated as categorical,
# every remaining column of df_x as numerical.
cate = ['zipcode', 'property_type', 'room_type', 'bed_type']
nume = [c for c in df_x.columns.values if c not in cate]

# Impute column by column:
#   categorical -> most frequent value (first entry of value_counts())
#   numerical   -> column mean
df_x_cate = df_x[cate].apply(
    lambda col: col.fillna(col.value_counts().index[0])
)
df_x_nume = df_x[nume].apply(
    lambda col: col.fillna(col.mean())
)

# Reassemble the frame side by side; categorical columns now come first
df_x = pd.concat([df_x_cate, df_x_nume], axis=1)
df_x.head()

Unnamed: 0,zipcode,property_type,room_type,bed_type,latitude,longitude,accommodates,bathrooms,bedrooms,beds,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_listing_count
0,10022.0,0,0,0,40.756852,-73.964754,2,1.0,1.0,1.0,62,86.0,9.0,7.0,9.0,9.0,10.0,9.0,1
1,11211.0,0,0,0,40.830599,-73.941014,10,1.0,3.0,3.0,22,85.0,8.0,8.0,9.0,8.0,7.0,8.0,2
2,11221.0,0,1,0,40.692189,-73.92412,2,1.0,1.0,2.0,35,98.0,10.0,10.0,10.0,10.0,9.0,10.0,4
3,10011.0,0,0,0,40.734751,-74.002592,2,1.0,1.0,1.0,26,96.0,10.0,9.0,10.0,10.0,10.0,9.0,1
4,10011.0,0,0,0,40.745282,-73.997836,2,1.0,1.0,2.0,1,100.0,10.0,10.0,10.0,10.0,10.0,10.0,1


Now we encode categorical variables.

In [7]:
# Boolean mask, aligned with df_x's column order, flagging the categorical columns
categorical = [(c in cate) for c in df_x.columns]

# Apply one-hot encoding to the flagged columns only; sparse=False asks for a dense array
encoder = preprocessing.OneHotEncoder(
    categorical_features=categorical,
    sparse=False,
)

# Fit and transform the numpy array taken from the data frame
x = encoder.fit_transform(df_x.values)
x

array([[  0.,   0.,   0., ...,  10.,   9.,   1.],
       [  0.,   0.,   0., ...,   7.,   8.,   2.],
       [  0.,   0.,   0., ...,   9.,  10.,   4.],
       ..., 
       [  0.,   0.,   0., ...,   9.,   8.,  15.],
       [  0.,   0.,   0., ...,   9.,  10.,   7.],
       [  0.,   0.,   0., ...,  10.,  10.,   2.]])

In [8]:
# Sanity check: show the shapes of the encoded feature matrix and the target.
# Single-argument print(...) produces the same output on Python 2 and Python 3,
# and matches the print(...) form already used when loading the data.
print(x.shape)
print(y.shape)

(27392, 210)
(27392,)


Now that we have preprocessed the data in both files, we can start building models. We split our training dataset into train data and test data.

In [22]:
# Hold out 30% of the rows as a test set; random_state=0 fixes the seed of the split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [23]:
# Report the size of each partition.
# The label keeps the two spaces that the original Python 2 print statement
# emitted (trailing space in the string + print's own separator), so the
# output is unchanged while the cell also runs on Python 3.
print('train data:  {}'.format(x_train.shape))
print('test data:  {}'.format(x_test.shape))

train data:  (19174, 210)
test data:  (8218, 210)


In [24]:
#OLS
import statsmodels.api as sm
X = sm.add_constant(x_train)
model = sm.OLS(y_train,X)
results = model.fit()
print results.summary()

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.277
Model:                            OLS   Adj. R-squared:                  0.270
Method:                 Least Squares   F-statistic:                     36.39
Date:                Sun, 27 Nov 2016   Prob (F-statistic):               0.00
Time:                        18:08:00   Log-Likelihood:            -1.2808e+05
No. Observations:               19174   AIC:                         2.566e+05
Df Residuals:                   18973   BIC:                         2.581e+05
Df Model:                         200                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1         -1009.6857    260.219     -3.880      0.0

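The R-squared is 0.277. There are strong multicollinearity problems: the summary reports only 200 model degrees of freedom for 210 features, so the design matrix is rank-deficient.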

In [25]:
from sklearn.linear_model import LinearRegression

# Fit scikit-learn's linear regression on the training split and report the
# score (R^2 for regressors) on both partitions.
model = LinearRegression()
model.fit(x_train, y_train)

# "{}".format(value) renders the value with str(), exactly as the Python 2
# print statement did, so the output is unchanged and the cell runs on Python 3.
print("Train Score {}".format(model.score(x_train, y_train)))
print("Test Score {}".format(model.score(x_test, y_test)))

Train Score 0.277240858765
Test Score 0.331602757153


Something weird is happening here. The score depends heavily on the random split (the next cell repeats the fit over ten different splits), and for this particular split the test set scores higher than the training set!

In [26]:
# Repeat the 70/30 split and the linear fit for seeds 0..9 to see how much the
# train/test scores move with the random split.
# Note: this rebinds x_train, x_test, y_train, y_test and model on every
# iteration, so after the loop they hold the values from the last seed (r=9).
for r in range(10):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=r)
    model = LinearRegression()
    model.fit(x_train, y_train)
    # "{}".format(value) renders via str(), matching the Python 2 print
    # statement's output while also running on Python 3.
    print("Train Score {}".format(model.score(x_train, y_train)))
    print("Test Score {}".format(model.score(x_test, y_test)))

Train Score 0.277240858765
Test Score 0.331602757153
Train Score 0.30224589911
Test Score 0.264131534085
Train Score 0.295820795368
Test Score 0.280326816692
Train Score 0.299771102086
Test Score -2.42176490768e+17
Train Score 0.309299550687
Test Score 0.258889353127
Train Score 0.29682687432
Test Score -3.31253209e+16
Train Score 0.303439053758
Test Score 0.263564351214
Train Score 0.301192833126
Test Score 0.270361512774
Train Score 0.291761521401
Test Score 0.29446008763
Train Score 0.310924874404
Test Score -6.15138837241e+14


In [28]:
# Inspect the fitted coefficients of the current `model`.
# NOTE(review): presumably the LinearRegression left over from the last
# iteration of the seed loop — confirm no intervening cell rebinds `model`.
model.coef_

array([  1.22178211e+07,  -2.82334911e+11,  -2.82334911e+11,
        -2.82334911e+11,  -2.82334911e+11,  -2.82334911e+11,
        -2.82334911e+11,  -2.82334911e+11,  -2.82334911e+11,
        -2.82334911e+11,  -2.82334911e+11,  -2.82334911e+11,
        -2.82334911e+11,  -2.82334911e+11,  -2.82334911e+11,
        -2.82334911e+11,  -2.82334911e+11,  -2.82334911e+11,
        -2.82334911e+11,  -2.82334911e+11,  -2.82334911e+11,
        -2.82334911e+11,  -2.82334911e+11,  -2.82334911e+11,
        -2.82334911e+11,  -2.82334911e+11,  -2.82334911e+11,
        -2.82334911e+11,  -2.82334911e+11,  -2.82334911e+11,
        -2.82334911e+11,  -2.82334911e+11,  -2.82334911e+11,
        -2.82334911e+11,  -2.82334911e+11,  -2.82334911e+11,
        -2.82334911e+11,  -2.82334911e+11,  -2.82334911e+11,
        -2.82334911e+11,  -4.20395096e+10,  -2.82334911e+11,
        -2.82334911e+11,  -2.82334911e+11,  -2.82334911e+11,
        -2.82334911e+11,  -2.82334911e+11,  -2.82334911e+11,
        -2.82334911e+11,