In [106]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn import metrics
import xgboost as xgb

from scipy.stats import pearsonr

%matplotlib inline
pd.set_option('display.max_columns', 100)

In [98]:
pair_ca = pd.read_csv('../data/pair_ca.csv')
univ_ca = pd.read_csv('../data/university_ca.csv')

In [99]:
pair_ca = pair_ca[pair_ca['distance'] < 10]

In [100]:
features = ['UniversityName', 'saleprice', 'yearbuilt', 'numbed', 'num_bath_full', 'num_bath_part', 'garage', 
            'size', 'pool', 'style', 'gatedCommunity', 'renovation', 'distance']
pair_ca = pair_ca[features]

print pair_ca.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8969 entries, 0 to 16156
Data columns (total 13 columns):
UniversityName    8969 non-null object
saleprice         8969 non-null int64
yearbuilt         8969 non-null object
numbed            8969 non-null int64
num_bath_full     8969 non-null int64
num_bath_part     8969 non-null int64
garage            8969 non-null object
size              8969 non-null int64
pool              8969 non-null int64
style             8848 non-null object
gatedCommunity    8969 non-null int64
renovation        8969 non-null int64
distance          8969 non-null float64
dtypes: float64(1), int64(8), object(4)
memory usage: 981.0+ KB
None


In [101]:
# convert data type, clean up missing values
pair_ca = pair_ca[pair_ca['size'] > 0]
pair_ca = pair_ca[pair_ca['saleprice'] > 0]

pair_ca['yearbuilt'] = pd.to_numeric(pair_ca['yearbuilt'], downcast='integer', errors='coerce')
pair_ca['numbed'] = pair_ca['numbed'].replace({-1:0})
pair_ca['num_bath_full'] = pair_ca['num_bath_full'].replace({-1:0})
pair_ca['num_bath_part'] = pair_ca['num_bath_part'].replace({-1:0, 1:0.5})
pair_ca['garage'] = pair_ca['garage'].replace({'\\N':0, '1':1})
pair_ca['numbath'] = pair_ca['num_bath_full'] + pair_ca['num_bath_part']
pair_ca.drop(['num_bath_full', 'num_bath_part'], axis = 1, inplace = True)

# label encoding style
le = preprocessing.LabelEncoder()
pair_ca['style'] = le.fit_transform(pair_ca['style'])

pair_ca['distance2'] = np.log(pair_ca['distance'])
pair_ca.dropna(inplace=True)

print pair_ca.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4322 entries, 2 to 16156
Data columns (total 13 columns):
UniversityName    4322 non-null object
saleprice         4322 non-null int64
yearbuilt         4322 non-null float64
numbed            4322 non-null int64
garage            4322 non-null int64
size              4322 non-null int64
pool              4322 non-null int64
style             4322 non-null int64
gatedCommunity    4322 non-null int64
renovation        4322 non-null int64
distance          4322 non-null float64
numbath           4322 non-null float64
distance2         4322 non-null float64
dtypes: float64(4), int64(8), object(1)
memory usage: 472.7+ KB
None


In [102]:
print pearsonr(pair_ca['numbed'], pair_ca['numbath'])

(0.49863561173915216, 1.663697511190432e-270)


In [103]:
data = []

# find the top 5 university groups with most homes nearby
ca_grouped = pair_ca.groupby('UniversityName')
home_count = ca_grouped.size()
home_count.sort_values(inplace=True,ascending=False)
home_count_top5 = home_count[:5]
univ_names = [name for name in home_count_top5.index]
print home_count_top5

for univ_name in univ_names:
    one_univ = ca_grouped.get_group(univ_name)
    one_univ.drop('UniversityName', axis = 1, inplace=True)
    data.append(one_univ)

UniversityName
California State University   - Sacramento     137
Soka University of America                     125
William Jessup University                      119
California State University   - Bakersfield    115
California State University   - San Marcos     109
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [104]:
print data[0].head()

       saleprice  yearbuilt  numbed  garage  size  pool  style  \
13773     799900     1987.0       3       1  2159     0      8   
13784     309900     2012.0       4       1  1654     0      8   
13792     499000     2008.0       5       1  3314     1      8   
13813     329950     1989.0       4       1  2235     0      8   
13823     169953     1990.0       4       1  1848     0      8   

       gatedCommunity  renovation  distance  numbath  distance2  
13773               1           0  8.410863      3.0   2.129524  
13784               0           0  7.973355      2.0   2.076105  
13792               0           0  8.089785      4.0   2.090602  
13813               0           0  7.936815      3.0   2.071512  
13823               0           0  6.653039      2.5   1.895074  


In [105]:
# linear regression
model = linear_model.LinearRegression()

for i, df in enumerate(data):
    df = (df - df.mean()) / df.std()
    train, test = train_test_split(df, test_size=0.2, random_state=111)
    trainY, trainX = train.iloc[:,0], train.iloc[:,1:]
    testY, testX = test.iloc[:,0], test.iloc[:,1:]
    trainY = np.reshape(trainY, (-1,1))
    testY = np.reshape(testY, (-1,1))
    
    model.fit(trainX, trainY)
    print univ_names[i]
    print model.coef_
    print np.sqrt(metrics.mean_squared_error(testY, model.predict(testX)))

California State University   - Sacramento
[[-0.17938422 -0.41158743 -0.04017058  0.90263667 -0.00133416  0.13179607
   0.07100701  0.10437577 -0.2602238   0.25284964  0.10703264]]
0.518238343804
Soka University of America
[[-0.13314548 -0.15450216 -0.02986479  0.67410808 -0.00434575 -0.00233445
   0.02725416  0.02040986 -0.01037731  0.3244468  -0.05648996]]
1.04002727302
William Jessup University
[[-0.14144154 -0.26398609  0.03149505  0.87438443  0.06964298  0.05525701
   0.12628503 -0.0176815  -0.05230868  0.31560617  0.0255975 ]]
0.65045807652
California State University   - Bakersfield
[[ 0.00550623 -0.17234283  0.04271527  0.9709767  -0.01150665  0.14183815
   0.13078796  0.08246322 -0.14741859  0.1074978   0.02986637]]
0.920257063232
California State University   - San Marcos
[[-0.0577041  -0.1144288  -0.07968521  1.13945373  0.04904969  0.05163776
  -0.03635278 -0.04955248  0.10878801 -0.1356978  -0.05932802]]
0.405328428934


In [125]:
# xgboost
params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.05,
          "max_depth": 5,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301,
          "nthread" : 4
          }
num_boost_round = 100


for d in data:
    print("Train a XGBoost model")
    X_train, X_valid, y_train, y_valid = train_test_split(d.iloc[:,1:], d.iloc[:,0], test_size=0.2, random_state=111)

    # y_train = np.log1p(X_train.SalePrice)
    # y_valid = np.log1p(X_valid.SalePrice)
    # dtrain = xgb.DMatrix(X_train[features], y_train)
    # dvalid = xgb.DMatrix(X_valid[features], y_valid)

    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    # gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
    #   early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=100, verbose_eval=False)

    print("Validating")
    yhat = gbm.predict(xgb.DMatrix(X_valid))
    print np.sqrt(metrics.mean_squared_error(y_valid, yhat))
    score = gbm.get_fscore()
    print sorted(score.items(), key=lambda t:t[1], reverse=True)

Train a XGBoost model
Validating
132494.403235
[('size', 325), ('yearbuilt', 270), ('distance', 202), ('numbed', 121), ('distance2', 93), ('numbath', 67), ('style', 43), ('renovation', 29), ('garage', 27), ('pool', 25), ('gatedCommunity', 20)]
Train a XGBoost model
Validating
975024.120858
[('yearbuilt', 272), ('size', 260), ('distance', 170), ('numbath', 151), ('numbed', 102), ('distance2', 83), ('garage', 34), ('gatedCommunity', 30), ('style', 23), ('pool', 23), ('renovation', 10)]
Train a XGBoost model
Validating
158295.84171
[('size', 303), ('distance', 284), ('yearbuilt', 197), ('numbed', 113), ('numbath', 83), ('distance2', 70), ('style', 36), ('pool', 33), ('gatedCommunity', 25), ('renovation', 21), ('garage', 12)]
Train a XGBoost model
Validating
145626.841611
[('size', 312), ('yearbuilt', 238), ('distance', 215), ('numbath', 76), ('numbed', 75), ('distance2', 54), ('renovation', 35), ('garage', 29), ('style', 28), ('pool', 26), ('gatedCommunity', 13)]
Train a XGBoost model
Val