In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the training table from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='train_data.csv')

In [3]:
# Store exp(logerror) alongside the raw target as a new 'label1' column.
logerror_values = df['logerror'].values
df['label1'] = np.exp(logerror_values)

In [4]:
# Load a saved array from 'label.npy'; the plot below labels it 'predict'.
# NOTE(review): the contents of label.npy are not visible here; confirm what it holds.
y = np.load(file='label.npy')

In [24]:
# Overlay kernel-density estimates of df.logerror ('true') and the array y
# ('predict') on the same axes, zoomed to the [-0.4, 0.4] range.
ax = df['logerror'].plot(kind='kde', label='true')
pd.Series(y).plot(kind='kde', label='predict', ax=ax)
ax.legend(loc='best')
ax.set_xlim(-0.4, 0.4)
ax.grid(True)

plt.show()

In [5]:
# Bucket logerror into six ordinal classes (0..5) using fixed bin edges;
# the outer edges of -100 / 100 act as catch-all bounds. The categorical
# result is cast to a plain integer dtype ('i') for use as a class label.
df['label'] = pd.cut(
    df['logerror'],
    bins=[-100, -0.05, -0.004, 0.027, 0.05, 0.1, 100],
    labels=list(range(6)),
).astype('i')

In [29]:
# Display the column labels of df as the cell output.
df.columns

Index(['ParcelId', 'logerror', 'airconditioningtypeid',
       'architecturalstyletypeid', 'basementsqft', 'bathroomcnt', 'bedroomcnt',
       'buildingclasstypeid', 'buildingqualitytypeid', 'calculatedbathnbr',
       'decktypeid', 'finishedfloor1squarefeet',
       'calculatedfinishedsquarefeet', 'finishedsquarefeet12',
       'finishedsquarefeet13', 'finishedsquarefeet15', 'finishedsquarefeet50',
       'finishedsquarefeet6', 'fips', 'fireplacecnt', 'fullbathcnt',
       'garagecarcnt', 'garagetotalsqft', 'hashottuborspa',
       'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet',
       'poolcnt', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
       'propertycountylandusecode', 'propertylandusetypeid',
       'propertyzoningdesc', 'rawcensustractandblock', 'regionidcity',
       'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt',
       'storytypeid', 'threequarterbathnbr', 'typeconstructiontypeid',
       'unitcnt', 'yardbuildingsq

In [6]:
# Feature matrix: everything in df except the row identifier, the raw
# target, and the two columns derived from the target.
properties = df.drop(
    labels=['ParcelId', 'logerror', 'label1', 'label'],
    axis='columns',
)

In [7]:
# Make every feature column numeric and mark missing values with -1, the
# sentinel that the xgb.DMatrix(..., missing=-1) calls later in this notebook
# expect.
#
# Previously NaN was replaced by -1 *before* label-encoding object columns, so
# the filler became just another category and received an ordinary code; the
# encoded column then no longer contained -1 for missing entries. Encoding
# with pd.factorize avoids this: it assigns -1 to NaN directly, and sort=True
# keeps the codes in sorted-category order.
for c in properties.columns:
    if properties[c].dtype == 'object':
        codes, _ = pd.factorize(properties[c], sort=True)
        properties[c] = codes
    else:
        properties[c] = properties[c].fillna(-1)

In [8]:
# Rebind y to the integer class labels stored in df['label'] (as a NumPy array).
y = df['label'].values

In [9]:
# Shuffled train/test split over row positions.
# A dedicated, seeded RandomState makes the split reproducible across
# kernel restarts (the previous unseeded np.random.shuffle gave a different
# split on every run) and leaves NumPy's global RNG state untouched.
SPLIT_SEED = 0
TRAIN_SIZE = 80000  # rows used for training; the remainder is held out

index = np.random.RandomState(SPLIT_SEED).permutation(y.shape[0])
train = index[:TRAIN_SIZE]
test = index[TRAIN_SIZE:]

In [11]:
# Draw the label density of the train rows and then the test rows on the
# same axes, to compare the class distribution across the split.
for subset in (train, test):
    pd.Series(y[subset]).plot(kind='kde')
plt.show()

In [10]:
# Materialise the feature frames and label arrays for each side of the split;
# `train` / `test` hold positional row indices, hence .iloc for the frame.
x_train, x_test = (properties.iloc[rows] for rows in (train, test))
y_train, y_test = y[train], y[test]

In [12]:
# Wrap both splits as XGBoost DMatrix objects; feature values equal to -1
# are declared as missing.
dtrain = xgb.DMatrix(data=x_train, label=y_train, missing=-1)
dtest = xgb.DMatrix(data=x_test, label=y_test, missing=-1)

In [13]:
# Booster configuration for the 6-class problem: per-class probabilities as
# output, multiclass error rate ('merror') as the evaluation metric.
xgb_params = {
    'eta': 0.01,
    'max_depth': 6,
    'subsample': 1,
    'objective': 'multi:softprob',
    'eval_metric': 'merror',
    'base_score': 0.15,
    'silent': 1,
    'num_class':6
}

# 5-fold cross-validation for up to 200 rounds, logging every 10th round and
# stopping once the test metric has not improved for 5 consecutive rounds.
# NOTE(review): the recorded output shows only round 0, which suggests early
# stopping fired before round 10; confirm the resulting round count is intended.
cv_result = xgb.cv(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=200,
    nfold=5,
    early_stopping_rounds=5,
    verbose_eval=10,
    show_stdv=False,
)

[0]	train-merror:0.684384	test-merror:0.706162


In [14]:
# Train the final booster on the full training DMatrix, using as many rounds
# as the cross-validation result has rows.
num_boost_rounds = len(cv_result)
train_params = dict(xgb_params, silent=1)  # copy of the CV params, silent kept on
model = xgb.train(
    params=train_params,
    dtrain=dtrain,
    num_boost_round=num_boost_rounds,
)

In [16]:
# Plot the trained booster's feature importances.
xgb.plot_importance(booster=model)
plt.show()