In [45]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [2]:
train = pd.read_csv('train_df_fe_all.csv')

# Some feature Visualizations

## Storey_range

In [None]:
a4_dims = (8, 8)
fig, ax = pyplot.subplots(figsize=a4_dims)
train.groupby('storey_range').size().plot(kind='pie', autopct='%.2f')
ax.set_title('Pie Chart of storey_range')

In [None]:
a4_dims = (8, 8)
fig, ax = pyplot.subplots(figsize=a4_dims)
ax = sns.boxplot(x="storey_range", y="resale_price", data=train)
ax.set_title('resale_price by storey_range')

## Floor area (size)

In [None]:
train1 = train.sample(frac=0.01, replace=False, random_state=1)

a4_dims = (8, 8)
fig, ax = pyplot.subplots(figsize=a4_dims)
plt.scatter(train1.floor_area_sqm,train1.resale_price)
ax.set_title('resale_price by size')
plt.xlabel('size in sqm')
plt.ylabel('resale_price')

## Flat age

In [None]:
a4_dims = (8, 8)
fig, ax = pyplot.subplots(figsize=a4_dims)
plt.scatter(train1.age_at_sales,train1.resale_price)
ax.set_title('resale_price by age')
plt.xlabel('age of flat at sales')
plt.ylabel('resale_price')

## Flat Type

In [None]:
a4_dims = (9, 9)
fig, ax = pyplot.subplots(figsize=a4_dims)
train.groupby('flat_type').size().plot(kind='pie', autopct='%.2f')
ax.set_title('Pie Chart of flat_type')

In [None]:
a4_dims = (8, 8)
fig, ax = pyplot.subplots(figsize=a4_dims)
ax = sns.boxplot(x="flat_type", y="resale_price", data=train)
ax.set_title('resale_price by flat_type')

## Flat model

In [None]:
a4_dims = (12, 9)
fig, ax = pyplot.subplots(figsize=a4_dims)
ax = sns.boxplot(x="flat_model", y="resale_price", data=train)
ax.set_title('resale_price by flat_model')
ax.set_xticklabels(ax.get_xticklabels(),rotation = 50)

## Block

In [None]:
a4_dims = (8, 8)
fig, ax = pyplot.subplots(figsize=a4_dims)
train.groupby('block').size().plot(kind='pie', autopct='%.2f')
ax.set_title('Pie Chart of blocks with number "4"')

## Region

In [None]:
a4_dims = (8, 8)
fig, ax = pyplot.subplots(figsize=a4_dims)
ax = sns.boxplot(x="region", y="resale_price", data=train)
ax.set_title('resale_price by region')

## Planning area

In [None]:
a4_dims = (12, 9)
fig, ax = pyplot.subplots(figsize=a4_dims)
ax = sns.boxplot(x="planning_area", y="resale_price", data=train)
ax.set_title('resale_price by planning_area')
ax.set_xticklabels(ax.get_xticklabels(),rotation = 50)

In [None]:
cate_cols = [
        "flat_type",
        "street_name",
        "storey_range",
        "flat_model",
        "subzone",
        "planning_area",
        "region",
        "resale_quarter",
        "resale_month"
    ]
num_cols= [
        "block",
        "flat_age",
        "resale_year",
        "floor_area_sqm",
        "latitude",
        "longitude"
    ]
aux_cols= [
        "commercial_CBD",
        "commercial_type_CR",
        "commercial_type_IEBP",
        "commercial_type_IEPB",
        "commercial_type_BN",
        "commercial_type_IHL",
        "hawker_ECLFV",
        "hawker_NFC",
        "hawker_CRB89",
        "hawker_OARB51OARFCSM",
        "hawker_CRB",
        "hawker_HVMFC",
        "hawker_BFC",
        "hawker_CCFC",
        "hawker_TBM",
        "hawker_BPHC",
        "hawker_GMFC",
        "hawker_YPHC",
        "hawker_OTH",
        "hawker_KAHC",
        "hawker__",
        "hawker_highrating_",
        "hawker_established_",
        "malls_GWC",
        "malls_IO",
        "malls_TSMBS",
        "malls_NAC",
        "malls_PS",
        "malls_SC",
        "malls_OTH",
        "malls_CA",
        "malls_JCA",
        "malls_VivoCity",
        "malls_JP",
        "malls__",
        "malls_ratingsbin_4.1",
        "malls_ratingsbin_4.3",
        "malls_ratingsbin_>4.0",
        "malls_ratingsbin_4.2",
        "malls_ratingsbin_4.0",
        "malls_ratingsbin_>=4.5",
        "malls_ratingsbin_4.4",
        "malls_established_",
        "station_type_mrt",
        "station_type_other",
        "station_interchange_",
        "station_EW_",
        "station_NS_",
        "station_NE_",
        "station_CC_",
        "station_DT_",
        "prisch_top50_",
        "prisch_top50_<=1km",
        "prisch_top50_1to2km",
        "prisch_top50_2to4km",
        "demographics_elderly",
        "demographics_kids",
        "demographics_middle",
        "demographics_older",
        "demographics_youngads",
        "demographics_youth",
        "annual_TLF",
        "annual_TEP",
        "annual_TUP",
        "annual_TURC",
        "annual_RURC",
        "annual_TP",
        "annual_AMHIFWPHMECC",
        "quarterly_GICD",
        "quarterly_ULCOOE",
        "quarterly_HPI",
        "monthly_PLR",
        "monthly_SDR",
        "monthly_CPI"
    ]

In [22]:
cate = train[cate_cols]
num = train[num_cols]
aux = train[aux_cols]

combined = [cate,num,aux]
train_final_all = pd.concat(combined, axis = 1)
train_final_all.shape

(431732, 89)

In [37]:
# one hot encoding
train_dummies = pd.get_dummies(train_final_all, columns = cate_cols)

In [42]:
# y columns
train_y_all = train['resale_price']

In [46]:
# train-validation split
X_train, X_val, y_train, y_val = train_test_split(train_dummies, train_y_all, test_size=0.25, random_state=0)

# XGBoost training

In [55]:
import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
X_train.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_train.columns.values]
X_val.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_val.columns.values]

In [95]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.5, learning_rate = 0.05,
                max_depth = 16, alpha = 0.1, n_estimators = 1000)

In [None]:
%%time

xg_reg.fit(X_train,y_train)

preds = xg_reg.predict(X_val)



In [None]:
rmse = np.sqrt(mean_squared_error(y_val, preds))
print("RMSE: %f" % (rmse))

# testing

In [63]:
test1=pd.read_csv('test_df_fe_all.csv')

In [65]:
cate_test = test1[cate_cols]
num_test = test1[num_cols]
aux_test = test1[aux_cols]

combined_test = [cate_test,num_test,aux_test]
test_final_all = pd.concat(combined_test, axis = 1)
test_final_all.shape

(107934, 89)

In [66]:
# one hot encoding
test_dummies = pd.get_dummies(test_final_all, columns = cate_cols)

In [None]:
training_columns_names = X_train.columns.tolist()
test_columns_names = test_dummies.columns.tolist()

# features not in the test columns
not_in = [x for x in training_columns_names if x not in test_columns_names]
not_in

In [77]:
dict_test = {}
for i in range(11):
    value = not_in[i]
    dict_test[i] = value

zero_test = np.zeros((test_dummies.shape[0],11))
zero_test_pd = pd.DataFrame(zero_test)
zero_test_pd.rename(columns = dict_test, inplace = True)
test_final_all_all = pd.concat([test_dummies,zero_test_pd],axis=1)
test_final_all_all.shape
test_dummies.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in test_dummies.columns.values]

(107934, 1428)

In [80]:
te = test_final_all_all[X_train.columns.tolist()]
preds_test = xg_reg.predict(te)
pr= pd.DataFrame(preds_test)
pr.to_csv("pr.csv",index=False)