# Real Estate Price Prediction

This is one solution to the course project ([Kaggle competition page](https://www.kaggle.com/c/realestatepriceprediction/overview)).

In this competition, the task is to predict the price of flats in test.csv. There are two datasets: train.csv (contains all features and prices of flats) and test.csv (only features)

Metric: $R^2$ (coefficient of determination)

Type of the task: Regression

***

The idea: train a model on the raw dataset (a LightGBM regressor, which preprocesses categorical features internally, scored with 5-fold cross-validation), then remove the rows whose cross-validated predicted price differs strongly from the real price.
Then train a new model on all remaining training rows and make predictions on the test set.

As practice has shown, the result on the leaderboard is quite good, given that the notebook contains only a few cells :)

***

In [1]:
import datetime # for %%time

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import xgboost as xgb

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict

from lightgbm import LGBMRegressor, log_evaluation

%matplotlib inline

In [2]:
# Load the competition files: train.csv carries the Price target, test.csv does not.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Report the row count of each frame.
print('Lines in train:', len(train))
print('Lines in test', len(test))

Lines in train: 10000
Lines in test 5000


In [3]:
# Peek at the first rows of the training data (features plus the Price target).
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


In [4]:
# Same preview for the test data, which has the same columns except Price.
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,B,B,11,2748,1,,0,0,B
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,,0,2,B
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,B,B,30,7538,87,4702.0,5,5,B
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,B,B,23,4583,3,,3,3,B
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,B,B,2,629,1,,0,0,A


In [5]:
# Dtypes and non-null counts for train: shows which columns are non-numeric or have gaps.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
Id               10000 non-null int64
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       7887 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Ecology_2        10000 non-null object
Ecology_3        10000 non-null object
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     5202 non-null float64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Shops_2          10000 non-null object
Price            10000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.5+ MB


In [6]:
# Dtypes and non-null counts for test, for comparison with train.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(9), object(3)
memory usage: 742.3+ KB


In [7]:
def cat_featutes(data):
  """Return the names of the columns to treat as categorical features.

  The result contains every object-dtype column of ``data`` (in column
  order) followed by 'DistrictId', which is a numeric identifier rather than
  a quantity.

  Args:
    data: pandas DataFrame to inspect; it is not modified.

  Returns:
    list of column names.
  """
  # `np.object` was only an alias of the builtin `object`; it was deprecated in
  # NumPy 1.20 and removed in 1.24, so the builtin is used directly.
  cat_features = list(data.select_dtypes(include=[object]).columns)
  # Avoid listing 'DistrictId' twice when it is already an object column.
  if 'DistrictId' not in cat_features:
    cat_features = cat_features + ['DistrictId']
  return cat_features

def cat_featutes_maker(data, cat_features):
  """Cast the given columns of ``data`` to the pandas 'category' dtype.

  Args:
    data: pandas DataFrame; the selected columns are reassigned on this
      object, so the caller's frame is modified.
    cat_features: column labels to convert.

  Returns:
    The same ``data`` object, with the selected columns as 'category'.
  """
  as_category = data[cat_features].astype('category')
  data[cat_features] = as_category
  return data

In [8]:
# Columns to treat as categorical, derived from train: its object-dtype columns plus 'DistrictId'.
cat_features = cat_featutes(train)

In [9]:
# Cast the same column list in both frames so train and test agree on which
# columns are categorical.
train = cat_featutes_maker(train, cat_features)
test = cat_featutes_maker(test, cat_features)

In [10]:
# Sanity check after the cast: the displayed values should look the same as before.
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


In [11]:
# LightGBM hyper-parameters for the regressor.
# Two entries of the earlier version are intentionally absent:
#   * 'silent'       - deprecated and later removed from the LGBMRegressor
#                      constructor; 'verbose': -1 below already silences logging.
#   * 'is_unbalance' - documented by LightGBM as a classification-only option,
#                      so it has no role in this regression task.
params = {
    'boosting_type': 'gbdt',
    'n_estimators': 1900,
    'learning_rate': 0.005134,
    'num_leaves': 54,
    'max_depth': 10,
    'subsample_for_bin': 240000,
    'reg_alpha': 0.436193,
    'reg_lambda': 0.479169,
    'colsample_bytree': 0.508716,
    'min_split_gain': 0.024766,
    # NOTE(review): LightGBM only subsamples rows when subsample_freq > 0
    # (its default is 0), so this value currently has no effect. It is left
    # as-is to keep the results unchanged - confirm the intent.
    'subsample': 0.7,
    'random_state': 27,
    'verbose': -1
}

In [12]:
# Configured estimator; the same object is passed to both cross-validation
# runs and to the final fit.
model = LGBMRegressor(**params)

In [13]:
# Baseline: 5-fold cross-validated R^2 on the untouched training data.
cv = cross_val_score(
    model,
    train.drop(columns=["Price"]),
    train["Price"],
    scoring="r2",
    cv=5,
)

# Mean and spread of the per-fold scores.
print(f"CV-results: {round(np.mean(cv), 4)} +/- {round(np.std(cv), 3)}")

CV-results: 0.7516 +/- 0.017


In [14]:
# Cross-validated price prediction for every training row (5 folds); these are
# compared with the real prices further down to find suspicious rows.
# cross_val_predict takes no `scoring` argument, so none is passed.
pred = cross_val_predict(
    model,
    train.drop(columns=["Price"]),
    train["Price"],
    cv=5,
)

In [15]:
# Quick look at the array of predicted prices.
pred

array([202631.79054205, 273487.45461925, 222316.349696  , ...,
       138666.80849655, 164142.19986771, 178153.37439906])

In [16]:
# Store the cross-validated predictions next to the real prices
# (pred holds one value per row of train).
train['price_pred'] = pred

In [17]:
# Check the new price_pred column next to Price.
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price,price_pred
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,...,B,33,7976,5,,0,11,B,184966.93073,202631.790542
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,...,B,46,10309,1,240.0,1,16,B,300009.450063,273487.454619
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,...,B,34,7759,0,229.0,1,3,B,220925.908524,222316.349696
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,...,B,23,5735,3,1084.0,0,5,B,175616.227217,172061.990665
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,...,B,35,5776,1,2078.0,2,4,B,150226.531644,162261.633275


In [18]:
# Despite its name, pr_diff is a ratio: real price divided by predicted price.
# Values near 1 mean the model agrees with the label.
train['pr_diff'] = train['Price'].div(train['price_pred'])

In [19]:
# Check the new pr_diff column.
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price,price_pred,pr_diff
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,...,33,7976,5,,0,11,B,184966.93073,202631.790542,0.912823
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,...,46,10309,1,240.0,1,16,B,300009.450063,273487.454619,1.096977
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,...,34,7759,0,229.0,1,3,B,220925.908524,222316.349696,0.993746
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,...,23,5735,3,1084.0,0,5,B,175616.227217,172061.990665,1.020657
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,...,35,5776,1,2078.0,2,4,B,150226.531644,162261.633275,0.925829


***

The thresholds on the real-to-predicted price ratio (rows with a ratio below 0.65 or above 1.35 are deleted) were selected empirically, based on the scores obtained on the Kaggle leaderboard.

***

In [20]:
# Rows whose real price is more than 35% above the predicted price.
train.loc[train['pr_diff'].gt(1.35)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price,price_pred,pr_diff
20,8862,0,3.0,103.605292,47.799426,10.0,12,17.0,2002,0.319809,...,25,4756,16,2857.0,5,8,B,521919.057400,385953.202772,1.352286
39,9371,23,2.0,60.503248,,0.0,16,0.0,1977,0.034656,...,0,168,0,,0,0,B,229778.057902,130513.179488,1.760574
92,1671,120,3.0,58.570371,37.434126,5.0,4,5.0,1961,0.341072,...,27,5664,48,2300.0,3,11,B,305732.586295,218999.124857,1.396045
150,8141,46,2.0,69.804612,33.829660,12.0,13,16.0,2014,0.188784,...,31,6137,4,,0,1,B,465067.065883,252224.792410,1.843859
212,1748,88,2.0,5.497061,67.628717,1.0,24,22.0,1977,0.127376,...,43,8429,3,,3,9,B,412511.088764,248950.746180,1.656999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9613,7657,38,1.0,4.502784,4.618547,1.0,28,1.0,1977,0.060753,...,15,2787,2,520.0,0,7,B,483283.488083,200951.555418,2.404975
9650,6323,162,3.0,93.815902,61.816690,9.0,6,6.0,1952,0.174739,...,17,3379,9,100.0,0,3,B,624008.535763,350113.181458,1.782305
9712,11118,144,3.0,86.334911,51.875895,12.0,18,17.0,2004,0.306977,...,27,5148,14,1575.0,5,7,B,597921.009105,430101.176012,1.390187
9910,16568,27,4.0,200.334539,201.627361,25.0,1,2.0,2013,0.041116,...,53,14892,4,,1,4,B,528560.506016,315119.567896,1.677333


In [21]:
# Rows whose real price is below 65% of the predicted price.
train.loc[train['pr_diff'].lt(0.65)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price,price_pred,pr_diff
11,12427,31,3.0,68.808859,45.736906,7.0,1,8.0,1959,0.000000,...,23,3684,2,,0,4,B,165534.541425,351724.729076,0.470637
15,2823,85,3.0,73.612225,53.892089,6.0,4,9.0,1979,0.037178,...,52,11217,1,2300.0,1,7,B,89084.327876,318009.867021,0.280131
38,8224,177,3.0,77.873936,48.991133,12.0,9,25.0,2012,0.000000,...,36,6714,2,2672.0,0,2,B,191550.121357,358045.820703,0.534988
62,7619,47,2.0,54.787324,33.512143,8.0,1,12.0,1978,0.000070,...,46,10309,1,240.0,1,16,B,100471.952613,208835.226961,0.481106
82,2952,169,2.0,58.433607,31.086989,8.0,3,14.0,1970,0.210473,...,11,2398,2,1994.0,3,0,B,116661.595490,249820.080790,0.466982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9935,6608,13,2.0,59.384809,36.738844,10.0,2,16.0,1982,0.090799,...,74,19083,2,,5,15,B,85665.844941,249556.463458,0.343272
9960,9307,1,2.0,57.376642,,0.0,8,25.0,2016,0.007122,...,1,264,0,,0,1,B,104450.496477,186951.229983,0.558705
9962,4660,158,2.0,63.131655,41.867913,8.0,9,8.0,1939,0.042032,...,37,6856,84,1940.0,2,5,B,87268.098968,428875.325452,0.203481
9968,6653,17,2.0,41.157437,27.489713,6.0,3,19.0,1967,0.000000,...,18,3374,5,620.0,1,2,B,87265.399045,220550.086997,0.395672


In [22]:
# Lower-bound filter: keep only rows whose price ratio is strictly above 0.65.
train = train[train['pr_diff'].gt(0.65)]

In [23]:
# Row count after the lower-bound filter.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9584 entries, 0 to 9999
Data columns (total 22 columns):
Id               9584 non-null int64
DistrictId       9584 non-null category
Rooms            9584 non-null float64
Square           9584 non-null float64
LifeSquare       7497 non-null float64
KitchenSquare    9584 non-null float64
Floor            9584 non-null int64
HouseFloor       9584 non-null float64
HouseYear        9584 non-null int64
Ecology_1        9584 non-null float64
Ecology_2        9584 non-null category
Ecology_3        9584 non-null category
Social_1         9584 non-null int64
Social_2         9584 non-null int64
Social_3         9584 non-null int64
Healthcare_1     4948 non-null float64
Helthcare_2      9584 non-null int64
Shops_1          9584 non-null int64
Shops_2          9584 non-null category
Price            9584 non-null float64
price_pred       9584 non-null float64
pr_diff          9584 non-null float64
dtypes: category(4), float64(10), int64(8)
memo

In [24]:
# Upper-bound filter: keep only rows whose price ratio is strictly below 1.35.
train = train[train['pr_diff'].lt(1.35)]

In [25]:
# Row count after both filters.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9383 entries, 0 to 9999
Data columns (total 22 columns):
Id               9383 non-null int64
DistrictId       9383 non-null category
Rooms            9383 non-null float64
Square           9383 non-null float64
LifeSquare       7328 non-null float64
KitchenSquare    9383 non-null float64
Floor            9383 non-null int64
HouseFloor       9383 non-null float64
HouseYear        9383 non-null int64
Ecology_1        9383 non-null float64
Ecology_2        9383 non-null category
Ecology_3        9383 non-null category
Social_1         9383 non-null int64
Social_2         9383 non-null int64
Social_3         9383 non-null int64
Healthcare_1     4835 non-null float64
Helthcare_2      9383 non-null int64
Shops_1          9383 non-null int64
Shops_2          9383 non-null category
Price            9383 non-null float64
price_pred       9383 non-null float64
pr_diff          9383 non-null float64
dtypes: category(4), float64(10), int64(8)
memo

In [26]:
# Preview the filtered training data (helper columns still present).
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price,price_pred,pr_diff
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,...,33,7976,5,,0,11,B,184966.93073,202631.790542,0.912823
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,...,46,10309,1,240.0,1,16,B,300009.450063,273487.454619,1.096977
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,...,34,7759,0,229.0,1,3,B,220925.908524,222316.349696,0.993746
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,...,23,5735,3,1084.0,0,5,B,175616.227217,172061.990665,1.020657
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,...,35,5776,1,2078.0,2,4,B,150226.531644,162261.633275,0.925829


In [27]:
# The helper columns were only needed for filtering; remove them so they are
# not used as features.
train = train.drop(columns=["price_pred", "pr_diff"])

In [28]:
# Confirm the helper columns are gone and only the original columns remain.
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


In [29]:
# Left disabled: casting Id to a categorical feature in train.
#train["Id"] = train["Id"].astype('category')

In [30]:
# Left disabled: casting Id to a categorical feature in test.
#test["Id"] = test["Id"].astype('category')

In [31]:
# 5-fold cross-validated R^2 on the filtered training data.
# NOTE(review): rows were removed using the Price target itself, and the
# held-out folds here contain only rows that survived that filter. This score
# is therefore not comparable with the baseline CV score and likely overstates
# performance on unseen data.
cv = cross_val_score(
    model,
    train.drop(columns=["Price"]),
    train["Price"],
    scoring="r2",
    cv=5,
)

# Mean and spread of the per-fold scores.
print(f"CV-results: {round(np.mean(cv), 4)} +/- {round(np.std(cv), 3)}")

CV-results: 0.9072 +/- 0.004


In [32]:
# Split the filtered frame into features and target for the final fit.
# Only Price is removed, so the Id column stays among the features.
y_train = train["Price"]
x_train = train.drop(columns=["Price"])

In [33]:
# Final fit on all filtered training rows.
# The eval_set is the training data itself, so the logged l2 values track the
# training loss only; they are not a validation signal.
# Progress is logged every 200 iterations through the log_evaluation callback,
# because the `verbose` argument of fit() was removed in LightGBM 4.0.
model.fit(
    X=x_train,
    y=y_train,
    eval_set=[(x_train, y_train)],
    callbacks=[log_evaluation(period=200)]
)

[200]	training's l2: 2.09621e+09
[400]	training's l2: 9.65328e+08
[600]	training's l2: 6.54711e+08
[800]	training's l2: 5.25832e+08
[1000]	training's l2: 4.57509e+08
[1200]	training's l2: 4.1294e+08
[1400]	training's l2: 3.79184e+08
[1600]	training's l2: 3.52351e+08
[1800]	training's l2: 3.30306e+08


LGBMRegressor(colsample_bytree=0.508716, is_unbalance=False,
              learning_rate=0.005134, max_depth=10, min_split_gain=0.024766,
              n_estimators=1900, num_leaves=54, random_state=27,
              reg_alpha=0.436193, reg_lambda=0.479169, silent=-1, subsample=0.7,
              subsample_for_bin=240000, verbose=-1)

In [34]:
# Start the submission frame from a copy of the test Ids.
preds_final = test[['Id']].copy()

In [35]:
# Predict prices for the test rows; test has the same feature columns the model
# was fitted on.
y_pred_final = model.predict(test)
preds_final = preds_final.assign(Price=y_pred_final)
# NOTE(review): the file name says 'xgb' although the predictions come from the
# LightGBM model - kept unchanged so existing submission steps still find it.
preds_final.to_csv('predictions_xgb.csv', index=False)

In [36]:
# Preview the submission frame (Id, Price).
preds_final.head()

Unnamed: 0,Id,Price
0,725,161445.153356
1,15856,224193.801985
2,5480,236942.593344
3,15664,331799.532588
4,14275,140881.995192
