In [74]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from geopy.distance import geodesic
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
E_Net = ElasticNet(alpha=0.1, l1_ratio=.5, random_state=42, tol=0.1)
LG = LinearRegression()
RF_Regressor = RandomForestRegressor()
GBR = GradientBoostingRegressor()
tree_reg = DecisionTreeRegressor(random_state=42)

In [3]:
data = pd.read_pickle('Regression_data.pkl')
data = data.loc[((data['Number of rooms']>0) & 
                 (data['Number of rooms']<=9) &
                 (data['Transaction cost']<=2e6) &
                 (data['Transaction cost']>=6e4))]
data.head(5)

Unnamed: 0,Transaction date,Transaction cost,Postcode,Type of property,Built surface,Number of rooms,Ground surface,coord
0,8.0,251500.0,1310.0,1.0,147.0,5.0,1501.0,"(46.247576, 5.119191000000001)"
2,8.0,174500.0,1000.0,0.0,80.0,2.0,0.0,"(46.204017, 5.218208)"
4,1.0,157500.0,1440.0,1.0,103.0,4.0,1569.0,"(46.236276000000004, 5.2627559999999995)"
7,6.0,90000.0,1000.0,0.0,61.0,2.0,0.0,"(46.200396999999995, 5.219821)"
11,13.0,95000.0,1000.0,0.0,58.0,3.0,0.0,"(46.204702000000005, 5.210812000000001)"


In [11]:
def pricePrediction(obj, model):
    data_selected = data_train[(data_train['Postcode'] > (obj['Postcode'] - 2)) 
                & (data_train['Postcode'] < (obj['Postcode'] + 2)) 
                & (data_train['Type of property'] == obj['Type of property'])]
    coord = obj.coord

    def calc_distance(x):
        site1_coords = coord
        site2_coords = x.coord
        return geodesic(site1_coords, site2_coords).km

    data_selected['distance'] = data_selected.apply(calc_distance, axis=1)
    data_selected = data_selected.sort_values(by="distance").head(150)

    X = data_selected[['Transaction date','Built surface','Ground surface','Number of rooms']]
    y = data_selected['Transaction cost']

    model.fit(X, y)
    input = obj[['Transaction date','Built surface','Ground surface','Number of rooms']]
    input = np.array(input).reshape(1, -1)
    price_prediction = model.predict(input)

    return price_prediction[0]

In [61]:
data_test = data.sample(frac = 0.0003)
  
data_train = data.drop(data_test.index)

In [62]:
data_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1771 entries, 1440669 to 2013694
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Transaction date  1771 non-null   float64
 1   Transaction cost  1771 non-null   float64
 2   Postcode          1771 non-null   float64
 3   Type of property  1771 non-null   float64
 4   Built surface     1771 non-null   float64
 5   Number of rooms   1771 non-null   float64
 6   Ground surface    1771 non-null   float64
 7   coord             1771 non-null   object 
dtypes: float64(7), object(1)
memory usage: 124.5+ KB


# Test final 

## Random forest

In [63]:
data_test['Transaction cost prediction'] = data_test.apply(lambda x : pricePrediction(x, RF_Regressor), axis=1)
data_test['Prediction error'] = data_test.apply(lambda x:
                            (np.abs(x['Transaction cost'] - x['Transaction cost prediction'])),axis=1)
data_test['Relative error'] = data_test.apply(lambda x:
                            (x['Prediction error'] / x['Transaction cost']),axis=1)

In [64]:
data_test.describe()

Unnamed: 0,Transaction date,Transaction cost,Postcode,Type of property,Built surface,Number of rooms,Ground surface,Transaction cost prediction,Prediction error,Relative error
count,1771.0,1771.0,1771.0,1771.0,1771.0,1771.0,1771.0,1771.0,1771.0,1771.0
mean,1356.889328,227402.7,52990.111801,0.557877,80.226426,3.511011,399.551666,225229.4,44004.09,0.238181
std,687.306291,188185.4,27052.168122,0.496779,36.950939,1.46147,577.70311,166841.4,68224.07,0.37765
min,1.0,60000.0,1170.0,0.0,10.0,1.0,0.0,61667.0,0.0,0.0
25%,799.5,120000.0,32430.0,0.0,53.0,2.0,0.0,127625.7,9049.8,0.053339
50%,1386.0,170000.0,54220.0,1.0,77.0,4.0,203.0,173353.8,24000.8,0.138685
75%,1934.0,263185.0,76620.0,1.0,103.0,4.0,554.0,261177.1,50337.9,0.286354
max,2555.0,1540000.0,97460.0,1.0,190.0,8.0,4236.0,1448500.0,1022790.0,5.581159


## Gradient boosting regression

In [65]:
data_test['Transaction cost prediction'] = data_test.apply(lambda x : pricePrediction(x, GBR), axis=1)
data_test['Prediction error'] = data_test.apply(lambda x:
                            (np.abs(x['Transaction cost'] - x['Transaction cost prediction'])),axis=1)
data_test['Relative error'] = data_test.apply(lambda x:
                            (x['Prediction error'] / x['Transaction cost']),axis=1)

In [66]:
data_test.describe()

Unnamed: 0,Transaction date,Transaction cost,Postcode,Type of property,Built surface,Number of rooms,Ground surface,Transaction cost prediction,Prediction error,Relative error
count,1771.0,1771.0,1771.0,1771.0,1771.0,1771.0,1771.0,1771.0,1771.0,1771.0
mean,1356.889328,227402.7,52990.111801,0.557877,80.226426,3.511011,399.551666,226446.9,43603.025766,0.238418
std,687.306291,188185.4,27052.168122,0.496779,36.950939,1.46147,577.70311,173258.8,67098.613599,0.403613
min,1.0,60000.0,1170.0,0.0,10.0,1.0,0.0,54815.24,0.0,0.0
25%,799.5,120000.0,32430.0,0.0,53.0,2.0,0.0,125603.2,9209.851318,0.051773
50%,1386.0,170000.0,54220.0,1.0,77.0,4.0,203.0,172784.6,22868.252926,0.131954
75%,1934.0,263185.0,76620.0,1.0,103.0,4.0,554.0,260062.7,51050.245884,0.286189
max,2555.0,1540000.0,97460.0,1.0,190.0,8.0,4236.0,1485598.0,821677.140387,6.568242


## Decision tree regression

In [35]:
data_test['Transaction cost prediction'] = data_test.apply(lambda x : pricePrediction(x, tree_reg), axis=1)
data_test['Prediction error'] = data_test.apply(lambda x:
                            (np.abs(x['Transaction cost'] - x['Transaction cost prediction'])),axis=1)
data_test['Relative error'] = data_test.apply(lambda x:
                            (x['Prediction error'] / x['Transaction cost']),axis=1)

In [36]:
data_test.describe()

Unnamed: 0,Transaction date,Transaction cost,Postcode,Type of property,Built surface,Number of rooms,Ground surface,Transaction cost prediction,Prediction error,Relative error
count,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0
mean,1319.377966,229298.4,53522.245763,0.559322,81.825424,3.547458,389.09322,223065.2,56315.49,0.269108
std,698.777665,202425.8,27260.069294,0.49689,37.296386,1.486882,558.221399,181575.8,99372.82,0.408503
min,15.0,60000.0,1130.0,0.0,14.0,1.0,0.0,60000.0,0.0,0.0
25%,744.25,116050.0,31417.5,0.0,55.25,3.0,0.0,119125.0,5000.0,0.035594
50%,1359.0,167000.0,57460.0,1.0,79.0,4.0,199.0,170000.0,25000.0,0.156011
75%,1884.75,270000.0,77530.0,1.0,103.0,5.0,592.0,260000.0,63262.5,0.341727
max,2554.0,1600000.0,97490.0,1.0,191.0,9.0,4090.0,1500000.0,1103400.0,3.868182


## Linear regression

In [37]:
data_test['Transaction cost prediction'] = data_test.apply(lambda x : pricePrediction(x, LG), axis=1)
data_test['Prediction error'] = data_test.apply(lambda x:
                            (np.abs(x['Transaction cost'] - x['Transaction cost prediction'])),axis=1)
data_test['Relative error'] = data_test.apply(lambda x:
                            (x['Prediction error'] / x['Transaction cost']),axis=1)

In [38]:
data_test.describe()

Unnamed: 0,Transaction date,Transaction cost,Postcode,Type of property,Built surface,Number of rooms,Ground surface,Transaction cost prediction,Prediction error,Relative error
count,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0
mean,1319.377966,229298.4,53522.245763,0.559322,81.825424,3.547458,389.09322,230953.7,56879.94,0.306868
std,698.777665,202425.8,27260.069294,0.49689,37.296386,1.486882,558.221399,168797.2,81286.17,0.418995
min,15.0,60000.0,1130.0,0.0,14.0,1.0,0.0,-70353.87,19.25847,0.000108
25%,744.25,116050.0,31417.5,0.0,55.25,3.0,0.0,133366.2,16108.49,0.08579
50%,1359.0,167000.0,57460.0,1.0,79.0,4.0,199.0,179102.0,33266.36,0.190107
75%,1884.75,270000.0,77530.0,1.0,103.0,5.0,592.0,268274.1,69300.32,0.355468
max,2554.0,1600000.0,97490.0,1.0,191.0,9.0,4090.0,1452202.0,1042620.0,4.605916


## Elastic net regression

In [39]:
data_test['Transaction cost prediction'] = data_test.apply(lambda x : pricePrediction(x, E_Net), axis=1)
data_test['Prediction error'] = data_test.apply(lambda x:
                            (np.abs(x['Transaction cost'] - x['Transaction cost prediction'])),axis=1)
data_test['Relative error'] = data_test.apply(lambda x:
                            (x['Prediction error'] / x['Transaction cost']),axis=1)

In [40]:
data_test.describe()

Unnamed: 0,Transaction date,Transaction cost,Postcode,Type of property,Built surface,Number of rooms,Ground surface,Transaction cost prediction,Prediction error,Relative error
count,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0
mean,1319.377966,229298.4,53522.245763,0.559322,81.825424,3.547458,389.09322,231158.4,56728.25,0.30665
std,698.777665,202425.8,27260.069294,0.49689,37.296386,1.486882,558.221399,169133.4,81140.1,0.419906
min,15.0,60000.0,1130.0,0.0,14.0,1.0,0.0,-85761.17,3.317892,2e-05
25%,744.25,116050.0,31417.5,0.0,55.25,3.0,0.0,134154.9,15609.15,0.082519
50%,1359.0,167000.0,57460.0,1.0,79.0,4.0,199.0,179177.4,32818.41,0.190017
75%,1884.75,270000.0,77530.0,1.0,103.0,5.0,592.0,268687.5,68278.33,0.357868
max,2554.0,1600000.0,97490.0,1.0,191.0,9.0,4090.0,1475017.0,1044644.0,4.606037


## While?

In [71]:
def estimator(obj, model):
    data_geo = data_train[(data_train['Postcode'] >(obj["Postcode"]-2))
                  &(data_train['Postcode'] <(obj["Postcode"]+2))
                  &(data_train['Type of property'] == obj['Type of property'])]
    data_geo=data_geo.assign(distance=2)
    data_geo['distance'] = data_geo.apply(calc_distance, axis = 1)
    
    n=20
    min_rmse = 1000000000
    estimation = 0
    
    while n<900:
        data_geo_reg=data_geo.sort_values(by="distance").head(n)
        X = data_geo_reg[['Transaction date','Built surface','Ground surface', 'Number of rooms']]
        y = data_geo_reg['Transaction cost']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=9)
        
        model.fit(X_train, y_train) 
        pred = model.predict(X_test)
        test_set_rmse = (np.sqrt(mean_squared_error(y_test, pred))) 
        if test_set_rmse < min_rmse:
            min_rmse=test_set_rmse
            input = obj[['Transaction date','Built surface','Ground surface','Number of rooms']]
            input = np.array(input).reshape(1, -1)
            estimation = model.predict(input)
            n_min=n
        n = n + 30
    return estimation[0]

In [75]:
data_test['Transaction cost prediction'] = data_test.apply(lambda x : estimator(x, RF_Regressor), axis=1)
data_test['Prediction error'] = data_test.apply(lambda x:
                            (np.abs(x['Transaction cost'] - x['Transaction cost prediction'])),axis=1)
data_test['Relative error'] = data_test.apply(lambda x:
                            (x['Prediction error'] / x['Transaction cost']),axis=1)

140
[91825.]
80
[1212540.]
830
[103632.6538]
50
[188773.1222]
140
[376600.]
20
[241000.]
20
[237670.]
20
[199654.]
80
[182697.2]
230
[331864.3]
20
[213470.]
80
[152000.]
650
[171376.78]
20
[611861.]
20
[244674.]
20
[113439.4]
260
[130868.35]
20
[160415.22]
50
[108466.]
20
[747600.]
50
[295861.2]
20
[136702.]
50
[149251.5]
20
[182112.]
80
[178743.]
50
[466530.]
50
[217239.]
20
[674820.9]
140
[309907.5]
200
[70038.7]
50
[401706.]
20
[164028.874]
350
[144891.7]
230
[186240.]
110
[130320.]
20
[226840.5]
20
[253915.]
110
[93674.]
290
[144534.84]
140
[108754.31]
80
[264714.]
290
[105216.2]
20
[969498.]
20
[123129.]
140
[118171.04]
230
[106652.]
50
[175041.3]
470
[139133.44]
50
[208396.7]
20
[250830.]
20
[84040.]
50
[98550.]
50
[423751.3356]
140
[297648.]
50
[672270.4]
20
[152119.5]
110
[98785.8]
50
[135433.]
170
[212737.6]
20
[96964.34]
20
[117015.]
20
[118227.31]
230
[250987.49]
320
[130530.]
80
[898935.4]
20
[292244.]
20
[135370.]
20
[91652.01]
140
[259177.04]
350
[132793.46]
20
[147580.]


In [80]:
data_test.describe()

Unnamed: 0,Transaction date,Transaction cost,Postcode,Type of property,Built surface,Number of rooms,Ground surface,Transaction cost prediction,Prediction error,Relative error
count,1771.0,1771.0,1771.0,1771.0,1771.0,1771.0,1771.0,1771.0,1771.0,1771.0
mean,1356.889328,227402.7,52990.111801,0.557877,80.226426,3.511011,399.551666,223471.0,44074.025137,0.229172
std,687.306291,188185.4,27052.168122,0.496779,36.950939,1.46147,577.70311,168944.4,66119.011249,0.333025
min,1.0,60000.0,1170.0,0.0,10.0,1.0,0.0,62355.0,0.0,0.0
25%,799.5,120000.0,32430.0,0.0,53.0,2.0,0.0,126071.1,9420.95,0.05581
50%,1386.0,170000.0,54220.0,1.0,77.0,4.0,203.0,173150.0,23400.0,0.135378
75%,1934.0,263185.0,76620.0,1.0,103.0,4.0,554.0,258069.1,54476.0,0.291526
max,2555.0,1540000.0,97460.0,1.0,190.0,8.0,4236.0,1448500.0,889964.3,4.438496
