In [1]:
import os

import numpy as np
import pandas as pd

dir_path  = os.path.dirname(os.path.realpath(__file__))
data_path = os.path.join(dir_path, "data")

## training
file_path = os.path.join(data_path, "python_training.csv")

df = pd.read_csv(file_path, sep=';')
# filtering outliers (> 12ke per sqm)
df = df.loc[df['pricesqm']<12000]
df.head()

def days_from_present(x):
    measureDay = pd.Timestamp('20190101')
    td = pd.to_datetime(x,yearfirst=True) - measureDay
    return td.days

def processing(df):
    # handling dates
    df['date'] = '2000-01-01'
    if type(df) == type(pd.DataFrame()):
        df['date'] = df['date'].apply(days_from_present)
    else:
        df['date'] = days_from_present(df['date'])

    # encoding asset category column
    if type(df) == type(pd.DataFrame()):
        types = ['Housing / Retail', 'Light Industrial', 'Light Industrial / Office', 'Office', 'Retail']
        for t in types:
            df[t] = 0
            df.loc[df['asset_type']==t, t] = 1
        df = df.drop('asset_type', axis=1)
    else:
        types = ['Housing / Retail', 'Light Industrial', 'Light Industrial / Office', 'Office', 'Retail']
        for t in types:
            df[t] = 0
            if df['asset_type']==t: df[t] = 1
        df = df.drop(labels=['asset_type'])

    return(df)

df = processing(df)

from sklearn.base import BaseEstimator, TransformerMixin

class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

date =  Pipeline([
                ('date', NumberSelector(key='date')),
                ('standard', StandardScaler())
            ])
surface =  Pipeline([
                ('surface', NumberSelector(key='surface')),
                ('standard', StandardScaler())
            ])
latitude =  Pipeline([
                ('latitude', NumberSelector(key='latitude')),
                ('standard', StandardScaler())
            ])
longitutde =  Pipeline([
                ('longitutde', NumberSelector(key='longitutde')),
                ('standard', StandardScaler())
            ])
asset_type_hr =  Pipeline([
                ('asset_type_hr', NumberSelector(key='Housing / Retail'))
            ])
asset_type_li =  Pipeline([
                ('asset_type_li', NumberSelector(key='Light Industrial'))
            ])
asset_type_lio =  Pipeline([
                ('asset_type_lio', NumberSelector(key='Light Industrial / Office'))
            ])
asset_type_o =  Pipeline([
                ('asset_type_o', NumberSelector(key='Office'))
            ])
asset_type_r =  Pipeline([
                ('asset_type_r', NumberSelector(key='Retail'))
            ])

from sklearn.pipeline import FeatureUnion

feats = FeatureUnion([('date', date),
                      ('surface', surface),
                      ('latitude', latitude),
                      ('longitutde', longitutde),
                      ('asset_type_hr', asset_type_hr),
                      ('asset_type_li', asset_type_li),
                      ('asset_type_lio', asset_type_lio),
                      ('asset_type_o', asset_type_o),
                      ('asset_type_r', asset_type_r),
                    ])

feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(df.drop('pricesqm', axis=1))

from sklearn.neighbors import KNeighborsRegressor

model = KNeighborsRegressor(n_neighbors=5)

model.fit(feature_processing.transform(df.drop('pricesqm', axis=1)), np.array(df['pricesqm']))


## predicting

file_path = os.path.join(data_path, "python_training.csv")
df = pd.read_csv(file_path, sep=';')
# filtering outliers (> 12ke per sqm)
df = df.loc[df['pricesqm']<12000]

file_path = os.path.join(data_path, "python_to_predict.csv")
X = pd.read_csv(file_path, sep=';')
X_ = processing(X)
X_ = feature_processing.transform(X_)

estimate = int(model.predict(X_))
kn = model.kneighbors(X_, n_neighbors=10, return_distance=True)

ids = [x for x in kn[1][0]]
distances = [x for x in kn[0][0]]
estimates = [float(df.iloc[int(x)]['pricesqm']) for x in ids]

ids.insert(0,int(X['id']))
distances.insert(0,np.mean(distances)) # average distance from 10 nearest --> confidence index
estimates.insert(0, estimate)

result = {'id': ids,
          'estimate': estimates,
          'distance': distances
         }
result = pd.DataFrame(data=result)

file_path = os.path.join(data_path, "python_predicted.csv")
# result.to_csv(file_path, sep=';', index = False)


NameError: name '__file__' is not defined

## training

In [429]:
df = pd.read_csv('../../db/python_training.csv', sep=';')
# filtering outliers (> 12ke per sqm)
df = df.loc[df['pricesqm']<12000]
df.head()

Unnamed: 0,id,date,asset_type,surface,latitude,longitutde,pricesqm
0,2,1000-01-01,Office,7288.0,48.792089,2.044248,1525.795829
1,3,1000-01-01,Office,1680.0,48.88335,2.263787,7440.47619
2,4,1000-01-01,Light Industrial,2022.0,48.532491,2.679602,667.655786
3,6,1000-01-01,Light Industrial,11378.0,48.957096,2.205695,729.47794
4,8,1000-01-01,Light Industrial,1615.0,48.950257,2.208497,773.993808


In [430]:
def days_from_present(x):
    measureDay = pd.Timestamp('20190101')
    td = pd.to_datetime(x,yearfirst=True) - measureDay
    return td.days

def processing(df):
    # handling dates
    df['date'] = '2000-01-01'
    if type(df) == type(pd.DataFrame()):
        df['date'] = df['date'].apply(days_from_present)
    else:
        df['date'] = days_from_present(df['date'])
        
    # encoding asset category column
    if type(df) == type(pd.DataFrame()):
        types = ['Housing / Retail', 'Light Industrial', 'Light Industrial / Office', 'Office', 'Retail']
        for t in types: 
            df[t] = 0
            df.loc[df['asset_type']==t, t] = 1
        df = df.drop('asset_type', axis=1)
    else: 
        types = ['Housing / Retail', 'Light Industrial', 'Light Industrial / Office', 'Office', 'Retail']
        for t in types: 
            df[t] = 0
            if df['asset_type']==t: df[t] = 1
        df = df.drop(labels=['asset_type'])
    
    return(df)

df = processing(df)

df.head()


Unnamed: 0,id,date,surface,latitude,longitutde,pricesqm,Housing / Retail,Light Industrial,Light Industrial / Office,Office,Retail
0,2,-6940,7288.0,48.792089,2.044248,1525.795829,0,0,0,1,0
1,3,-6940,1680.0,48.88335,2.263787,7440.47619,0,0,0,1,0
2,4,-6940,2022.0,48.532491,2.679602,667.655786,0,1,0,0,0
3,6,-6940,11378.0,48.957096,2.205695,729.47794,0,1,0,0,0
4,8,-6940,1615.0,48.950257,2.208497,773.993808,0,1,0,0,0


In [431]:
from sklearn.base import BaseEstimator, TransformerMixin
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [499]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

date =  Pipeline([
                ('date', NumberSelector(key='date')),
                ('standard', StandardScaler())
            ])
surface =  Pipeline([
                ('surface', NumberSelector(key='surface')),
                ('standard', StandardScaler())
            ])
latitude =  Pipeline([
                ('latitude', NumberSelector(key='latitude')),
                ('standard', StandardScaler())
            ])
longitutde =  Pipeline([
                ('longitutde', NumberSelector(key='longitutde')),
                ('standard', StandardScaler())
            ])
asset_type_hr =  Pipeline([
                ('asset_type_hr', NumberSelector(key='Housing / Retail'))
            ])
asset_type_li =  Pipeline([
                ('asset_type_li', NumberSelector(key='Light Industrial'))
            ])
asset_type_lio =  Pipeline([
                ('asset_type_lio', NumberSelector(key='Light Industrial / Office'))
            ])
asset_type_o =  Pipeline([
                ('asset_type_o', NumberSelector(key='Office'))
            ])
asset_type_r =  Pipeline([
                ('asset_type_r', NumberSelector(key='Retail'))
            ])

from sklearn.pipeline import FeatureUnion

feats = FeatureUnion([('date', date), 
                      ('surface', surface),
                      ('latitude', latitude),
                      ('longitutde', longitutde),
                      ('asset_type_hr', asset_type_hr),
                      ('asset_type_li', asset_type_li),
                      ('asset_type_lio', asset_type_lio),
                      ('asset_type_o', asset_type_o),
                      ('asset_type_r', asset_type_r),
                    ])

feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(df.drop('pricesqm', axis=1))

array([[ 0.        ,  0.9709117 , -0.14288451, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.        , -0.38975869,  0.1090155 , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.        , -0.30677915, -0.85943223, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        , -0.42615322, -0.626403  , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -0.65762247,  0.183394  , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -0.27814878, -0.59165324, ...,  0.        ,
         0.        ,  1.        ]])

In [530]:
from sklearn.neighbors import KNeighborsRegressor

model = KNeighborsRegressor(n_neighbors=5)

model.fit(feature_processing.transform(df.drop('pricesqm', axis=1)), np.array(df['pricesqm']))

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

## predicting

In [534]:
df = pd.read_csv('../../db/python_training.csv', sep=';')
# filtering outliers (> 12ke per sqm)
df = df.loc[df['pricesqm']<12000]

X = pd.read_csv('../../db/python_to_predict.csv', sep=';')
X_ = processing(X)
X_ = feature_processing.transform(X_)
X_

array([[ 0.        , -0.45090151, -0.07722499,  0.18126974,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ]])

In [535]:
estimate = int(model.predict(X_))
kn = model.kneighbors(X_, n_neighbors=10, return_distance=True)

ids = [x for x in kn[1][0]]
distances = [x for x in kn[0][0]]
estimates = [float(df.iloc[int(x)]['pricesqm']) for x in ids]

ids.insert(0,int(X['id']))
distances.insert(0,np.mean(distances)) # average distance from 10 nearest --> confidence index
estimates.insert(0, estimate)

In [536]:
result = {'id': ids,
          'estimate': estimates,
          'distance': distances
         }
result = pd.DataFrame(data=result)
result

Unnamed: 0,id,estimate,distance
0,50,3020.0,1.054092
1,25,3501.40056,1.0
2,98,604.873803,1.022603
3,49,3829.432568,1.033038
4,76,4622.222222,1.052793
5,120,2545.986622,1.059563
6,114,5585.333333,1.066727
7,125,6036.217304,1.071088
8,81,3072.033898,1.07419
9,37,1614.048934,1.07799


In [537]:
result.to_csv('../../db/python_predicted.csv', sep=';', index = False)