In [1]:
import pandas as pd
import pickle
import numpy as np

In [2]:
# Load the cleaned Brandenburg data set and preview its first rows.
brandenburg_csv_path = '../Data/brandenburg_cleaned.csv'
df_brandenburg = pd.read_csv(brandenburg_csv_path)
df_brandenburg.head()

Unnamed: 0,land,city,balcony,living_space,total_rent,service_charge,base_rent,lift,pets_allowed,floor,no_rooms,garden,cellar,noParkSpaces
0,Brandenburg,Oder_Spree_Kreis,True,36.29,335.0,50.0,285.0,True,negotiable,1.0,1.0,False,True,0.0
1,Brandenburg,Märkisch_Oderland_Kreis,True,60.71,365.0,65.0,300.0,False,no,0.0,2.0,False,True,0.0
2,Brandenburg,Brandenburg_an_der_Havel,True,67.8,590.0,170.0,420.0,True,yes,0.0,3.0,False,True,0.0
3,Brandenburg,Dahme_Spreewald_Kreis,True,69.0,520.0,150.0,370.0,False,negotiable,2.0,2.0,True,True,1.0
4,Brandenburg,Uckermark_Kreis,True,57.38,378.71,71.73,306.98,False,negotiable,4.0,3.0,False,True,0.0


The features most positively correlated with the rent amount (correlation >= 0.3) are living_space, no_rooms and noParkSpaces.

In [3]:
# Keep the target (total_rent) plus the columns used as features.
# .copy() gives an independent frame, so later changes do not touch df_brandenburg.
model_columns = ['city', 'total_rent', 'living_space', 'no_rooms', 'noParkSpaces']
brb_df = df_brandenburg[model_columns].copy()

## Processing Data

In [4]:
# Preview the first five rows of the modelling frame.
brb_df.head(n=5)

Unnamed: 0,city,total_rent,living_space,no_rooms,noParkSpaces
0,Oder_Spree_Kreis,335.0,36.29,1.0,0.0
1,Märkisch_Oderland_Kreis,365.0,60.71,2.0,0.0
2,Brandenburg_an_der_Havel,590.0,67.8,3.0,0.0
3,Dahme_Spreewald_Kreis,520.0,69.0,2.0,1.0
4,Uckermark_Kreis,378.71,57.38,3.0,0.0


In [5]:
# X-y split: total_rent is the target, every remaining column is a feature.
y = brb_df['total_rent']
X = brb_df.drop(columns='total_rent')

In [6]:
# Train-test split: hold out 20% of the rows for evaluation.
from sklearn.model_selection import train_test_split as tts

# A fixed random_state makes the shuffle reproducible, so the fitted models and the
# scores computed from this split are the same on every "Restart & Run All".
# Without it, each run produces a different split and different metrics.
X_train, X_test, y_train, y_test = tts(X, y, test_size=.2, random_state=42)

In [7]:
# Split the features into numerical and categorical (object-dtype) parts.
# select_dtypes(include='number') is the public API replacing the private
# DataFrame._get_numeric_data(); for the float feature columns selected into
# brb_df it picks the same columns.
X_train_num = X_train.select_dtypes(include='number')
X_train_cat = X_train.select_dtypes(include='object')
X_test_num = X_test.select_dtypes(include='number')
X_test_cat = X_test.select_dtypes(include='object')


In [8]:
# Scaling numerical data: the scaler is fitted on the training split only and
# then applied to both splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train_num)

X_train_scaled_np, X_test_scaled_np = (
    scaler.transform(numeric_part) for numeric_part in (X_train_num, X_test_num)
)

# Wrap the arrays back into DataFrames with the original column names.
# No index is passed, so both frames get a fresh default (0..n-1) index.
X_train_num_scaled_df = pd.DataFrame(data=X_train_scaled_np, columns=X_train_num.columns)
X_test_num_scaled_df = pd.DataFrame(data=X_test_scaled_np, columns=X_test_num.columns)

In [9]:
# Transforming categorical data with one-hot encoding.
from sklearn.preprocessing import OneHotEncoder

# Re-derive the raw categorical columns from X_train / X_test so this cell is
# safe to re-run: X_train_cat / X_test_cat are overwritten below with the
# encoded frames and must not be fed back into the encoder.
train_cat_raw = X_train.select_dtypes('object')
test_cat_raw = X_test.select_dtypes('object')

# handle_unknown='ignore' keeps transform() from raising when the test split
# (or later input) contains a category that was absent from the training split;
# such rows are encoded as all zeros. Combining it with drop='first' needs
# scikit-learn >= 1.0, which get_feature_names_out() already requires.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore').fit(train_cat_raw)

encoded_names = encoder.get_feature_names_out()

# transform() returns a sparse matrix; densify it before building the frames.
X_train_cat = pd.DataFrame(encoder.transform(train_cat_raw).toarray(), columns=encoded_names)
X_test_cat = pd.DataFrame(encoder.transform(test_cat_raw).toarray(), columns=encoded_names)

In [10]:
# Join the scaled numerical and the encoded categorical features column-wise.
# Both inputs are built without an explicit index, so they align row by row
# on their default integer index.
X_train_f = pd.concat((X_train_num_scaled_df, X_train_cat), axis='columns')
X_test_f = pd.concat((X_test_num_scaled_df, X_test_cat), axis='columns')

##### Store scalers

In [13]:
#with open("../Scalers/standard_scaler.pkl", "wb") as file:
#    pickle.dump(scaler, file, pickle.HIGHEST_PROTOCOL)

#### Store Transformer

In [15]:
#with open("../Transformers/one_hot_encoder.pkl", "wb") as file:
#    pickle.dump(encoder, file, pickle.HIGHEST_PROTOCOL)

### Linear Regression

In [14]:
# Fit a linear regression on the prepared training features.
from sklearn.linear_model import LinearRegression as LinReg

# fit() returns the estimator itself, so construction and training can be chained.
linreg = LinReg().fit(X_train_f, y_train)

# Predictions for both splits.
y_train_pred_linreg = linreg.predict(X_train_f)
y_test_pred_linreg = linreg.predict(X_test_f)

In [15]:
# storing the linear model
#with open("../Models/Linear.pkl", "wb") as file:
 #   pickle.dump(linreg, file, pickle.HIGHEST_PROTOCOL)

### K-NN

In [16]:
# help(KNeighborsRegressor)

In [17]:
# K-nearest-neighbours regressor: 15 neighbours, weighted by distance.
from sklearn.neighbors import KNeighborsRegressor

regressor = KNeighborsRegressor(
    weights='distance',
    n_neighbors=15,
)
# Left as the last expression so the fitted estimator is displayed as cell output.
regressor.fit(X_train_f, y_train)

In [18]:
# KNN predictions for the test and training splits (in that order).
y_test_pred_knn, y_train_pred_knn = (
    regressor.predict(feature_frame) for feature_frame in (X_test_f, X_train_f)
)

In [19]:
# storing the KNN model
#with open("../Models/knn.pkl", "wb") as file:
#    pickle.dump(regressor, file, pickle.HIGHEST_PROTOCOL)

## Model Validation - Comparing Linear and KNN

In [23]:
# Report R2 on the training and test splits for each fitted model.
for model_label, fitted_model in (('Linear', linreg), ('KNN', regressor)):
    train_r2 = fitted_model.score(X_train_f, y_train)
    test_r2 = fitted_model.score(X_test_f, y_test)
    print('{}: train R2: {} -- test R2: {}'.format(model_label, train_r2, test_r2))

Linear: train R2: 0.702711868203153 -- test R2: 0.6711008808731851
KNN: train R2: 0.9826771109528131 -- test R2: 0.6718832275006406


In [24]:
# Mean squared error (and its square root, RMSE) for both models on both splits.
from sklearn.metrics import mean_squared_error as mse

# sklearn metrics take (y_true, y_pred) in that order. The predictions stored
# when each model was fitted are reused instead of calling predict() again.
train_mse = mse(y_train, y_train_pred_linreg)
test_mse = mse(y_test, y_test_pred_linreg)

print ('Liniar : train MSE: {} -- test MSE: {}'.format(train_mse, test_mse))
print ('Liniar:train RMSE: {} -- test RMSE: {}'.format(train_mse**.5, test_mse**.5))

# train_mse / test_mse are reassigned here and hold the KNN values afterwards.
train_mse = mse(y_train, y_train_pred_knn)
test_mse = mse(y_test, y_test_pred_knn)

print ('KNN : train MSE: {} -- test MSE: {}'.format(train_mse, test_mse))
print ('KNN: train RMSE: {} -- test RMSE: {}'.format(train_mse**.5, test_mse**.5))

Liniar : train MSE: 12401.779957626051 -- test MSE: 14106.69878442619
Liniar:train RMSE: 111.3632792154849 -- test RMSE: 118.77162449182123
KNN : train MSE: 722.6479472796158 -- test MSE: 14073.143424813718
KNN: train RMSE: 26.882112031602276 -- test RMSE: 118.63028038748672


In [25]:
# Mean absolute error for both models on both splits.
from sklearn.metrics import mean_absolute_error as mae

# sklearn metrics take (y_true, y_pred) in that order. The predictions stored
# when each model was fitted are reused instead of calling predict() again.
train_mae = mae(y_train, y_train_pred_linreg)
test_mae = mae(y_test, y_test_pred_linreg)

print ('Linear: train MAE: {} -- test MAE: {}'.format(train_mae, test_mae))

# train_mae / test_mae are reassigned here and hold the KNN values afterwards.
train_mae = mae(y_train, y_train_pred_knn)
test_mae = mae(y_test, y_test_pred_knn)

print ('KNN: train MAE: {} -- test MAE: {}'.format(train_mae, test_mae))

Linear: train MAE: 81.57288338180335 -- test MAE: 86.01133954172518
KNN: train MAE: 8.512774966975657 -- test MAE: 80.77195489568828


### Comparing KNN and Linear regression models in order to predict rent prices in Brandenburg.

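Looking at the R2 score, both models perform almost identically on the test set (about 0.67). However, the linear model has a slightly higher test MAE (about 86 vs. about 81 for KNN), so KNN is marginally more accurate by that metric. Note that KNN's much better training scores (R2 of about 0.98) do not carry over to the test set.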