In [35]:
import pandas as pd
import pickle

In [2]:
df_brandenburg = pd.read_csv('../Data/brandenburg_cleaned.csv')
df_brandenburg.head()

Unnamed: 0.1,Unnamed: 0,land,city,balcony,living_space,total_rent,service_charge,base_rent,lift,pets_allowed,floor,no_rooms,garden,cellar,noParkSpaces
0,0,Brandenburg,Oder_Spree_Kreis,True,36.29,335.0,50.0,285.0,True,negotiable,1.0,1.0,False,True,0.0
1,1,Brandenburg,Märkisch_Oderland_Kreis,True,60.71,365.0,65.0,300.0,False,no,0.0,2.0,False,True,0.0
2,2,Brandenburg,Brandenburg_an_der_Havel,True,67.8,590.0,170.0,420.0,True,yes,0.0,3.0,False,True,0.0
3,4,Brandenburg,Dahme_Spreewald_Kreis,True,69.0,520.0,150.0,370.0,False,negotiable,2.0,2.0,True,True,1.0
4,5,Brandenburg,Uckermark_Kreis,True,57.38,378.71,71.73,306.98,False,negotiable,4.0,3.0,False,True,0.0


the factors that most positively influence the rent amount (correlation> = 0.3) are: living space, no_rooms,noParkSpaces are clearly correlated

In [3]:
brb_df=df_brandenburg[['total_rent', 'living_space','no_rooms','noParkSpaces']].copy()

## Processing Data

In [4]:
brb_df.head()

Unnamed: 0,total_rent,living_space,no_rooms,noParkSpaces
0,335.0,36.29,1.0,0.0
1,365.0,60.71,2.0,0.0
2,590.0,67.8,3.0,0.0
3,520.0,69.0,2.0,1.0
4,378.71,57.38,3.0,0.0


In [5]:
# X-y split
X=brb_df.drop('total_rent', axis=1)
y=brb_df.total_rent

In [6]:
#train-test split
from sklearn.model_selection import train_test_split as tts

X_train, X_test, y_train, y_test=tts(X, y, test_size=.2)

In [7]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
scaler.fit(X_train)

X_train_scaled_np = scaler.transform(X_train)
X_test_scaled_np  = scaler.transform(X_test)

X_train_scaled_df = pd.DataFrame(X_train_scaled_np, columns=X.columns, index=X_train.index)
X_test_scaled_df  = pd.DataFrame(X_test_scaled_np, columns=X.columns, index=X_test.index)

##### Store scalers

In [36]:
with open("../Scalers/standard_scaler.pkl", "wb") as file:
    pickle.dump(scaler, file, pickle.HIGHEST_PROTOCOL)

### Linear Regression

In [39]:
#Apply linear regression
from sklearn.linear_model import LinearRegression as LinReg

linreg=LinReg()    # model
linreg.fit(X_train_scaled_df, y_train)   # model train
y_test_pred_linreg=linreg.predict(X_test_scaled_df)   # model prediction
y_train_pred_linreg=linreg.predict(X_train_scaled_df)   # model prediction

In [40]:
#stoing linear
with open("../Models/Linear.pkl", "wb") as file:
    pickle.dump(linreg, file, pickle.HIGHEST_PROTOCOL)

### K-NN

In [34]:
# help(KNeighborsRegressor)

In [28]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=15,weights='distance')
regressor.fit(X_train_scaled_df, y_train)

In [41]:
y_test_pred_knn = regressor.predict(X_test) # model prediction
y_train_pred_knn = regressor.predict(X_train) # model prediction

In [42]:
#stoing linear
with open("../Models/knn.pkl", "wb") as file:
    pickle.dump(regressor, file, pickle.HIGHEST_PROTOCOL)

## Model Validation - Comparing Linear and KNN

In [30]:
print ('Linear: train R2: {} -- test R2: {}'.format(linreg.score(X_train_scaled_df, y_train),
                                            linreg.score(X_test_scaled_df, y_test)))
print ('KNN: train R2: {} -- test R2: {}'.format(regressor.score(X_train_scaled_df, y_train),
                                            regressor.score(X_test_scaled_df, y_test)))

Linear: train R2: 0.5564988079299346 -- test R2: 0.5217731089432932
KNN: train R2: 0.8692766861492459 -- test R2: 0.5154401036567353


In [17]:
from sklearn.metrics import mean_squared_error as mse

train_mse=mse(linreg.predict(X_train_scaled_df), y_train)
test_mse=mse(linreg.predict(X_test_scaled_df), y_test)

print ('Liniar : train MSE: {} -- test MSE: {}'.format(train_mse, test_mse))
print ('Liniar:train RMSE: {} -- test RMSE: {}'.format(train_mse**.5, test_mse**.5))

train_mse=mse(regressor.predict(X_train_scaled_df), y_train)
test_mse=mse(regressor.predict(X_test_scaled_df), y_test)

print ('KNN : train MSE: {} -- test MSE: {}'.format(train_mse, test_mse))
print ('KNN: train RMSE: {} -- test RMSE: {}'.format(train_mse**.5, test_mse**.5))

Liniar : train MSE: 18922.5404091589 -- test MSE: 18750.833639008306
Liniar:train RMSE: 137.55922509653396 -- test RMSE: 136.93368336172188
KNN : train MSE: 6060.759870569123 -- test MSE: 20908.01472855156
KNN: train RMSE: 77.85088227225896 -- test RMSE: 144.59603980936532


In [18]:
from sklearn.metrics import mean_absolute_error as mae

train_mae=mae(linreg.predict(X_train_scaled_df), y_train)
test_mae=mae(linreg.predict(X_test_scaled_df), y_test)

print ('Linear: train MAE: {} -- test MAE: {}'.format(train_mse, test_mse))

train_mae=mae(regressor.predict(X_train_scaled_df), y_train)
test_mae=mae(regressor.predict(X_test_scaled_df), y_test)

print ('KNN: train MAE: {} -- test MAE: {}'.format(train_mse, test_mse))

Linear: train MAE: 6060.759870569123 -- test MAE: 20908.01472855156
KNN: train MAE: 6060.759870569123 -- test MAE: 20908.01472855156


### Comparing KNN and Linear regression models in order to predict rent prices in Brandeng.

It seems from error metrics that Linier model has better error metrics than KNN model.