In [1]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
# Display configuration: lift pandas' truncation limits so wide/long frames
# are rendered in full (None means "no limit" / auto-detect for width).
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = None

In [2]:
# Load the raw apartment listings; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="../data/rent_apartments.csv")

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,address,area,constraction_year,rooms,bedrooms,bathrooms,balcony,storage,parking,furnished,garage,garden,energy,facilities,zip,neighborhood,rent
0,1071 HN Amsterdam (Cornelis Schuytbuurt),167.0,1870,3,2,2,yes,no,no,yes,no,Not present,D,Roof terrace,1071 HN,Cornelis Schuytbuurt,4500
1,1071 HK Amsterdam (Concertgebouwbuurt),150.0,1890,3,2,2,yes,no,yes,yes,no,Not present,A,"Cable TV, Internet connection, Fireplace, Bath...",1071 HK,Concertgebouwbuurt,3450
2,1071 HK Amsterdam (Concertgebouwbuurt),150.0,1890,3,2,2,yes,no,yes,yes,no,Not present,A,"Cable TV, Internet connection, Fireplace, Bath...",1071 HK,Concertgebouwbuurt,3450
3,1071 WV Amsterdam (Hondecoeterbuurt),90.0,1923,3,2,1,yes,no,no,yes,no,Not present,,"Shower, Toilet",1071 WV,Hondecoeterbuurt,2000
4,1071 WV Amsterdam (Hondecoeterbuurt),104.0,1923,3,2,1,no,no,no,no,no,Present (47 m²),D,"Shower, Bath, Toilet",1071 WV,Hondecoeterbuurt,3250


In [4]:
# Inspect column dtypes to see which features are still strings (object)
# and need encoding before they can be fed to a model.
df.dtypes

address               object
area                 float64
constraction_year      int64
rooms                  int64
bedrooms               int64
bathrooms              int64
balcony               object
storage               object
parking               object
furnished             object
garage                object
garden                object
energy                object
facilities            object
zip                   object
neighborhood          object
rent                   int64
dtype: object

In [5]:
# Count missing values per column (sum of the boolean NA mask down the rows).
df.isna().sum(axis=0)

address                0
area                   0
constraction_year      0
rooms                  0
bedrooms               0
bathrooms              0
balcony                0
storage                0
parking                0
furnished              0
garage                 0
garden                 0
energy               624
facilities           420
zip                    0
neighborhood           0
rent                   0
dtype: int64

In [6]:
# One-hot encode the categorical amenity columns. drop_first=True drops the
# first category of each column, leaving a single indicator per feature
# (the `<name>_yes` columns shown by data_encoded.head()).
data_encoded = pd.get_dummies(
    data=df,
    columns=['balcony', 'parking', 'furnished', 'garage', 'storage'],
    drop_first=True,
)

In [7]:
# Preview the encoded frame: the encoded source columns are replaced by
# indicator columns appended at the end.
data_encoded.head(n=5)

Unnamed: 0,address,area,constraction_year,rooms,bedrooms,bathrooms,garden,energy,facilities,zip,neighborhood,rent,balcony_yes,parking_yes,furnished_yes,garage_yes,storage_yes
0,1071 HN Amsterdam (Cornelis Schuytbuurt),167.0,1870,3,2,2,Not present,D,Roof terrace,1071 HN,Cornelis Schuytbuurt,4500,True,False,True,False,False
1,1071 HK Amsterdam (Concertgebouwbuurt),150.0,1890,3,2,2,Not present,A,"Cable TV, Internet connection, Fireplace, Bath...",1071 HK,Concertgebouwbuurt,3450,True,True,True,False,False
2,1071 HK Amsterdam (Concertgebouwbuurt),150.0,1890,3,2,2,Not present,A,"Cable TV, Internet connection, Fireplace, Bath...",1071 HK,Concertgebouwbuurt,3450,True,True,True,False,False
3,1071 WV Amsterdam (Hondecoeterbuurt),90.0,1923,3,2,1,Not present,,"Shower, Toilet",1071 WV,Hondecoeterbuurt,2000,True,False,True,False,False
4,1071 WV Amsterdam (Hondecoeterbuurt),104.0,1923,3,2,1,Present (47 m²),D,"Shower, Bath, Toilet",1071 WV,Hondecoeterbuurt,3250,False,False,False,False,False


In [8]:
def parse_garden_area(value):
    """Convert a raw ``garden`` entry into a garden area as an integer.

    Raw entries look like ``'Not present'`` or ``'Present (47 m²)'``.

    Parameters
    ----------
    value : str or number
        A raw string from the ``garden`` column, or an already-converted
        numeric value.

    Returns
    -------
    int
        0 for ``'Not present'``, otherwise the first run of digits found in
        the string. Numeric input is returned as ``int(value)``.

    Raises
    ------
    ValueError
        If a string other than ``'Not present'`` contains no digits.
    """
    if not isinstance(value, str):
        # Already numeric: happens when this cell is re-run after the column
        # has been converted. Passing it through keeps the cell idempotent
        # instead of failing inside the regex call.
        return int(value)
    if value == 'Not present':
        return 0
    match = re.search(r'\d+', value)
    if match is None:
        raise ValueError(f"Cannot parse garden area from {value!r}")
    return int(match.group())


data_encoded['garden'] = data_encoded['garden'].apply(parse_garden_area)

In [9]:
# Sanity check: list the distinct parsed garden values.
data_encoded.loc[:, "garden"].unique()

array([  0,  47,  29,  75,  40,  50,  20,   1,  15,  25,  12,  45,  26,
        42,  46,  60,  16,  65,  90,  85, 500,  30,  49,  51,  80,  27,
        56,   9, 200,  32, 100,  34])

In [10]:
# Feature matrix: numeric attributes plus the amenity indicator columns.
feature_columns = [
    'area',
    'constraction_year',
    'bedrooms',
    'garden',
    'balcony_yes',
    'parking_yes',
    'furnished_yes',
    'garage_yes',
    'storage_yes',
]
# Regression target: the rent column.
target_column = 'rent'

X = data_encoded[feature_columns]
y = data_encoded[target_column]

In [11]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so Restart & Run All reproduces the same split
# (and therefore a comparable test score) on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [12]:
# Random forest with default hyperparameters. random_state seeds the
# estimator's internal randomness so repeated fits on the same data give
# the same model.
rf = RandomForestRegressor(random_state=42)

In [13]:
# Train the forest on the training split only.
rf.fit(X=X_train, y=y_train)

In [14]:
# Evaluate on the held-out split; for a regressor, score() reports R².
rf.score(X=X_test, y=y_test)

0.7928497572438217