In [82]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import re

In [83]:
dataOrg = pd.read_csv('Apartments_koopen.csv')

In [84]:
columnsToSelect = ['app', 'area', 'houseTypeValue', 'lotSize', 'price']
data = dataOrg.loc[:, columnsToSelect]

In [85]:
# Housenumber is not relevant, but street name and zip code might be relevant for our model:
import ast
def extract_street(location):
    street = ast.literal_eval(location)['street']
    return street

def extract_zipcode(location):
    zipcode = ast.literal_eval(location)['zipcode']
    return zipcode

# lotSize column containts 'Onbekend', replace them with 0, and convert other data to integer values:
def correct_lotSize(x):
    if x == "Onbekend":
           x = int(0)
    else:
        x = int(x)
    return x

In [86]:
data.loc[:, 'street'] = dataOrg.location.apply(extract_street)
data.loc[:, 'zipcode'] = dataOrg.location.apply(extract_zipcode)
data.loc[:, 'lotSize'] = dataOrg.lotSize.apply(correct_lotSize)

# We're interested in price per squared meter:
data.loc[:, 'PricePA'] = data.price/data.area

# Street is already contained in zipcode. We do not want to introduce collinearity:
#data = data.drop(columns = ['street', 'area', 'price'])



In [87]:
data.to_csv('HousingAmsterdam.csv', sep = ',')

In [88]:
data.head()

Unnamed: 0,app,area,houseTypeValue,lotSize,price,street,zipcode,PricePA
0,False,212,Herenhuis,91,1049000,Scheepstimmermanstraat 6,1019WX,4948.113208
1,False,302,Vrijstaande woning,1470,1395000,Zuideinde 286,1035PM,4619.205298
2,False,103,Eengezinswoning,265,700000,Durgerdammerdijk 119,1026CG,6796.116505
3,False,150,Tussenwoning,167,539080,Diopter 80,1025MS,3593.866667
4,False,285,Tussenwoning,90,1650000,Eerste Helmersstraat 201,1054DV,5789.473684


In [89]:
# Check how many properties there are per zip code:
t = data.pivot_table(index = 'zipcode', values = 'PricePA', aggfunc = 'count')
t.columns = ['total_count']
t

Unnamed: 0_level_0,total_count
zipcode,Unnamed: 1_level_1
1012EX,86
1013JK,86
1013ZM,86
1015HG,86
1019WX,86
1025MS,86
1026CG,86
1028AZ,86
1033KS,86
1034SN,86


We can see that there are, in all cases, exactly 89 properties per zipcode (as Rafael said). This is very unusual, the possibility that the properties are so evenly distributed across zipcodes is very unlikely. I have to admit that I do not have any idea on why this is so, but I think it is important to think about it. I'd suggest we re-trace our steps, and see whether we did something wrong in the process of gathering the data.

# Linear Model

## Simple Linear Regression

In [90]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [91]:
# make copy, cos' you never know:
df = data.copy()

In [92]:
# Make dummies:
df = pd.get_dummies(df, columns = ['app', 'houseTypeValue', 'zipcode'])

In [93]:
# Avoid dummy variable trap:
df.drop(columns = ['app_False', 'houseTypeValue_Penthouse', 'zipcode_1068MS'], inplace = True)

In [94]:
df.head()

Unnamed: 0,area,lotSize,price,street,PricePA,app_True,houseTypeValue_Appartement,houseTypeValue_Bovenwoning,houseTypeValue_Bungalow,houseTypeValue_Eengezinswoning,...,zipcode_1067SJ,zipcode_1068MZ,zipcode_1071BE,zipcode_1072BK,zipcode_1072ER,zipcode_1083TP,zipcode_1087BB,zipcode_1091BD,zipcode_1091CZ,zipcode_1098NA
0,212,91,1049000,Scheepstimmermanstraat 6,4948.113208,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,302,1470,1395000,Zuideinde 286,4619.205298,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,103,265,700000,Durgerdammerdijk 119,6796.116505,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,150,167,539080,Diopter 80,3593.866667,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,285,90,1650000,Eerste Helmersstraat 201,5789.473684,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [95]:
# Create model matrix and response variable:
X = df.drop(columns = ['PricePA']).copy()
y = df.PricePA.copy()

In [96]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.7)

In [97]:
# Building a regressor
regressor = linear_model.LinearRegression()

In [98]:
# Fitting the model
model = regressor.fit(X_train, y_train)

ValueError: could not convert string to float: 'Zuideinde 286'

In [None]:
# Making prediction:
y_pred = model.predict(X_test)

In [None]:
pd.DataFrame(data = {'pred':y_pred, 'test':y_test}).head()

This accuracy is extremely weird. The model is impossibly accurate. Next, I will try lasso and see how it compares to this.

## Lasso Regression with cross-validation

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

In [None]:
regressor2 = linear_model.LassoCV(alphas = np.arange(0.01, 10, 0.1), normalize = True, n_jobs = -1, max_iter=10000, cv=10)

In [None]:
modelLasso = regressor2.fit(X_train, y_train)

In [None]:
y_pred_lasso = modelLasso.predict(X_test)

In [None]:
np.sum(y_pred_lasso-y_test)

In [None]:
plt.scatter(y_pred_lasso, y_test)
plt.xlabel('y predicted')
plt.ylabel('y test')
plt.show()
print('R^2 of the method is {0:.2f}'.format(r2_score(y_test, y_pred_lasso)))

In [None]:
pd.DataFrame({'variables':X.columns, 'coeffitients':modelLasso.coef_})