**We fetch the data**

We select one city to start with

In [204]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
# Read the first 13815 listings, keep only the modelling columns (the last one,
# 'price', is the target) and discard every row with a missing value.
data = (
    pd.read_csv('listings.csv.gz', nrows=13815, compression='gzip')
    .loc[:, [
        'accommodates',
        'bedrooms',
        'beds',
        'review_scores_rating',
        'review_scores_location',
        'review_scores_value',
        'property_type',
        'room_type',
        'price',
    ]]
    .dropna()
)
data.shape

(11817, 9)

**We define a function that encodes string features as integer codes**

In [205]:
def string_features_to_num(array):
    """Encode the values of a 1-D sequence as numeric category codes.

    Each distinct value is mapped to its position in the sorted sequence of
    distinct values (as produced by ``np.unique``), so equal inputs always
    receive the same code.

    Parameters
    ----------
    array : array-like
        One-dimensional sequence of sortable values (e.g. strings).

    Returns
    -------
    numpy.ndarray
        Float array with one entry per input element, holding the codes
        ``0.0 .. k-1`` where ``k`` is the number of distinct values.
    """
    # return_inverse gives, for every element, its index into the sorted
    # unique values -- exactly the wanted code -- in one call, instead of
    # recomputing np.unique for each dictionary entry and looping in Python.
    _, codes = np.unique(np.asarray(array), return_inverse=True)
    # Preserve the float dtype of the previous np.zeros-based result.
    return codes.reshape(-1).astype(float)

**Data split into features and target**

We also change the target from a string to a float. 

In [206]:
import re

# Convert the frame to a plain array: every column except the last is a
# feature, the last column is the target.
datanp = np.array(data)
X = datanp[:, :-1]

# The target is stored as text (presumably formatted like "$1,250.00" -- TODO
# confirm). Remove everything that is not a digit or a decimal point instead of
# blindly dropping the first character, which would eat a digit whenever the
# currency symbol is absent.
y = np.array(
    [float(re.sub(r"[^0-9.]", "", str(price))) for price in datanp[:, -1]],
    dtype=float,
)

# The last two feature columns hold strings; replace them by numeric codes.
X[:, -2] = string_features_to_num(X[:, -2])
X[:, -1] = string_features_to_num(X[:, -1])

# Every column is numeric now, so drop the object dtype inherited from the
# mixed-type frame.
X = X.astype(float)

**Data split into train/val/test**

In [207]:
# Hold out 20% of the samples as the final test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)
# A quarter of the remaining 80% becomes the validation set, i.e. a
# 60/20/20 train/validation/test split overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.25,
    random_state=42,
)

**Models imported**

In [208]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

# Baseline: linear regression with scikit-learn's default settings.
linear_model = LinearRegression()
# Support-vector regressor with hand-set C and epsilon values.
svr_model = SVR(epsilon=0.95, C=60)

**Models fit and scored**

We fit both models on the training set and evaluate them on the training and held-out sets.

We perform a small grid search over the `C` and `epsilon` parameters of the SVR.

In [209]:
# Fit the linear model on the training split and record its score on each
# of the three splits.
linear_model.fit(X_train, y_train)
linear_train_score = linear_model.score(X_train, y_train)
linear_val_score = linear_model.score(X_val, y_val)
linear_test_score = linear_model.score(X_test, y_test)

# Same procedure for the SVR, including the validation split so both models
# are evaluated on identical data.
svr_model.fit(X_train, y_train)
svr_train_score = svr_model.score(X_train, y_train)
svr_val_score = svr_model.score(X_val, y_val)
svr_test_score = svr_model.score(X_test, y_test)

# Each line prints one model's scores in the order: train, test, validation
# (linear model first, then SVR).
print(linear_train_score, linear_test_score, linear_val_score)
print(svr_train_score, svr_test_score, svr_val_score)

0.1000303501137122 0.11155346505636221 0.05469071823857308
