In [1]:
import numpy as np
import pandas as pd
from itertools import product
import pickle

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [2]:
# lese inn data
data = pd.read_csv('data/huspris.csv')

In [3]:
# velge ut features
numeric_features = ['LotFrontage', 'LotArea', 'OverallQual', 'YrSold']
categorical_features = ['Street', 'HouseStyle', 'BsmtQual', 'GarageCond']

In [4]:
# del data i X og y
X = data.loc[:, numeric_features + categorical_features]
y = data.SalePrice

In [5]:
# dele data i trenings, validerings og testdata
X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, test_size=0.5, random_state=42)

In [6]:
# imputers
imputers = {
    'median': SimpleImputer(strategy='median'), 
    'knn': KNNImputer(n_neighbors=10)
}

In [7]:
# modeller
models = {'lr': LinearRegression(),
          'lasso': Lasso(alpha=1),
          'rf': RandomForestRegressor(), 
          'svm': SVR()}   

In [8]:
# kombiner
pipes = {imputer_key + '_' + regressor_key: Pipeline(
    steps=[('preprocess', ColumnTransformer(transformers=[
        ('num', Pipeline(steps = [('impute', imputer), ('scaler', StandardScaler())]), numeric_features),
        ('cat', Pipeline(steps = [('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))]), categorical_features)])),
           ('regress', regressor)])
         for (imputer_key, imputer), (regressor_key, regressor) in product(imputers.items(), models.items())}

In [9]:
# modelutvalg
validation_rmse = pd.DataFrame(
    {key: mean_squared_error(y_val, pipe.fit(X_train, y_train).predict(X_val), squared=False)
     for key, pipe in pipes.items()}.items(),
    columns=['model', 'rmse'])
best_model = pipes[validation_rmse.loc[np.argmin(validation_rmse.rmse), 'model']]

In [10]:
# modell og generaliseringsfeil
print('Test RMSE:', mean_squared_error(y_test, best_model.predict(X_test), squared=False))
print(best_model)

Test RMSE: 42621.51312277594
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['LotFrontage', 'LotArea',
                                                   'OverallQual', 'YrSold']),
                                                 ('cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                

In [11]:
# lagre model
pickle.dump(best_model, open('model.pkl', 'wb'))