In [None]:
# Move the working directory up two levels so that the relative paths used
# below (e.g. 'data/house-prices/') resolve -- presumably to the project root;
# confirm against the repository layout.
# NOTE(review): the hop is relative, so re-running this cell in the same
# kernel moves up two more levels; restart the kernel before a full re-run.
%cd ../../

In [None]:
import os

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MaxAbsScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Let pandas display up to 36 columns before truncating wide frames.
pd.set_option('display.max_columns', 36)

In [None]:
# Fetch the dataset only when the data directory is absent.
# NOTE(review): the guard checks the directory, not 'train.csv' itself, so a
# partially populated directory skips the download -- confirm that is acceptable.
if not os.path.exists('data/house-prices/'):
    # Shell escape: runs the project's `download_house_prices` make target.
    # NOTE(review): `-y` does not look like a standard GNU make option --
    # confirm the flag is accepted by the make in use.
    !make -j -y download_house_prices

In [None]:
# Load the training CSV into a DataFrame and preview its first rows.
data = pd.read_csv('data/house-prices/train.csv')
data.head()

In [None]:
# Summary statistics of the loaded frame.
data.describe()

In [None]:
# Separate the prediction target ('SalePrice') from the predictor columns.
data_target = data['SalePrice']
data_features = data.drop(columns=['SalePrice'])

In [None]:
def _ordered_levels(column):
    """Return the distinct non-missing values of ``column`` in sorted order.

    When the column contains missing values, a single NaN is appended as the
    last element, so the list stays deterministic and keeps NaN in the final
    position when it is handed to ``OneHotEncoder(categories=...)``.
    """
    levels = sorted(column.dropna().unique())
    if column.isna().any():
        levels.append(np.nan)
    return levels


# Build the mask from the feature frame's own dtypes. Using `data.dtypes`
# (which carries an extra 'SalePrice' entry) only works through index alignment.
categoric_features = data_features.dtypes[data_features.dtypes == 'object'].index

# One explicit level list per categorical column, in column order.
# Sorting (instead of iterating a set) keeps the one-hot column layout
# identical across kernel restarts; building the Series from a dict keeps it a
# Series of lists even if every column has the same number of levels.
categories = pd.Series(
    {name: _ordered_levels(data_features[name]) for name in categoric_features},
    dtype=object,
)

In [None]:
# Every non-object column is treated as numeric. The mask is taken from the
# feature frame itself rather than from `data` (which has an extra
# 'SalePrice' entry), so it no longer depends on index alignment.
# NOTE(review): identifier-like numeric columns, if any, are included too --
# confirm that is intended.
numeric_features = data_features.dtypes[data_features.dtypes != 'object'].index

In [None]:
# Hold out part of the data for evaluation; the split is seeded with
# random_state=1.
X_train, X_test, y_train, y_test = train_test_split(
    data_features,
    data_target,
    random_state=1,
)

## Making a pipeline

In [None]:
# Categorical branch: one-hot encode using the explicit per-column level
# lists collected in `categories`.
categorical_transformer = Pipeline(steps=[
    (
        'one-hot',
        OneHotEncoder(categories=list(categories)),
    ),
])

In [None]:
# Numeric branch: impute missing values first, then rescale.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),  # default imputation settings
    ('scaler', MinMaxScaler()),    # default scaling settings
])

In [None]:
# Route each column group through its own transformer branch:
# 'cat' handles the object-typed columns, 'num' handles the rest.
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categoric_features),
    ('num', numerical_transformer, numeric_features),
])

In [None]:
def get_model_pipeline(model, *args, **kwargs):
    """Build a preprocessing + estimator pipeline for ``model``.

    Parameters
    ----------
    model : callable
        Estimator class (or any factory); invoked as ``model(*args, **kwargs)``.
    *args, **kwargs
        Forwarded unchanged to ``model``.

    Returns
    -------
    Pipeline
        Unfitted two-step pipeline: ``'preprocessing'`` followed by ``'model'``.

    Notes
    -----
    Each pipeline receives its own unfitted copy of the module-level
    ``preprocessor``. Sharing one instance would make fitting any pipeline
    refit the transformer embedded in every other pipeline built here.
    """
    # Local import keeps this cell self-contained.
    from sklearn.base import clone

    return Pipeline([
        ('preprocessing', clone(preprocessor)),
        ('model', model(*args, **kwargs)),
    ])

## Trying different models

In [None]:
# Random forest, seeded with random_state=1.
random_forest_regressor_pipeline = get_model_pipeline(
    RandomForestRegressor,
    random_state=1,
)
# Fit on the training split, then display the score on the held-out split.
random_forest_regressor_pipeline.fit(X_train, y_train)
random_forest_regressor_pipeline.score(X_test, y_test)

In [None]:
# k-nearest-neighbours regressor with 13 neighbours.
knn_regressor_model = get_model_pipeline(
    KNeighborsRegressor,
    n_neighbors=13,
)
# Fit on the training split, then display the score on the held-out split.
knn_regressor_model.fit(X_train, y_train)
knn_regressor_model.score(X_test, y_test)

In [None]:
# Linear regression with the estimator's default settings.
linear_regression_pipeline = get_model_pipeline(
    LinearRegression,
)
# Fit on the training split, then display the score on the held-out split.
linear_regression_pipeline.fit(X_train, y_train)
linear_regression_pipeline.score(X_test, y_test)