In [None]:
import sys
# Path from the notebook's working directory to the repository root; it is put
# on sys.path so that project packages can be imported in the cells below.
relative_root = "../.."
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if relative_root not in sys.path:
    sys.path.append(relative_root)

In [None]:
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import json
from examples.training import house_prices_config
from scipy.stats import skew
from pathlib import Path

In [None]:
# Load the raw training set from the location configured in house_prices_config.
raw_train_path = Path(relative_root) / house_prices_config.LOCAL_RAW_TRAIN_FILENAME
train = pd.read_csv(raw_train_path)

In [None]:
# Log-transform the target.
# NOTE: `train` is modified in place, so re-running this cell without reloading
# the data applies the transforms a second time.
train["SalePrice"] = np.log1p(train["SalePrice"])

# Log-transform skewed numeric features.
# Only truly numeric columns are considered (string columns would break `skew`),
# and the target is excluded because it has already been transformed above and
# must never be log-transformed twice.
numeric_feats = train.select_dtypes(include="number").columns.drop("SalePrice")

skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]  # keep strongly right-skewed columns
skewed_feats = skewed_feats.index

train[skewed_feats] = np.log1p(train[skewed_feats])

In [None]:
# One-hot encode the categorical columns (pandas selects them by dtype).
train = pd.get_dummies(data=train)

In [None]:
# Impute missing values with the mean of their column.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split performed further down.
column_means = train.mean()
train = train.fillna(value=column_means)

In [None]:
# Build the sklearn inputs: every column except the target is a feature, and
# 33% of the rows are held out for testing (fixed seed for reproducibility).
features = train.columns.drop("SalePrice").tolist()
X_train, X_test, y_train, y_test = train_test_split(
    train[features], train["SalePrice"], test_size=0.33, random_state=88
)

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [None]:
# Configure the ridge regressor, then fit it on the training split
# (`fit` returns the estimator itself, so `model_ridge` is the fitted model).
ridge_estimator = Ridge(alpha=10, random_state=88)
model_ridge = ridge_estimator.fit(X_train, y_train)

In [None]:
# Predict the (log-transformed) sale price for the held-out rows.
predictions = model_ridge.predict(X=X_test)

In [None]:
# RMSE on the held-out split. The target is log1p(SalePrice), so this is an
# error in log space.
# The square root is taken explicitly because the `squared=False` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6; arguments are passed in
# the documented (y_true, y_pred) order.
np.sqrt(mean_squared_error(y_test, predictions))

## Init trubrics context

In [None]:
from trubrics.context import DataContext

In [None]:
# Re-attach the target column to each split so that every frame handed to the
# DataContext holds the features together with "SalePrice".
training_frame = X_train.assign(SalePrice=y_train)
testing_frame = X_test.assign(SalePrice=y_test)

data_context = DataContext(
    name="house_prices_dataset",
    version=0.1,
    training_data=training_frame,
    testing_data=testing_frame,
    # The first rows of the test frame serve as the minimum functionality sample.
    minimum_functionality_data=testing_frame.head(),
    target="SalePrice",
)

## Init trubrics validator

In [None]:
from trubrics.validations import ModelValidator

In [None]:
# Bind the fitted ridge model and the data context into a single validator.
model_validator = ModelValidator(
    data=data_context,
    model=model_ridge,
)

In [None]:
# Validate the "neg_root_mean_squared_error" metric against a threshold of -0.15
# and display the validation result as a dictionary.
# NOTE(review): presumably evaluated on the context's testing data - confirm in the trubrics docs.
performance_validation = model_validator.validate_performance_against_threshold(
    metric="neg_root_mean_squared_error",
    threshold=-0.15,
)
performance_validation.dict()

In [None]:
# Run the minimum functionality range validation with an inclusive range value of 0.1.
# NOTE(review): presumably this uses the context's minimum_functionality_data - confirm in the trubrics docs.
model_validator.validate_minimum_functionality_in_range(
    range_value=0.1,
    range_inclusive=True,
)