In [1]:
# Import packages for later use
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as st
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
# define johnson transformation
def johnson(y):
    """Fit a Johnson SU distribution to ``y`` and transform ``y`` with the fit.

    The four values returned by ``st.johnsonsu.fit`` are unpacked, in order,
    as ``gamma``, ``eta``, ``epsilon`` and ``lbda``; the transformed values are
    ``gamma + eta * arcsinh((y - epsilon) / lbda)``.

    Parameters
    ----------
    y : array-like supporting elementwise arithmetic (e.g. ndarray, Series)
        Values to fit and transform.

    Returns
    -------
    tuple
        ``(yt, gamma, eta, epsilon, lbda)`` -- the transformed values followed
        by the fitted parameters, which ``johnson_inverse`` needs to undo the
        transform.
    """
    fitted = st.johnsonsu.fit(y)
    gamma, eta, epsilon, lbda = fitted
    # Standardise by location/scale first, then apply the arcsinh mapping.
    standardized = (y - epsilon) / lbda
    yt = eta * np.arcsinh(standardized) + gamma
    return yt, gamma, eta, epsilon, lbda

def johnson_inverse(y, gamma, eta, epsilon, lbda):
    """Undo the transform applied by ``johnson``.

    Computes ``lbda * sinh((y - gamma) / eta) + epsilon``, the algebraic
    inverse of ``gamma + eta * arcsinh((x - epsilon) / lbda)``.

    Parameters
    ----------
    y : scalar or array-like supporting elementwise arithmetic
        Values on the transformed scale.
    gamma, eta, epsilon, lbda : float
        Parameters as returned by ``johnson``.

    Returns
    -------
    Same shape as ``y``
        Values mapped back to the original scale.
    """
    unshifted = (y - gamma) / eta
    return epsilon + lbda * np.sinh(unshifted)

In [15]:
# Load the raw train / test tables
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

# Partition the training columns by dtype: 'object' columns are treated as
# categorical, every other column as numerical.
column_dtypes = train_data.dtypes
is_object = column_dtypes == 'object'
numerical_features = column_dtypes[~is_object].index.values
categorical_features = column_dtypes[is_object].index.values

# Columns excluded from the feature matrices built later
omit = ['SalePrice', 'Id', 'Training']
# Columns kept when writing a submission file
submit = ['SalePrice', 'Id']

In [4]:
# preprocess
#
# Builds `train_set` / `test_set` from the loaded frames: missing values are
# filled and categoricals are one-hot encoded on the *combined* table so both
# splits end up with the same dummy columns.
#
# Every step returns a new frame and rebinds the name instead of using
# `inplace=True`; in particular the categorical fill no longer mutates a
# column-selected copy in place (the chained-assignment pattern whose warning
# is silenced by the global warnings filter).

# Drop training rows with a missing 'Electrical' value and tag each frame with
# its origin so the combined table can be split apart again below.
train_data = train_data.dropna(subset=['Electrical']).assign(Training=1)
test_data = test_data.assign(Training=0)

# concat
all_data = pd.concat([train_data, test_data], ignore_index=True)

# fill categoricals: a missing value becomes the literal category 'None'
all_data[categorical_features] = all_data[categorical_features].fillna('None')

# fill every remaining missing value (the numerical columns) with 0
all_data = all_data.fillna(0)

# process categoricals: one-hot encode on the combined table
all_data = pd.get_dummies(data=all_data)

# split back into the original train / test rows via the origin flag
train_set = all_data.loc[all_data['Training'] == 1]
test_set = all_data.loc[all_data['Training'] == 0]


In [6]:
# Feature matrices: every column except those listed in `omit`
X = train_set[train_set.columns[~train_set.columns.isin(omit)]]
X_test = test_set[test_set.columns[~test_set.columns.isin(omit)]]

# Targets: raw SalePrice, its natural log, and its Johnson transform.
# The fitted Johnson parameters are kept so predictions can be mapped back
# with johnson_inverse.
y = train_set['SalePrice']
y_log = np.log(y)
y_j, gamma, eta, epsilon, lbda = johnson(y)

In [20]:
# Linear regression on log(SalePrice)
model_log = LinearRegression().fit(X, y_log)

# Predictions come out on the log scale; exponentiate to get prices back
y_log_pred = np.exp(model_log.predict(X_test))

# Attach the predictions to the test frame and write only the `submit` columns
test_data['SalePrice'] = y_log_pred
submission = test_data[test_data.columns[test_data.columns.isin(submit)]]
submission.to_csv('./data/teamJarvis_log.csv', index=False)

In [21]:
# Linear regression on the Johnson-transformed SalePrice
model_j = LinearRegression().fit(X, y_j)

# Map predictions back to the price scale using the parameters fitted on y
y_j_pred = johnson_inverse(model_j.predict(X_test), gamma, eta, epsilon, lbda)

# Attach the predictions to the test frame and write only the `submit` columns
test_data['SalePrice'] = y_j_pred
submission = test_data[test_data.columns[test_data.columns.isin(submit)]]
submission.to_csv('./data/teamJarvis_johnson.csv', index=False)