# Data Preprocessing

In [628]:
import numpy as np
import pandas as pd

In [629]:
# Load the train and test CSVs, using the first column of each file as the index.
# `DataFrame.from_csv` was deprecated and later removed from pandas; `read_csv`
# with `index_col=0` is the supported replacement.
# NOTE(review): `from_csv` also attempted to parse the index as dates. That is
# not replicated here on the assumption that the index holds row ids -- confirm.
training_data = pd.read_csv('train.csv', index_col=0)
test_data = pd.read_csv('test.csv', index_col=0)
# Stack the training rows on top of the test rows so that every preprocessing
# step below is applied to both sets with the same columns.
df = pd.concat([training_data, test_data])
# Positional bounds of the training rows inside the combined frame.
training_starting_idx = 0
training_ending_idx = len(training_data)

In [630]:
# Name of the column to predict; every other column of the combined frame is
# treated as a candidate feature.
target = 'SalePrice'
features = list(filter(lambda column: column != target, df.columns))

Handling missing values

In [631]:
# Number of missing entries in each column of the combined train+test frame.
missing_values_stats = df.isnull().sum(axis=0)

In [632]:
# Show only the columns that have at least one missing value.
missing_values_stats.loc[missing_values_stats.ne(0)]

Alley           2721
BsmtCond          82
BsmtExposure      82
BsmtFinSF1         1
BsmtFinSF2         1
BsmtFinType1      79
BsmtFinType2      80
BsmtFullBath       2
BsmtHalfBath       2
BsmtQual          81
BsmtUnfSF          1
Electrical         1
Exterior1st        1
Exterior2nd        1
Fence           2348
FireplaceQu     1420
Functional         2
GarageArea         1
GarageCars         1
GarageCond       159
GarageFinish     159
GarageQual       159
GarageType       157
GarageYrBlt      159
KitchenQual        1
LotFrontage      486
MSZoning           4
MasVnrArea        23
MasVnrType        24
MiscFeature     2814
PoolQC          2909
SalePrice       1459
SaleType           1
TotalBsmtSF        1
Utilities          2
dtype: int64

In [633]:
# Features with more than 1000 missing values across the combined frame are
# collected for removal (order follows `features`).
null_counts = df[features].isnull().sum()
drop_columns = list(null_counts[null_counts > 1000].index)

In [634]:
# Remove the columns collected in `drop_columns`. The axis is passed by keyword
# because current pandas no longer accepts it as a positional argument, and the
# result is rebound to `df` rather than mutating the frame in place.
df = df.drop(drop_columns, axis=1)

In [635]:
# Keep the feature list in sync with the frame: forget the dropped columns.
features = [name for name in features if name not in drop_columns]

In [636]:
# Split the features into categorical and numerical ones.
#
# A feature is treated as categorical when, on the training rows, the ratio of
# distinct values to non-missing values is below 5%. Non-numeric columns are
# always treated as categorical: casting them to float (the numerical branch)
# would raise, so they must not reach it regardless of their cardinality.
numerical_features = []
categorical_features = []
# Slice the training rows once instead of twice per feature.
train_rows = df[training_starting_idx: training_ending_idx]
for var in features:
    train_column = train_rows[var]
    unique_ratio = 1. * train_column.nunique() / train_column.count()
    is_numeric = pd.api.types.is_numeric_dtype(train_column)
    if not is_numeric or unique_ratio < 0.05:
        categorical_features.append(var)
        df[var] = df[var].astype('category')
    else:
        numerical_features.append(var)
        df[var] = df[var].astype(float)

In [637]:
# Fill missing numerical values with the column mean.
#
# `sklearn.preprocessing.Imputer` was removed from scikit-learn; `SimpleImputer`
# is its replacement (it always works column-wise and treats NaN as missing by
# default, matching the previous `missing_values='NaN', axis=0` settings).
from sklearn.impute import SimpleImputer
imr = SimpleImputer(strategy='mean')
# The means are learned from the training rows only, then applied to all rows.
imr = imr.fit(df[training_starting_idx: training_ending_idx][numerical_features])
# Pass a DataFrame to `transform` as well, so fit and transform see the same
# kind of input (fitting on a frame and transforming a bare array triggers a
# feature-name warning in recent scikit-learn).
df[numerical_features] = imr.transform(df[numerical_features])

In [638]:
# Fill missing categorical values with the most frequent value observed for
# that feature in the training rows.
for var in categorical_features:
    train_column = df[var].iloc[training_starting_idx:training_ending_idx]
    most_frequent = train_column.value_counts().idxmax()
    df[var] = df[var].fillna(most_frequent)

In [639]:
# Sanity check: no feature may contain a missing value after imputation.
assert not df[features].isnull().values.any()

Handling categorical variables

In [640]:
# Replace each categorical column with one indicator column per category.
df = pd.get_dummies(data=df)

Handling skewness

In [641]:
from scipy.stats import skew

In [642]:
# Skewness of every numerical feature, computed over all rows of the combined
# frame (missing values, if any, are ignored).
feature_skewness = df[numerical_features].apply(lambda column: skew(column.dropna()))
# Names of the features whose skewness exceeds 0.75.
skewed_feats = feature_skewness[feature_skewness > 0.75].index

In [644]:
# Compress the skewed features with log(1 + x).
df[skewed_feats] = df[skewed_feats].apply(np.log1p)

Scaling numerical features

In [645]:
from sklearn.preprocessing import StandardScaler

In [646]:
# Scaler that centres each feature and scales it to unit variance; the
# arguments spell out the library defaults.
stdsc = StandardScaler(copy=True, with_mean=True, with_std=True)

In [647]:
# Learn the scaling statistics from the training rows only.
stdsc.fit(df[numerical_features].iloc[training_starting_idx:training_ending_idx])

StandardScaler(copy=True, with_mean=True, with_std=True)

In [648]:
# Apply the training-set scaling to the numerical features of every row.
df[numerical_features] = stdsc.transform(df.loc[:, numerical_features])

Fit a Lasso model

In [651]:
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score

In [652]:
def rmse_cv(model, X=None, y=None, cv=5):
    """Return the cross-validated root-mean-squared error of `model`.

    The original body read `X_train` and `y`, names this notebook never
    defines, so any call raised NameError. The data can now be passed
    explicitly and otherwise falls back to the notebook's training set.

    Parameters
    ----------
    model : estimator
        Estimator handed to `cross_val_score`.
    X : optional
        Feature matrix; defaults to the notebook-level `train_X`.
    y : optional
        Target values; defaults to the notebook-level `train_Y`.
    cv : int, optional
        Number of cross-validation folds (default 5, as before).

    Returns
    -------
    Array with one RMSE value per fold.
    """
    # Resolved at call time, so the function may be defined before the
    # training matrices are built.
    if X is None:
        X = train_X
    if y is None:
        y = train_Y
    # The scorer reports negated MSE; flip the sign before taking the root.
    neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=cv)
    return np.sqrt(-neg_mse)

In [653]:
# Every column of the encoded frame except the target is a model input.
training_features = [column for column in df.columns if column != 'SalePrice']

In [654]:
# Model inputs: the training rows of the feature columns.
train_X = df[training_features].iloc[training_starting_idx:training_ending_idx]
# Model target: log(1 + SalePrice) of the training rows.
train_Y = np.log1p(df['SalePrice'].iloc[training_starting_idx:training_ending_idx])

In [655]:
# Lasso regression whose regularisation strength is chosen by cross-validation
# among the listed candidates.
model_lasso = LassoCV(
    max_iter=100000,
    alphas=[100, 10, 1, 0.1, 0.001, 0.0005],
)
model_lasso = model_lasso.fit(train_X, train_Y)

In [658]:
# Model inputs for the test rows, i.e. everything after the training rows.
test_X = df[training_features].iloc[training_ending_idx:]

In [659]:
# The model was trained on log1p(SalePrice), so map its output back to the
# original scale. `expm1` is the exact inverse of `log1p` and avoids the
# precision loss of computing exp(x) and subtracting 1 separately.
predictions = np.expm1(model_lasso.predict(test_X))

In [660]:
# Label each prediction with the index of the test row it was computed from.
# Reusing `test_X.index` keeps predictions aligned with their rows even when
# the ids are not a gap-free run starting right after the training rows, which
# is what the previous arithmetic reconstruction silently assumed.
predictions = pd.Series(predictions, index=test_X.index)

In [661]:
# Write the predictions as a one-column table named after the target.
# `Series.to_csv` has changed its default for writing a header row between
# pandas versions and would leave the value column unnamed; going through a
# named frame makes the file layout explicit and version-independent.
predictions.to_frame('SalePrice').to_csv('better_lasso.csv')