In [1]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 
from sklearn.linear_model import Lasso

from sklearn.metrics import mean_squared_error 
from math import sqrt 

import joblib

# Show every column when a DataFrame is displayed (the table used here has 81 columns).
# Uses the public `pd.set_option` entry point rather than going through `pd.pandas`.
pd.set_option('display.max_columns', None)

In [2]:
# Read the raw house-price table and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../data/houseprice.csv')
data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split reproducible.
# The whole frame (SalePrice included) is passed as X, with SalePrice also split as y.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data['SalePrice'],
    test_size=0.1,
    random_state=0,
)

for label, frame in (("X train shape", X_train), ("X test shape", X_test)):
    print(label, frame.shape)

X train shape (1314, 81)
X test shape (146, 81)


In [4]:
# Load the pre-selected feature names (first data column of the CSV)
# and append LotFrontage to that list by hand.
selected = pd.read_csv('../data/selected_features.csv', index_col=0)
features = selected.iloc[:, 0].tolist() + ['LotFrontage']
print("Number of features selected: ", len(features))

Number of features selected:  23


In [5]:
# Display the final list of feature names used by the rest of the notebook.
features

['MSSubClass',
 'MSZoning',
 'Neighborhood',
 'OverallQual',
 'OverallCond',
 'YearRemodAdd',
 'RoofStyle',
 'MasVnrType',
 'BsmtQual',
 'BsmtExposure',
 'HeatingQC',
 'CentralAir',
 '1stFlrSF',
 'GrLivArea',
 'BsmtFullBath',
 'KitchenQual',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageCars',
 'PavedDrive',
 'LotFrontage']

In [6]:
# Categorical (object-dtype) features with at least one missing value in the training set.
vars_with_na = [
    var
    for var in features
    if X_train[var].dtypes == 'O' and X_train[var].isnull().any()
]

In [7]:
# Fraction of missing values per categorical feature, rounded to three decimals.
for col in vars_with_na:
    missing_share = X_train[col].isnull().mean()
    print(col, " % missing: ", np.round(missing_share, 3))

MasVnrType  % missing:  0.005
BsmtQual  % missing:  0.024
BsmtExposure  % missing:  0.025
FireplaceQu  % missing:  0.473
GarageType  % missing:  0.056
GarageFinish  % missing:  0.056


In [8]:
# NOTE(review): `col` is not assigned in this cell; it is the leftover loop variable of the
# preceding `for col in vars_with_na` loop, so this prints only the last feature iterated
# there (rounded to two decimals). Looks like a scratch cell; confirm it is still needed.
print(col, " % missing: ", np.round(X_train[col].isnull().mean(), 2))

GarageFinish  % missing:  0.06


In [9]:
def fill_categorical_na(df, var_list):
    """Return a copy of ``df`` with NaNs in the given columns replaced by "missing".

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    var_list : list
        Column labels whose missing values are replaced by the string "missing".

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected columns filled.
    """
    replacements = df[var_list].fillna("missing")
    result = df.copy()
    result[var_list] = replacements
    return result
# Apply the same "missing" label to the train and test frames.
X_train, X_test = (
    fill_categorical_na(part, vars_with_na) for part in (X_train, X_test)
)

In [10]:
# Count the remaining missing values per imputed categorical column in the training set.
X_train[vars_with_na].isnull().sum()

MasVnrType      0
BsmtQual        0
BsmtExposure    0
FireplaceQu     0
GarageType      0
GarageFinish    0
dtype: int64

In [11]:
# Numeric (non-object) features with at least one missing value in the training set,
# followed by the fraction of missing rows for each of them.
vars_with_na = [
    var
    for var in features
    if X_train[var].dtype != 'O' and X_train[var].isnull().any()
]

for var in vars_with_na:
    missing_share = X_train[var].isnull().mean()
    print(var, " % missing ", missing_share)

LotFrontage  % missing  0.17732115677321156


In [12]:
# Most frequent training value per numeric feature with missing data.
mode_var_dict = {var: X_train[var].mode()[0] for var in vars_with_na}

# Impute both splits with the value learned from the training set only.
for var, mode in mode_var_dict.items():
    X_train[var] = X_train[var].fillna(mode)
    X_test[var] = X_test[var].fillna(mode)

# Persist the imputation values. A dict is saved by np.save as a pickled object
# array, so reading it back needs np.load(..., allow_pickle=True).
np.save('../output/mode_var_dict.npy', mode_var_dict)

X_train[vars_with_na].isnull().sum()



LotFrontage    0
dtype: int64

In [13]:
# Display the imputation value stored for each numeric feature.
mode_var_dict

{'LotFrontage': 60.0}

In [14]:
def elapsed_years(df, var):
    """Return a copy of ``df`` where column ``var`` holds ``YrSold - var``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a 'YrSold' column and the column ``var``.
        It is not modified.
    var : str
        Name of the year column to convert into a difference from 'YrSold'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``var`` overwritten by the difference.
    """
    years_since = df['YrSold'] - df[var]
    result = df.copy()
    result[var] = years_since
    return result

In [15]:
# Replace the remodel year by the number of years between remodel and sale, in both splits.
X_train, X_test = (
    elapsed_years(part, 'YearRemodAdd') for part in (X_train, X_test)
)

In [17]:
# log transform some variables 



In [18]:
# Candidates for a log transform: numeric columns with no non-positive values and
# more than 20 distinct values in the training set.
# The positivity test checks for the absence of any value <= 0. Summing the
# non-positive values and comparing with 0 would also accept columns that contain
# zeros, and log(0) is -inf.
[
    f
    for f in features + ['SalePrice']
    if X_train[f].dtype != 'O'
    and not (X_train[f] <= 0).any()
    and X_train[f].nunique() > 20
]

['1stFlrSF', 'GrLivArea', 'LotFrontage', 'SalePrice']

In [19]:
# Natural-log transform of the skewed positive columns (target included), in both splits.
log_vars = ['1stFlrSF', 'GrLivArea', 'LotFrontage', 'SalePrice']
for frame in (X_train, X_test):
    for var in log_vars:
        frame[var] = np.log(frame[var])

In [20]:
def find_frequent_variables(df, var, rare_percent):
    """Return the categories of ``df[var]`` whose relative frequency exceeds a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``var``. It is only read, never modified,
        so no defensive copy is taken.
    var : str
        Name of the categorical column.
    rare_percent : float
        Fraction (0-1). Categories with a share strictly greater than this
        value are kept.

    Returns
    -------
    pandas.Index
        Labels of the frequent categories, in ``value_counts`` order.
    """
    shares = df[var].value_counts(normalize=True)
    is_frequent = shares > rare_percent
    return shares.loc[is_frequent].index

In [21]:
# Selected features stored with object dtype, i.e. the categorical ones.
cat_vars = [
    name
    for name in features
    if X_train[name].dtype == 'O'
]

In [22]:
# For every categorical feature, keep the categories that cover more than 1% of the
# training rows and relabel everything else as "Rare" in both splits.
frequent_var_dict = {}
for var in cat_vars:
    frequent_labels = find_frequent_variables(X_train, var, 0.01)
    frequent_var_dict[var] = frequent_labels

    for frame in (X_train, X_test):
        frame[var] = np.where(frame[var].isin(frequent_labels), frame[var], "Rare")

# Persist the per-feature lists of frequent categories.
np.save("../output/frequent_vars_dict.npy", frequent_var_dict)

In [23]:
# Display the frequent categories kept per categorical feature.
frequent_var_dict

{'MSZoning': Index(['RL', 'RM', 'FV', 'RH'], dtype='object'),
 'Neighborhood': Index(['NAmes', 'CollgCr', 'OldTown', 'Edwards', 'Somerst', 'NridgHt',
        'Gilbert', 'Sawyer', 'NWAmes', 'BrkSide', 'SawyerW', 'Crawfor',
        'Mitchel', 'Timber', 'NoRidge', 'IDOTRR', 'ClearCr', 'SWISU', 'StoneBr',
        'Blmngtn', 'MeadowV', 'BrDale'],
       dtype='object'),
 'RoofStyle': Index(['Gable', 'Hip'], dtype='object'),
 'MasVnrType': Index(['None', 'BrkFace', 'Stone'], dtype='object'),
 'BsmtQual': Index(['TA', 'Gd', 'Ex', 'Fa', 'missing'], dtype='object'),
 'BsmtExposure': Index(['No', 'Av', 'Gd', 'Mn', 'missing'], dtype='object'),
 'HeatingQC': Index(['Ex', 'TA', 'Gd', 'Fa'], dtype='object'),
 'CentralAir': Index(['Y', 'N'], dtype='object'),
 'KitchenQual': Index(['TA', 'Gd', 'Ex', 'Fa'], dtype='object'),
 'FireplaceQu': Index(['missing', 'Gd', 'TA', 'Fa', 'Ex', 'Po'], dtype='object'),
 'GarageType': Index(['Attchd', 'Detchd', 'BuiltIn', 'missing', 'Basment'], dtype='object'),
 'Gara

In [24]:
def replace_categories(train, test, var, target):
    """Encode a categorical column as integers ordered by the mean of ``target``.

    The mapping is learned from ``train`` only: categories are ranked by their
    mean ``target`` value in ascending order and replaced by that rank (0, 1, ...).
    The same mapping is then applied to ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing column ``var``; ``train`` must also contain ``target``.
        Neither input is modified.
    var : str
        Name of the categorical column to encode.
    target : str
        Name of the column whose per-category mean defines the ordering.

    Returns
    -------
    tuple
        ``(ordinal_label, train, test)``: the category -> rank dict and the
        encoded copies of both frames.
    """
    train = train.copy()
    test = test.copy()

    # Use the caller-supplied target column instead of a hard-coded name.
    ordered_labels = train.groupby(var)[target].mean().sort_values().index
    ordinal_label = {label: rank for rank, label in enumerate(ordered_labels)}

    train[var] = train[var].map(ordinal_label)
    # Categories that appear only in ``test`` are absent from the mapping and become NaN.
    test[var] = test[var].map(ordinal_label)

    return ordinal_label, train, test

    

In [25]:
# Encode each categorical feature by target-ordered integer labels and remember
# the mapping used for every feature.
ordinal_label_dict = dict()
for var in cat_vars:
    ordinal_label_dict[var], X_train, X_test = replace_categories(
        X_train, X_test, var, 'SalePrice'
    )

In [26]:
 
# Persist the category -> integer mappings learned on the training set.
np.save(file='../output/ordinal_label_dict.npy', arr=ordinal_label_dict)

In [27]:
# Feature scaling: first take the targets from the processed frames, where
# SalePrice has already been log-transformed.
y_train, y_test = X_train["SalePrice"], X_test['SalePrice']



In [28]:
# Fit the min-max scaler on the training features only, then persist it.
scaler = MinMaxScaler().fit(X_train[features])

joblib.dump(scaler, '../output/scaler.pkl')

['../output/scaler.pkl']

In [29]:
# Scale both splits with the scaler fitted on the training data.
# scaler.transform returns a bare array, so the DataFrame is rebuilt with the
# original row index; without it the frames get a fresh 0..n-1 index and no longer
# align by label with y_train / y_test, which keep the shuffled split index.
X_train = pd.DataFrame(
    scaler.transform(X_train[features]), columns=features, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test[features]), columns=features, index=X_test.index
)

In [30]:
# Fit a Lasso regression on the scaled features against the log-transformed price.
lin_model = Lasso(random_state=0, alpha=0.005)
lin_model.fit(X=X_train, y=y_train)


Lasso(alpha=0.005, random_state=0)

In [31]:
# Persist the fitted model next to the other pipeline artefacts.
joblib.dump(value=lin_model, filename='../output/lasso_linmodel.pkl')


['../output/lasso_linmodel.pkl']

In [42]:
# Evaluate on the training split. Targets and predictions are on the log scale,
# so both go through exp() to report the errors in the original price units.
pred = lin_model.predict(X_train)
print(sum(pred))

train_mse = mean_squared_error(np.exp(y_train), np.exp(pred))
print("Train rmse: {}".format(sqrt(train_mse)))
print("Train mse: {}".format(train_mse))

# Same metrics on the held-out split.
pred = lin_model.predict(X_test)
print(sum(pred))

test_mse = mean_squared_error(np.exp(y_test), np.exp(pred))
print("Test rmse: {}".format(sqrt(test_mse)))
print("Test mse: {}".format(test_mse))



15799.641915788652
Train rmse: 32976.28565259362
Train mse: 1087435415.441452
1754.8343176222397
Test rmse: 37486.79170400692
Test mse: 1405259552.2596023


In [39]:
# Sanity check: sum of the log-scale training targets, for comparison with the
# sum of the training predictions printed by the evaluation cell.
y_train.sum(axis=0)

15799.641915788645

In [38]:
# Sanity check: per-column sums of the scaled test features.
X_test.sum(axis=0)

MSSubClass       31.205882
MSZoning        101.500000
Neighborhood     72.181818
OverallQual      81.777778
OverallCond      81.125000
YearRemodAdd     55.573770
RoofStyle        25.500000
MasVnrType       45.000000
BsmtQual         93.750000
BsmtExposure     60.000000
HeatingQC       113.750000
CentralAir      141.000000
1stFlrSF         64.694725
GrLivArea        73.343935
BsmtFullBath     22.666667
KitchenQual      75.333333
Fireplaces       28.666667
FireplaceQu      68.200000
GarageType       94.000000
GarageFinish     83.333333
GarageCars       65.750000
PavedDrive      141.500000
LotFrontage      62.160249
dtype: float64

In [40]:
# Sanity check: sum of the log-scale test targets, for comparison with the
# sum of the test predictions printed by the evaluation cell.
y_test.sum(axis=0)

1755.4723998310549