<h2>Import Library</h2>

In [1]:
import csv
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.stats import norm, skew
from scipy.special import boxcox1p
from sklearn import linear_model
from sklearn.metrics import mean_squared_error


<h2>Add data</h2>

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train_ID = train['Id']
test_ID = test['Id']
train.drop("Id", axis = 1, inplace = True)
test.drop("Id", axis = 1, inplace = True)


In [3]:
#Testing Testing 123
train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index)
train["SalePrice"] = np.log1p(train["SalePrice"])

ntrain = train.shape[0]
ntest = test.shape[0]
y_train = train.SalePrice.values
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data.drop(['SalePrice'], axis=1, inplace=True)
print("all_data size is : {}".format(all_data.shape))

all_data size is : (2917, 79)


In [None]:
train.shape

In [None]:
train.head()

<h2>Data Cleaning</h2>

In [None]:
missing_data = (train.isnull().sum()/len(train))
missing_data = missing_data.drop(missing_data[missing_data==0].index).sort_values(ascending=False)
print(missing_data)

<h3>NA handling</h4>

In [None]:
fill_nones = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu','GarageYrBlt', 
              'GarageType', 'GarageFinish','GarageQual', 'GarageCond', 'BsmtFinType2', 
              'BsmtExposure', 'BsmtFinType1', 'BsmtCond', 'BsmtQual','MasVnrType']
for fill_none in fill_nones:
    train[fill_none]=train[fill_none].fillna('None')
train['LotFrontage'] = train.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
train['MasVnrArea'] = train['MasVnrArea'].fillna(0)
train = train.dropna(axis=0, how='any')

In [None]:
missing_data = (train.isnull().sum()/len(train))
missing_data = missing_data.drop(missing_data[missing_data==0].index).sort_values(ascending=False)
print(missing_data)

<h3>Categorical Data handling</h3>

In [None]:
'''
num_to_str = ["MSSubClass","MSZoning", "BsmtFullBath","BsmtHalfBath",
              "HalfBath","BedroomAbvGr","KitchenAbvGr", "Utilities", "LotConfig",
             "Neighborhood","Condition1", "Condition2", "BldgType", "HouseStyle",
             "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType",
             "Foundation", "Heating", "Electrical", 'GarageYrBlt']
for col in num_to_str:
    train[col]=train[col].astype(str)
'''

In [4]:
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)

Unnamed: 0,Missing Ratio
PoolQC,99.691464
MiscFeature,96.400411
Alley,93.212204
Fence,80.425094
FireplaceQu,48.680151
LotFrontage,16.660953
GarageQual,5.450806
GarageCond,5.450806
GarageFinish,5.450806
GarageYrBlt,5.450806


In [5]:
###START-------Testing testing 123####
all_data["PoolQC"] = all_data["PoolQC"].fillna("None")
all_data["MiscFeature"] = all_data["MiscFeature"].fillna("None")

all_data["Alley"] = all_data["Alley"].fillna("None")
all_data["Fence"] = all_data["Fence"].fillna("None")
all_data["FireplaceQu"] = all_data["FireplaceQu"].fillna("None")

all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_data[col] = all_data[col].fillna('None')
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    all_data[col] = all_data[col].fillna(0)
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_data[col] = all_data[col].fillna(0)    
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_data[col] = all_data[col].fillna('None')    
all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None")
all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)    
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
try:
    all_data = all_data.drop(['Utilities'], axis=1)
except:
    pass
all_data["Functional"] = all_data["Functional"].fillna("Typ")
all_data['Electrical'] = all_data['Electrical'].fillna(train['Electrical'].mode()[0])
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(train['KitchenQual'].mode()[0])
all_data['Exterior1st'] = all_data['Exterior1st'].fillna(train['Exterior1st'].mode()[0])
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(train['Exterior2nd'].mode()[0])
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])
all_data['MSSubClass'] = all_data['MSSubClass'].fillna("None")


#MSSubClass=The building class
all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)


#Changing OverallCond into a categorical variable
all_data['OverallCond'] = all_data['OverallCond'].astype(str)


#Year and month sold are transformed into categorical features.
all_data['YrSold'] = all_data['YrSold'].astype(str)
all_data['MoSold'] = all_data['MoSold'].astype(str)

In [6]:
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)

Unnamed: 0,Missing Ratio


In [7]:
from sklearn.preprocessing import LabelEncoder
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')
# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(all_data[c].values)) 
    all_data[c] = lbl.transform(list(all_data[c].values))

# shape        
print('Shape all_data: {}'.format(all_data.shape))

Shape all_data: (2917, 78)


In [8]:
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)

Unnamed: 0,Missing Ratio


In [9]:
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(10)
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))


Skew in numerical features: 

There are 59 skewed numerical features to Box Cox transform


In [10]:
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)

Unnamed: 0,Missing Ratio


In [11]:
from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    #all_data[feat] += 1
    all_data[feat] = boxcox1p(all_data[feat], lam)
    
all_data = pd.get_dummies(all_data)
print(all_data.shape)

(2917, 220)


In [12]:
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)

Unnamed: 0,Missing Ratio


In [13]:
train = all_data[:ntrain]
test = all_data[ntrain:]

In [14]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

In [15]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [18]:
lasso = make_pipeline(Lasso(alpha =0.0005, random_state=1))


In [19]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Lasso score: 0.1134 (0.0077)



<h3>Encode label for categorical data</h3>

In [None]:
col_cats = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir','OverallCond', 
        'YrSold', 'MoSold', 'GarageYrBlt']

for col_cat in col_cats:
    lab = LabelEncoder()
    train[col_cat]=lab.fit_transform(all_data[col_cat])
train.shape

<h3>Feature Crafting</h3>

In [None]:
train['GarageAreaperCar'] = train['GarageArea']/train['GarageCars']
train.loc[train['GarageCars']==0,'GarageAreaperCar'] = 0 
train['TotalArea'] = train['LotArea'] + train['MasVnrArea'] + \
                    train['TotalBsmtSF'] + train['GrLivArea'] + \
                    train['GarageArea'] + train['WoodDeckSF'] +  \
                    train['OpenPorchSF'] + train['EnclosedPorch'] + \
                    train['PoolArea'] + train['3SsnPorch']

In [1]:
print("a", "b")

a b


In [None]:
missing_data = (train.isnull().sum()/len(train))
missing_data = missing_data.drop(missing_data[missing_data==0].index).sort_values(ascending=False)
print(missing_data)

<h3>Skewness Handling</h3>

In [None]:
numeric_feats = train.dtypes[train.dtypes != "object"].index
# Check the skew of all numerical features
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_feats})

In [None]:
skewness = skewness[abs(skewness) > 0.7]

skewed_features = skewness.index
for feat in skewed_features:
    if feat != 'SalePrice':
        train[feat] = boxcox1p(train[feat], 0.15)

In [None]:
train = pd.get_dummies(train)

<h3>Train-Test Split</h3>

In [None]:
train_sample, test_sample = train_test_split(train, test_size=0.3, random_state = 1)
X_train = train_sample.drop(['SalePrice'], axis=1)
X_test = test_sample.drop(['SalePrice'], axis=1)
y_train = train_sample['SalePrice']
y_test = test_sample['SalePrice']

<h2> Model Selection</h2>

In [None]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler

In [None]:
def rmsle(y_true, y_pred):
    return (np.square(np.log(y_pred)-np.log(y_true)).mean()) ** 0.5
def rmsle_cv(model, X, y, folds=5):
    return cross_val_score(model,X,y,scoring = scorer,cv=folds)
def scorer(model,X, y):
    return rmsle(model.predict(X), y)

In [None]:
lasso = linear_model.Lasso(alpha=1, max_iter =50000)
lasso.fit(X_train, y_train)
rmsle_cv(lasso, X_train, y_train).mean()

In [None]:
data = pd.DataFrame({"Feature Importance":lasso.coef_}, index=X_train.columns)
data.sort_values("Feature Importance", ascending=False)

In [None]:
rf = make_pipeline(RobustScaler(), RandomForestRegressor(max_features='auto'))

rmsle_cv(rf, X_train, y_train).mean()

In [None]:
reg = linear_model.Ridge(alpha=.5, normalize=True)
reg.fit(X_train, y_train)
rmsle_cv(reg,X_train, y_train).mean()

In [None]:
data = pd.DataFrame({"Feature Importance":reg.coef_}, index=X_train.columns)
data.sort_values("Feature Importance", ascending=False)[:20]

In [None]:
pca = PCA(n_components=300)
pca.fit(X_train)
X_train_pca =  pca.transform(X_train)
lasso.fit(X_train_pca, y_train)

In [None]:
rf.fit(X_train_pca, y_train)
rmsle(rf.predict(pca.transform(X_test)), y_test)

In [None]:
lasso.predict(X_test)

In [None]:
f, ax = plt.subplots(figsize=(15,12))
train_corr = train.corr()['SalePrice'].sort_values()
train_corr.plot.bar()
plt.show()
train_corr

In [None]:
attribute_of_interest_temp = train_corr[-25:].index
attribute_of_interest_temp2 = train_corr[:15].index
attribute_of_interest_temp = attribute_of_interest_temp.drop(['SalePrice'])
attribute_of_interest = list(attribute_of_interest_temp) + list(attribute_of_interest_temp2)


In [None]:
train_sample_selective, test_sample_selective = train_test_split(train, test_size=0.3, random_state = 1)
X_train = train_sample_selective.drop(['SalePrice'], axis=1)[attribute_of_interest]
X_test = test_sample_selective.drop(['SalePrice'], axis=1)[attribute_of_interest]
y_train = train_sample_selective['SalePrice']
y_test = test_sample_selective['SalePrice']

In [None]:
rf = RandomForestRegressor(max_features='auto')
rf.fit(X_train, y_train)
rmsle_cv(rf, X_train, y_train).mean()

In [None]:
f, ax = plt.subplots(figsize=(15,12))
sns.heatmap(train_subset.corr(), vmax=.8, square=True, annot=True)

In [None]:
remove_features = ['TotalBsmtSF', 'GarageCars', 'GarageYrBlt']
train_subset = train_subset.drop(remove_features, axis=1)
attribute_of_interest = [feature for feature in attribute_of_interest if feature not in remove_features]

In [None]:
train_subset_filtered = train_subset_filtered[train_subset_filtered['SalePrice']<600000]
train_subset_filtered = train_subset_filtered[train_subset_filtered['BsmtFinSF1']<2000]
train_subset_filtered = train_subset_filtered[train_subset_filtered['TotRmsAbvGrd']<13]
train_subset_filtered = train_subset_filtered[train_subset_filtered['1stFlrSF']<3000]
train_subset_filtered = train_subset_filtered[train_subset_filtered['MasVnrArea']<1200]

In [None]:
fig, axs = plt.subplots(ncols=1, nrows=len(attribute_of_interest)-1, figsize=(10, 100))
for i,index in enumerate(attribute_of_interest):
    if index!='SalePrice':
        if index == 'OverallQual':
            plt.xlim(0, 12)
        sns.regplot(train_subset_filtered[index], train_subset_filtered['SalePrice'], fit_reg=False,ax=axs[i])


In [None]:
X_train = train_subset_filtered.drop(['SalePrice'], axis=1)
y_train = train_subset_filtered['SalePrice']