In [113]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns

# Getting to know the data

In [2]:
df_train = pd.read_csv('precos_imoveis.csv')
df_train['is_test'] = pd.Series(np.zeros(1460))
df_train.shape

(1460, 82)

In [3]:
df_train = df_train.sort_values(by='SalePrice')

In [4]:
y = np.log(df_train['SalePrice'])
del df_train['SalePrice']

In [5]:
df_test = pd.read_csv('test.csv')
df_test['is_test'] = pd.Series(np.zeros(1460)+1)
df_test.shape

(1459, 81)

In [6]:
df = pd.concat([df_train,df_test],sort=False)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,is_test
495,496,30,C (all),60.0,7879,Pave,,Reg,Lvl,AllPub,...,0,,GdWo,,0,11,2009,WD,Abnorml,0.0
916,917,20,C (all),50.0,9000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,10,2006,WD,Abnorml,0.0
968,969,50,RM,50.0,5925,Pave,,Reg,Lvl,AllPub,...,0,,GdWo,,0,5,2009,WD,Abnorml,0.0
533,534,20,RL,50.0,5000,Pave,,Reg,Low,AllPub,...,0,,,,0,1,2007,WD,Normal,0.0
30,31,70,C (all),50.0,8500,Pave,Pave,Reg,Lvl,AllPub,...,0,,MnPrv,,0,7,2008,WD,Normal,0.0


In [7]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [8]:
is_test = df.is_test

# Some feature engineering

In [9]:
#it is important to know the time from the last remod
df['LastRemod'] = df['YearRemodAdd'] - df['YearBuilt']

#total number of living square feets
df['UsableSF'] = df['1stFlrSF'] + df['2ndFlrSF']

df['Overall'] = df['OverallQual']*df['OverallCond']

In [10]:
#For sure 'Id' shouldn't be a feature and 'SalePrice' is what we want to predict
del df['Id'],df['is_test']

# I have to process the features

In [11]:
#piece of code that tries to get the discrete features
def get_discrete_features(df):
    
    discrete_features = []
    is_discrete = []
    d = df.dtypes.to_dict()
    for column in df.columns:
        #if the type of the column is a string
        if d[column] == 'O':
            discrete_features.append(column)
            is_discrete.append(True)
        #if all unique values are the only possible values
        elif (len(df[column].unique()) == df[column].max()) or (len(df[column].unique()) == df[column].max()-1):
            discrete_features.append(column)
            is_discrete.append(True)
        else:
            is_discrete.append(False)
            
    is_discrete = np.array(is_discrete)
    return discrete_features,is_discrete

In [12]:
discrete_features,is_discrete = get_discrete_features(df)
#I know MSSubClass is discrete and it fails my test
discrete_features.append('MSSubClass')
is_discrete[0] = True

In [13]:
#Looking at the continuous variables
df[df.columns[~is_discrete]].head()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,YrSold,LastRemod,UsableSF
495,60.0,7879,1920,1950,0.0,495.0,0.0,225.0,720.0,720,...,0,523,115,0,0,0,0,2009,30,720
916,50.0,9000,1949,1950,0.0,50.0,0.0,430.0,480.0,480,...,0,0,0,0,0,0,0,2006,1,480
968,50.0,5925,1910,1950,0.0,0.0,0.0,600.0,600.0,600,...,0,0,0,0,0,0,0,2009,40,968
533,50.0,5000,1946,1950,0.0,0.0,0.0,0.0,0.0,334,...,0,0,0,0,0,0,0,2007,4,334
30,50.0,8500,1920,1950,0.0,0.0,0.0,649.0,649.0,649,...,0,54,172,0,0,0,0,2008,30,1317


## First, dealing with the continuous variables

In [14]:
#I want to add a new column that says a given feature was NaN and substitute the NaN for the mean
for column in df.columns[~is_discrete]:
    has_na = df[column].isna().sum() > 0
    if has_na:
        x = df[column].values[~df[column].isna()]
        #replaces NaNs with the median
        df[column] = df[column].fillna(np.median(x))
        #adds a new column saying the value was missing
        df[column + '_missing'] = df[column].isna()

In [15]:
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [16]:
#I added eleven discrete columns
is_discrete=np.append(is_discrete,[True]*11)
discrete_features.extend(list(df.columns.values[-11:]))

## Now dealing with the discrete variables. First try to encode with dummy encoding

In [17]:
df_d = df[discrete_features]
df_d = df_d.fillna('NA')
df_d.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,MasVnrArea_missing,BsmtFinSF1_missing,BsmtFinSF2_missing,BsmtUnfSF_missing,TotalBsmtSF_missing,BsmtFullBath_missing,BsmtHalfBath_missing,GarageYrBlt_missing,GarageCars_missing,GarageArea_missing
495,C (all),Pave,,Reg,Lvl,AllPub,Inside,Gtl,IDOTRR,Norm,...,False,False,False,False,False,False,False,False,False,False
916,C (all),Pave,,Reg,Lvl,AllPub,Inside,Gtl,IDOTRR,Norm,...,False,False,False,False,False,False,False,False,False,False
968,RM,Pave,,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,...,False,False,False,False,False,False,False,False,False,False
533,RL,Pave,,Reg,Low,AllPub,Inside,Mod,BrkSide,Norm,...,False,False,False,False,False,False,False,False,False,False
30,C (all),Pave,Pave,Reg,Lvl,AllPub,Inside,Gtl,IDOTRR,Feedr,...,False,False,False,False,False,False,False,False,False,False


In [18]:
#Making the custom encoding in the apropriate features
df_d['Street']=df_d['Street'].map({'Pave':0,'Grvl':1})
df_d['Alley']=df_d['Alley'].map({'NA':0,'Pave':1,'Grvl':2})
df_d['LotShape']=df_d['LotShape'].map({'IR3':0,'IR2':1,'IR1':2,'Reg':3})
df_d['LandContour']=df_d['LandContour'].map({'Low':0,'HLS':1,'Bnk':2,'Lvl':3})
df_d['Utilities']=df_d['Utilities'].map({'NA':-1,'ELO':0,'NoSeWa':1,'NoSewr':2,'AllPub':3})
df_d['LandSlope']=df_d['LandSlope'].map({'Sev':0,'Mod':1,'Gtl':2})
df_d['ExterQual']=df_d['ExterQual'].map({'Po':0,'Fa':1,'TA':2,'Gd':3,'Ex':4})
df_d['ExterCond']=df_d['ExterCond'].map({'Po':0,'Fa':1,'TA':2,'Gd':3,'Ex':4})
df_d['BsmtQual']=df_d['BsmtQual'].map({'NA':-1,'Po':0,'Fa':1,'TA':2,'Gd':3,'Ex':4})
df_d['BsmtCond']=df_d['BsmtCond'].map({'NA':-1,'Po':0,'Fa':1,'TA':2,'Gd':3,'Ex':4})
df_d['BsmtExposure']=df_d['BsmtExposure'].map({'NA':0,'No':1,'Mn':2,'Av':3,'Gd':4})
df_d['BsmtFinType1']=df_d['BsmtFinType1'].map({'NA':0,'Unf':1,'LwQ':2,'Rec':3,'BLQ':4,'ALQ':5,'GLQ':6})
df_d['BsmtFinType2']=df_d['BsmtFinType2'].map({'NA':0,'Unf':1,'LwQ':2,'Rec':3,'BLQ':4,'ALQ':5,'GLQ':6})
df_d['HeatingQC']=df_d['HeatingQC'].map({'Po':0,'Fa':1,'TA':2,'Gd':3,'Ex':4})
df_d['Electrical']=df_d['Electrical'].map({'NA':-1,'Mix':0,'FuseP':1,'FuseF':2,'FuseA':3,'SBrkr':4})
df_d['KitchenQual']=df_d['KitchenQual'].map({'NA':-1,'Po':0,'Fa':1,'TA':2,'Gd':3,'Ex':4})
df_d['FireplaceQu']=df_d['FireplaceQu'].map({'NA':-1,'Po':0,'Fa':1,'TA':2,'Gd':3,'Ex':4})
df_d['GarageFinish']=df_d['GarageFinish'].map({'NA':0,'Unf':1,'RFn':2,'Fin':3})
df_d['GarageCond']=df_d['GarageCond'].map({'NA':-1,'Po':0,'Fa':1,'TA':2,'Gd':3,'Ex':4})
df_d['PavedDrive']=df_d['PavedDrive'].map({'N':0,'P':1,'Y':2})
df_d['PoolQC']=df_d['PoolQC'].map({'NA':0,'Fa':1,'TA':2,'Gd':3,'Ex':4})
df_d['Fence']=df_d['Fence'].map({'NA':0,'MnWw':1,'GdWo':2,'MnPrv':3,'GdPrv':4})


In [19]:
df_d.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,MasVnrArea_missing,BsmtFinSF1_missing,BsmtFinSF2_missing,BsmtUnfSF_missing,TotalBsmtSF_missing,BsmtFullBath_missing,BsmtHalfBath_missing,GarageYrBlt_missing,GarageCars_missing,GarageArea_missing
495,C (all),0,0,3,3,3,Inside,2,IDOTRR,Norm,...,False,False,False,False,False,False,False,False,False,False
916,C (all),0,0,3,3,3,Inside,2,IDOTRR,Norm,...,False,False,False,False,False,False,False,False,False,False
968,RM,0,0,3,3,3,Inside,2,OldTown,Norm,...,False,False,False,False,False,False,False,False,False,False
533,RL,0,0,3,0,3,Inside,1,BrkSide,Norm,...,False,False,False,False,False,False,False,False,False,False
30,C (all),0,1,3,3,3,Inside,2,IDOTRR,Feedr,...,False,False,False,False,False,False,False,False,False,False


In [20]:
df_d = pd.get_dummies(df_d,drop_first=True)

In [21]:
X = pd.concat([df_d,df[df.columns[~is_discrete]]], axis=1)

In [119]:
X.shape

(2919, 224)

# Creating a validation set

In [45]:
#In the beginning I ordered the training dataset. Now I'll sample 20% of the training to create the valid
idx = np.linspace(0,1459,0.2*1460,dtype=np.int)

In [46]:
X = X.reset_index()
del X['index']
X.head()

y = y.reset_index()
del y['index']

is_test = is_test.reset_index()
del is_test['index']

In [47]:
X_valid = X.loc[idx]
y_valid = y.loc[idx]

In [54]:
X_train = X.loc[:1459].drop(idx)
y_train = y.drop(idx)

In [55]:
X_train.shape,y_train.shape

((1168, 224), (1168, 1))

# Now lets try to fit a regressor

In [73]:
def rmse(y,y_pred): return ((np.sum((y-y_pred)**2))/len(y))**0.5

In [67]:
y_train = np.array(y_train)
y_train.shape = (1168,)

y_valid = np.array(y_valid)
y_valid.shape = (292,)

In [97]:
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1,min_samples_leaf=1)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_valid)
r = rmse(y_valid,y_pred)
r

0.1436845441669171

In [98]:
ab = AdaBoostRegressor(n_estimators=200, base_estimator= DecisionTreeRegressor())
ab.fit(X_train,y_train)
y_pred = ab.predict(X_valid)
r = rmse(y_valid,y_pred)
r

0.1407229875363053

In [109]:
gb = GradientBoostingRegressor(n_estimators=200,max_depth=5)
gb.fit(X_train,y_train)
y_pred = gb.predict(X_valid)
r = rmse(y_valid,y_pred)
r

0.13573922923848017

In [116]:
et = ExtraTreesRegressor(n_estimators=200,n_jobs=-1,)
et.fit(X_train,y_train)
y_pred = et.predict(X_valid)
r = rmse(y_valid,y_pred)
r

0.14628204076659135

In [158]:
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1,min_samples_leaf=3)
ab = AdaBoostRegressor(n_estimators=100, base_estimator= DecisionTreeRegressor())
gb = GradientBoostingRegressor(n_estimators=200,max_depth=5)
et = ExtraTreesRegressor(n_estimators=200,n_jobs=-1,)
ens = VotingRegressor([('rf',rf),('ab',ab),('gb',gb),('et',et)])
ens.fit(X_train,y_train)
y_pred = ens.predict(X_valid)
r = rmse(y_valid,y_pred)
r

0.13802010266274578

# Now getting the predictions to submit

In [129]:
gb = GradientBoostingRegressor(n_estimators=200,max_depth=5)
gb.fit(X_train,y_train)
y_pred = gb.predict(X.loc[(is_test==1).is_test])

In [152]:
ids = list(df_test.Id)

In [153]:
df_out = pd.DataFrame(list(zip(ids,np.exp(1)**y_pred)),columns=['Id','SalePrice'])

In [154]:
df_out['SalePrice'] = df_out['SalePrice'].apply(lambda x: int(x))

In [155]:
df_out.to_csv('submission.csv',index=False)

In [157]:
df_out.head()

Unnamed: 0,Id,SalePrice
0,1461,122076
1,1462,152877
2,1463,189102
3,1464,187837
4,1465,190524
