In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
# Load the Ames housing data set; the file is tab-separated.
data = pd.read_table('AmesHousing.tsv', sep='\t')

Missing value imputation should be done after splitting (fitted on the training set only) to prevent train/test leakage! It does not matter if this worsens the performance. 
You cannot use any information from the test set when training.

In [None]:
def predictor(features):
    """Fit a linear regression on the given columns and return the hold-out RMSE.

    Reads the notebook-level ``data`` frame, uses ``SalePrice`` as the target,
    splits the rows into train/test with a fixed seed, min-max scales the
    features (the scaler is fitted on the training rows only) and fits an
    ordinary least-squares model on the scaled training features.

    Parameters
    ----------
    features : list of str
        Column names of ``data`` to use as predictors.

    Returns
    -------
    float
        Root mean squared error of the predictions on the test split.
    """
    x = data[features]
    y = data['SalePrice']
    # Same split as before: roughly 49.8% of the rows are used for training.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, random_state=42, test_size=1 - .49829351535836175)

    scaler = MinMaxScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    lr = LinearRegression()
    # Train and evaluate on the scaled arrays. Previously the scaler was fitted
    # but the model used the raw frames, leaving the scaled arrays unused.
    # Plain least squares with an intercept is unaffected by this rescaling, so
    # the RMSE only changes by floating-point rounding.
    lr.fit(x_train_scaled, y_train)
    predictions = lr.predict(x_test_scaled)
    return mean_squared_error(y_test, predictions) ** .5

In [None]:
predictor(['Gr Liv Area'])

57834.20956644962

Let's explore the data set before diving into feature transformation and so on

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 82 columns):
Order              2930 non-null int64
PID                2930 non-null int64
MS SubClass        2930 non-null int64
MS Zoning          2930 non-null object
Lot Frontage       2440 non-null float64
Lot Area           2930 non-null int64
Street             2930 non-null object
Alley              198 non-null object
Lot Shape          2930 non-null object
Land Contour       2930 non-null object
Utilities          2930 non-null object
Lot Config         2930 non-null object
Land Slope         2930 non-null object
Neighborhood       2930 non-null object
Condition 1        2930 non-null object
Condition 2        2930 non-null object
Bldg Type          2930 non-null object
House Style        2930 non-null object
Overall Qual       2930 non-null int64
Overall Cond       2930 non-null int64
Year Built         2930 non-null int64
Year Remod/Add     2930 non-null int64
Roof Style         29

In [None]:
data.head(5)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


First of all, let's remove columns with more than 5% of missing values. There is no rule to this, but 5% is a good start.

In [None]:
cutoff = int(len(data)*.05)
print(cutoff)

146


In [None]:
missing_data = data.isnull().sum()

In [None]:
# Keep the columns whose number of missing entries is below the cutoff.
# A count of missing values is never negative, so no lower bound is needed.
missing_data_5 = missing_data[missing_data < cutoff]
print('there are', len(missing_data), 'columns')
print('\n')
print('there are', len(missing_data_5),'columns with less than 5% missing values')

there are 82 columns


there are 71 columns with less than 5% missing values


In [None]:
data2 = data[missing_data_5.index] # Creates a new dataframe with selected columns

Let's check the new data frame `data2` information

In [None]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 71 columns):
Order              2930 non-null int64
PID                2930 non-null int64
MS SubClass        2930 non-null int64
MS Zoning          2930 non-null object
Lot Area           2930 non-null int64
Street             2930 non-null object
Lot Shape          2930 non-null object
Land Contour       2930 non-null object
Utilities          2930 non-null object
Lot Config         2930 non-null object
Land Slope         2930 non-null object
Neighborhood       2930 non-null object
Condition 1        2930 non-null object
Condition 2        2930 non-null object
Bldg Type          2930 non-null object
House Style        2930 non-null object
Overall Qual       2930 non-null int64
Overall Cond       2930 non-null int64
Year Built         2930 non-null int64
Year Remod/Add     2930 non-null int64
Roof Style         2930 non-null object
Roof Matl          2930 non-null object
Exterior 1st       29

## Feature Engineering

This step is concerned with manipulating (dropping, keeping or combining) columns that are potentially good predictors of our target variable, the Sale Price.

1) First let's consider splitting numerical and categorical features in the data frame;

2) Then, within each subset of variables, we can choose the best ones by using correlation coefficients and other techniques

Starting with numerical features, some of them can cause data leaking. 

A feature is a potential leaking source if its value is determined from the target value. 

It generally occurs in situations in which the state of the feature can only be defined after the state of the target variable ("after" is meant literally here, with respect to time). 

Consider the 'Year Sold' feature. We can only know the year the house was sold **after** it was sold. 

In practice, when we start using a model to predict house prices, the year sold would not be available as information!

Concerning numerical features, the year and month sold are not useful as inputs to the model. Neither are `PID` and `Order`.

In [None]:
# Drop the row identifiers ('PID', 'Order') and the month of sale.
# Note: 'Yr Sold' is NOT dropped by this cell.
non_predictive_cols = ['PID', 'Order', 'Mo Sold']
data2 = data2.copy().drop(columns=non_predictive_cols)

Some numerical features are prone to subjectivity or, even worse, impractical to assess for many new houses. 

This is the case, for example, of `Overall Qual` and `Overall Cond`. The cost to create such information may be very high and very subjective! We want to be able to predict the target with the minimum number of features that are easy to measure. Imagine you work for a real-estate agency that wants to estimate the sale price of houses: the more information you need, the longer your estimate will take!

In [None]:
data2.columns

Index(['MS SubClass', 'MS Zoning', 'Lot Area', 'Street', 'Lot Shape',
       'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood',
       'Condition 1', 'Condition 2', 'Bldg Type', 'House Style',
       'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add',
       'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd',
       'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual', 'Exter Cond',
       'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2',
       'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air',
       'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
       'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath',
       'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual',
       'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Garage Cars',
       'Garage Area', 'Paved Drive', 'Wood Deck SF', 'Open Porch SF',
       'Enclosed 

In [None]:
# Columns removed from the candidate feature set. The list contains the
# subjective ratings as well as other columns this analysis chose not to use.
drops = [
    'MS SubClass', 'Overall Qual', 'Overall Cond',
    'Mas Vnr Type', 'Mas Vnr Area',
    'BsmtFin SF 1', 'BsmtFin Type 1', 'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF',
    'Street', 'Lot Shape', 'Land Contour', 'Lot Config', 'Land Slope',
    'Neighborhood', 'Condition 1', 'Condition 2',
    'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
    'Exter Qual', 'Exter Cond', 'Foundation',
    'Bsmt Cond', 'Bsmt Exposure',
    'Heating', 'Heating QC', 'Electrical', 'Kitchen Qual',
    'Half Bath', 'Bsmt Half Bath',
    'Functional', 'Fireplaces',
    'Garage Area', 'Paved Drive', 'Wood Deck SF',
    'Enclosed Porch', '3Ssn Porch', 'Screen Porch',
    'Misc Val', 'Sale Type', 'Sale Condition',
    'Exterior 1st', 'Exterior 2nd',
]

data2 = data2.drop(drops, axis=1)



In [None]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 22 columns):
MS Zoning          2930 non-null object
Lot Area           2930 non-null int64
Utilities          2930 non-null object
Year Built         2930 non-null int64
Year Remod/Add     2930 non-null int64
Bsmt Qual          2850 non-null object
Total Bsmt SF      2929 non-null float64
Central Air        2930 non-null object
1st Flr SF         2930 non-null int64
2nd Flr SF         2930 non-null int64
Low Qual Fin SF    2930 non-null int64
Gr Liv Area        2930 non-null int64
Bsmt Full Bath     2928 non-null float64
Full Bath          2930 non-null int64
Bedroom AbvGr      2930 non-null int64
Kitchen AbvGr      2930 non-null int64
TotRms AbvGrd      2930 non-null int64
Garage Cars        2929 non-null float64
Open Porch SF      2930 non-null int64
Pool Area          2930 non-null int64
Yr Sold            2930 non-null int64
SalePrice          2930 non-null int64
dtypes: float64(3), int64

In [None]:
# Derived duration features:
#   year_until_remod : years between construction and the remodel/addition
#   years_old        : age of the house in the year it was sold
year_built = data2['Year Built']
data2['year_until_remod'] = data2['Year Remod/Add'] - year_built
data2['years_old'] = data2['Yr Sold'] - year_built
# The raw year columns are no longer needed; 'Utilities' is dropped as well.
data2 = data2.drop(['Yr Sold', 'Year Remod/Add', 'Year Built', 'Utilities'], axis=1)

In [None]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 20 columns):
MS Zoning           2930 non-null object
Lot Area            2930 non-null int64
Bsmt Qual           2850 non-null object
Total Bsmt SF       2929 non-null float64
Central Air         2930 non-null object
1st Flr SF          2930 non-null int64
2nd Flr SF          2930 non-null int64
Low Qual Fin SF     2930 non-null int64
Gr Liv Area         2930 non-null int64
Bsmt Full Bath      2928 non-null float64
Full Bath           2930 non-null int64
Bedroom AbvGr       2930 non-null int64
Kitchen AbvGr       2930 non-null int64
TotRms AbvGrd       2930 non-null int64
Garage Cars         2929 non-null float64
Open Porch SF       2930 non-null int64
Pool Area           2930 non-null int64
SalePrice           2930 non-null int64
year_until_remod    2930 non-null int64
years_old           2930 non-null int64
dtypes: float64(3), int64(14), object(3)
memory usage: 457.9+ KB


## Finding the correlation of numerical features with target variable

In [None]:
# A remodel year or a sale year earlier than the build year produces a negative
# duration. Floor both derived features at 0 in the same way (previously only
# the literal value -1 was handled for 'year_until_remod').
data2['year_until_remod'] = data2['year_until_remod'].clip(lower=0)
data2['years_old'] = data2['years_old'].clip(lower=0)
# Numerical subset used for the correlation analysis.
data2_num = data2.select_dtypes(include=['float64','int64'])

In [None]:
corr_num = data2_num.corr()['SalePrice'].abs()
corr_num.sort_values(ascending=False)

SalePrice           1.000000
Gr Liv Area         0.706780
Garage Cars         0.647877
Total Bsmt SF       0.632280
1st Flr SF          0.621676
years_old           0.558914
Full Bath           0.545604
TotRms AbvGrd       0.495474
Open Porch SF       0.312951
Bsmt Full Bath      0.276050
2nd Flr SF          0.269373
Lot Area            0.266549
year_until_remod    0.240165
Bedroom AbvGr       0.143913
Kitchen AbvGr       0.119814
Pool Area           0.068403
Low Qual Fin SF     0.037660
Name: SalePrice, dtype: float64

In [None]:
corr_num = corr_num.drop('SalePrice')

In [None]:
best_num_features = corr_num[corr_num>.4].index

In [None]:
best_num_features

Index(['Total Bsmt SF', '1st Flr SF', 'Gr Liv Area', 'Full Bath',
       'TotRms AbvGrd', 'Garage Cars', 'years_old'],
      dtype='object')

## Selecting the categorical features

In [None]:
data2_cat = data2.select_dtypes(include=['object'])

In [None]:
data2_cat.head(5)

Unnamed: 0,MS Zoning,Bsmt Qual,Central Air
0,RL,TA,Y
1,RH,TA,Y
2,RL,TA,Y
3,RL,TA,Y
4,RL,Gd,Y


In [None]:
#MS Zoning
data2_cat['MS Zoning'].value_counts(dropna=False,normalize=True)

RL         0.775768
RM         0.157679
FV         0.047440
RH         0.009215
C (all)    0.008532
A (agr)    0.000683
I (all)    0.000683
Name: MS Zoning, dtype: float64

**We see there is a reasonable variability of the classes even with a predominant one. We will keep it as it is.** You need to deal with this

In [None]:
data2_cat['Bsmt Qual'].value_counts(dropna=False,normalize=True)

TA     0.437884
Gd     0.416041
Ex     0.088055
Fa     0.030034
NaN    0.027304
Po     0.000683
Name: Bsmt Qual, dtype: float64

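There are about 2.7% missing values, which we will replace with the two most common values (TA and Gd) in the same proportion in which they appear in the original dataset. (Note: the imputation cell further below fills them with the training-set mode instead.)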

In [None]:
data2_cat['Central Air'].value_counts(normalize=True,dropna=False)

Y    0.933106
N    0.066894
Name: Central Air, dtype: float64

Most of the houses in the data set have air conditioning, which may lead to a poor way to assess how this variable influences the SalePrice. Let's drop it

In [None]:
data2_cat = data2_cat.drop(columns = 'Central Air')

In [None]:
data2_cat.columns

Index(['MS Zoning', 'Bsmt Qual'], dtype='object')

In [None]:
best_num_features

Index(['Total Bsmt SF', '1st Flr SF', 'Gr Liv Area', 'Full Bath',
       'TotRms AbvGrd', 'Garage Cars', 'years_old'],
      dtype='object')

## Data Preprocessing

In [None]:
# Selected features: the best numerical ones plus the remaining categorical ones
features = best_num_features.tolist() + data2_cat.columns.tolist()
target = ['SalePrice']

# Shuffle the rows once with a fixed seed, then split the shuffled frame by position
s_data2 = data2[features + target].sample(random_state=1, frac=1)
train_frac = .6  # share of the shuffled rows used for training
n_train = int(train_frac * len(s_data2))  # first n_train rows -> train, the rest -> test
# Legacy names kept for any cell that still reads them. Despite the "test" in
# their names they hold the training share and the split position.
test_frac = train_frac
idx_test = n_train

# Use .iloc everywhere: the index is shuffled, so the split must be positional
X_train = s_data2.iloc[:n_train][features]
Y_train = s_data2['SalePrice'].iloc[:n_train]
XY_test = s_data2.iloc[n_train:][features + target]

In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1758 entries, 2126 to 1669
Data columns (total 9 columns):
Total Bsmt SF    1758 non-null float64
1st Flr SF       1758 non-null int64
Gr Liv Area      1758 non-null int64
Full Bath        1758 non-null int64
TotRms AbvGrd    1758 non-null int64
Garage Cars      1757 non-null float64
years_old        1758 non-null int64
MS Zoning        1758 non-null object
Bsmt Qual        1711 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 137.3+ KB


### Imputation on Numerical features

In [None]:
# Impute the numerical columns with their per-column mean over the training rows
numeric_train = X_train.select_dtypes(include=['float64', 'int64'])
column_means = numeric_train.mean().to_dict()
X_train_num = numeric_train.fillna(column_means, axis=0)

In [None]:
X_train_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1758 entries, 2126 to 1669
Data columns (total 7 columns):
Total Bsmt SF    1758 non-null float64
1st Flr SF       1758 non-null int64
Gr Liv Area      1758 non-null int64
Full Bath        1758 non-null int64
TotRms AbvGrd    1758 non-null int64
Garage Cars      1758 non-null float64
years_old        1758 non-null int64
dtypes: float64(2), int64(5)
memory usage: 109.9 KB


### Normalizing Numerical Features

In [None]:
#X_train_num = X_train_num.apply(lambda x: (x-x.min())/(x.max()-x.min()))

###  Imputation on categorical Features

In [None]:
#Inputation on categorical columns
X_train_cat = X_train.select_dtypes(include = ['object'])
X_train_cat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1758 entries, 2126 to 1669
Data columns (total 2 columns):
MS Zoning    1758 non-null object
Bsmt Qual    1711 non-null object
dtypes: object(2)
memory usage: 41.2+ KB


In [None]:
X_train_cat.mode() # Visualizing the mode of each column

Unnamed: 0,MS Zoning,Bsmt Qual
0,RL,TA


In [None]:
 X_train_cat = X_train_cat.fillna(X_train_cat.mode().iloc[0],axis=0) #Filling NaNs with Mode

In [None]:
X_train_cat.isnull().sum() #Checking if worked

MS Zoning    0
Bsmt Qual    0
dtype: int64

### Converting categorical feature to dummy variables

In [None]:
X_train_dcat = pd.get_dummies(X_train_cat) #Using this very handy function

Removing dummy columns that are 1 in fewer than 4% of the training rows (the code below uses a 4% cutoff and applies no upper bound)

In [None]:
# Share of training rows in which each dummy column equals 1
perc_fil = X_train_dcat.sum()/len(X_train_dcat)
# Keep only the dummies that are set in at least 4% of the training rows.
# No upper bound is applied. (A discarded `sort_values` call was removed.)
cut = 4/100
higher_5 = perc_fil[perc_fil >= cut]  # historical name; the actual threshold is `cut`
higher_5

MS Zoning_FV    0.042662
MS Zoning_RL    0.775882
MS Zoning_RM    0.160978
Bsmt Qual_Ex    0.087031
Bsmt Qual_Gd    0.408419
Bsmt Qual_TA    0.474403
dtype: float64

In [None]:
X_train_dcat = X_train_dcat[higher_5.index]

### Joining the dummy feature with numerical features

In [None]:
X_train_final = pd.concat([X_train_num,X_train_dcat],axis = 1)

In [None]:
X_train_final.head(6)

Unnamed: 0,Total Bsmt SF,1st Flr SF,Gr Liv Area,Full Bath,TotRms AbvGrd,Garage Cars,years_old,MS Zoning_FV,MS Zoning_RL,MS Zoning_RM,Bsmt Qual_Ex,Bsmt Qual_Gd,Bsmt Qual_TA
2126,990.0,990,990,1,5,0.0,13,0,1,0,0,1,0
192,1108.0,1160,2068,1,8,1.0,88,0,1,0,0,1,0
2406,1368.0,1368,1368,2,6,2.0,1,0,1,0,0,1,0
45,1358.0,1358,1358,2,6,2.0,1,0,1,0,0,1,0
2477,941.0,941,1837,2,7,2.0,9,0,1,0,0,1,0
1603,876.0,923,923,1,5,1.0,24,0,1,0,0,0,1


## Setting up the predictor

In [None]:
import numpy as np
def predictor2(features=None):
    """Evaluate a linear regression on the prepared train/test frames.

    Uses the notebook-level objects ``XY_test``, ``X_train_dcat``,
    ``X_train_final`` and ``Y_train``. Test rows containing any missing value
    are dropped (they are not imputed), the test categoricals are one-hot
    encoded and aligned to the training dummy columns, both sets are min-max
    scaled with a scaler fitted on the training set, and a linear regression
    fitted on the training set is scored on both sets.

    Parameters
    ----------
    features : any, optional
        Ignored. Kept only so existing calls such as ``predictor2('a')`` keep
        working; the feature set is fixed by ``X_train_final``.

    Returns
    -------
    tuple of float
        ``(test_rmse, train_rmse, test_rmse / mean test SalePrice)``.
    """
    # Drop incomplete test rows, then separate predictors and target
    test_complete = XY_test.dropna(axis=0)
    X_test = test_complete.drop(columns='SalePrice')
    Y_test = test_complete['SalePrice']

    # Split predictors into numerical and one-hot encoded categorical parts
    X_test_num = X_test.select_dtypes(include=['float64', 'int64'])
    X_test_dcat = pd.get_dummies(X_test.select_dtypes(include=['object']))

    # Align the test dummies with the training dummies: columns missing from
    # the test set are added as all-zero, columns unseen in training are dropped.
    X_test_dcat = X_test_dcat.reindex(columns=X_train_dcat.columns, fill_value=0)
    X_test_final = pd.concat([X_test_num, X_test_dcat], axis=1)[X_train_final.columns]

    # Scale with statistics learned from the training set only
    scaler = MinMaxScaler()
    x_train_scaled = scaler.fit_transform(X_train_final)
    x_test_scaled = scaler.transform(X_test_final)

    lr = LinearRegression()
    lr.fit(x_train_scaled, Y_train)
    test_rmse = mean_squared_error(Y_test, lr.predict(x_test_scaled)) ** .5
    train_rmse = mean_squared_error(Y_train, lr.predict(x_train_scaled)) ** .5
    return (test_rmse, train_rmse, test_rmse / Y_test.mean())

In [None]:
predictor2('a')

(37659.512555384266, 36687.65863509922, 0.20400955379550556)

In [None]:
# Discard test rows that contain any missing value, then separate X and y
X_clean_test = XY_test.dropna()
X_test = X_clean_test.drop('SalePrice', axis=1)
Y_test = X_clean_test['SalePrice']

# Numerical part and one-hot encoded categorical part of the test predictors
X_test_num = X_test.select_dtypes(include=['float64', 'int64'])
X_test_cat = X_test.select_dtypes(include=['object'])
X_test_dcat = pd.get_dummies(X_test_cat)

# For every training dummy column take the matching test column, or an
# all-zero column when that category never occurs in the test set
filling = {}
for name in X_train_dcat.columns:
    filling[name] = (
        X_test_dcat[name]
        if name in X_test_dcat.columns
        else np.zeros(int(len(X_test_num)))
    )

X_test_dcat2 = pd.DataFrame(filling)
# Join both parts and put the columns in the training order
X_test_final = pd.concat([X_test_num, X_test_dcat2], axis=1)[X_train_final.columns]

# TIP: LOOK FURTHER INTO THE CASES WHERE A CATEGORICAL COLUMN DOES NOT APPEAR IN THE TEST SET! IT IS BETTER TO TRAIN WITHOUT THOSE CATEGORIES

In [None]:
X_test_final.shape

(1139, 13)

In [None]:
X_train_final.shape

(1758, 13)