# Regression on Housing Prices

Using regression techniques to predict prices on houses in Ames, IA.

In [1]:
#Importing necessary modules

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

# Importing Metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, KFold

#Importing models and model selection
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

%matplotlib inline


In [2]:
# Load the training data from train.csv and preview its first rows.
training = pd.read_csv("train.csv")
training.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Data Cleanup

Creating a function that converts a certain type of string ordinal to a numeric ordinal value

In [3]:
def transform_ordinal(row):
    """Convert a string quality grade into its numeric rank.

    "Ex" -> 5, "Gd" -> 4, "TA" -> 3, "Fa" -> 2, "Po" -> 1.
    Any other value (including a missing value) maps to 0.
    """
    grade_ranks = (("Ex", 5), ("Gd", 4), ("TA", 3), ("Fa", 2), ("Po", 1))
    for label, rank in grade_ranks:
        if row == label:
            return rank
    return 0

Creating some functions that will:
- Transform the dataframe to remove non-applicable or ineffective features
- Choose features that only have strong correlations to the sale price of a house
- Generate an RMSE value based on the number of K folds selected by the user

In [4]:
# Creating a function that transforms features
def transform_features(df, for_test=False, train_columns=None):
    """Clean, encode, impute and scale a housing DataFrame for regression.

    Steps, in order:

    1. Drop the identifier, sale-related and unconverted string-ordinal columns.
    2. Convert the quality-grade columns to numbers with ``transform_ordinal``.
    3. (training only) Drop columns in which 40% or more of the rows are null.
    4. One-hot encode the remaining text columns.
    5. Fill nulls: column mode when fewer than 10% of the rows are null,
       column mean otherwise.
    6. Add ``years_until_remod`` = ``YearRemodAdd`` - ``YearBuilt``.
    7. Min-max scale every column except ``SalePrice``.
    8. (test only) Add a zero-filled column for every name in
       ``train_columns`` that the frame does not have.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw housing data. It is copied; the caller's frame is not modified.
    for_test : bool, default False
        When True, the null-ratio column drop (step 3) is skipped and
        step 8 is applied.
    train_columns : iterable of str, optional
        Column names of the transformed training frame. Only used when
        ``for_test`` is True.

    Returns
    -------
    pandas.DataFrame
        The transformed copy of ``df``.

    Raises
    ------
    KeyError
        If any of the columns that are dropped, converted or combined here
        is missing from ``df``.
    """
    tdf = df.copy()

    # Dropping the following columns:
    ## Id = a unique identifier that adds no value to regression
    ## MiscVal, MoSold, YrSold, SaleType, and SaleCondition = columns that leak data
    ## GarageYrBlt = Ordinal column that does not add value to regression
    ## LotShape, Utilities, LandSlope, BsmtFinType1, Electrical, Functional,
    ##     GarageFinish, PavedDrive, and Fence = Ordinal values using strings,
    ##     too time consuming to convert
    # NOTE(review): the original list named 'BsmtFinType1' twice; the second
    # entry was presumably meant to be 'BsmtFinType2'. Only the duplicate is
    # removed here so that the resulting feature set is unchanged - confirm.
    tdf = tdf.drop(columns=['Id', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
                            'SaleCondition', 'GarageYrBlt', 'LotShape', 'Utilities',
                            'LandSlope', 'BsmtFinType1', 'Electrical', 'Functional',
                            'GarageFinish', 'PavedDrive', 'Fence'])

    # Transforming the original string grades to numeric ones.
    # NOTE(review): transform_ordinal only recognises Ex/Gd/TA/Fa/Po; any other
    # label in these columns becomes 0 - confirm every column listed here
    # really uses that scale.
    ord_transform = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                     'GarageQual', 'GarageCond', 'PoolQC', 'HeatingQC', 'KitchenQual',
                     'FireplaceQu']
    for o in ord_transform:
        tdf[o] = tdf[o].apply(transform_ordinal)

    # Keeping only the columns in which fewer than 40% of the rows are null
    # (only if the dataframe passed in isn't for testing). The explicit copy
    # keeps the later column assignments from writing into a slice of `tdf`.
    if not for_test:
        null_share = tdf.isnull().mean()
        kept = tdf.loc[:, null_share < 0.40].copy()
    else:
        kept = tdf.copy()

    # Implementing one hot encoding for categorical variables. The dummies are
    # created as floats: recent pandas versions default to bool columns, and
    # the subtraction used for scaling below is not defined for bool data.
    text_cols = list(kept.select_dtypes(include=['object']).columns)
    if text_cols:
        kept = pd.get_dummies(kept, columns=text_cols, dtype=float)

    # Splitting the columns that still contain nulls into those with fewer
    # than 10% null rows and those with 10% or more.
    null_share = kept.isnull().mean()
    mode_fill_cols = null_share[(null_share > 0) & (null_share < .10)].index
    mean_fill_cols = null_share[null_share >= .10].index

    # Replacing the null values with the column mode for columns with fewer
    # than 10% null rows
    for c in mode_fill_cols:
        kept[c] = kept[c].fillna(kept[c].mode()[0])

    # Replacing the null values with the column mean for columns with 10% or
    # more null rows
    for c in mean_fill_cols:
        kept[c] = kept[c].fillna(kept[c].mean())

    # Adding a column that contains the difference between the year a house was remodeled and
    # the year it was built, since age is a better regression factor than a year
    kept['years_until_remod'] = kept['YearRemodAdd'] - kept['YearBuilt']

    # Scaling the remaining columns using min max normalization
    features = [c for c in kept.columns if c != 'SalePrice']
    for c in features:
        col_min = kept[c].min()
        col_range = kept[c].max() - col_min
        if col_range == 0:
            # A constant column would otherwise become 0/0 = NaN in every row;
            # map it to 0 instead.
            kept[c] = 0.0
        else:
            kept[c] = (kept[c] - col_min) / col_range

    # Add any dummy columns that exist in the training set, but not in the test set
    if for_test and train_columns is not None:
        for tc in train_columns:
            if tc not in kept.columns:
                kept[tc] = 0

    return kept

Creating a function that chooses the best set of features using recursive feature elimination with cross-validation (RFECV) around a Gradient Boosting Regressor

In [5]:
def select_feature(df, random_state=None):
    """Pick a feature subset with recursive feature elimination (RFECV).

    Columns that are non-numeric (``object`` or ``category`` dtype) or that
    contain null values are ignored. The remaining columns, minus
    ``SalePrice``, are ranked by an ``RFECV`` selector wrapped around a
    ``GradientBoostingRegressor`` fitted against ``SalePrice``.

    Parameters
    ----------
    df : pandas.DataFrame
        Transformed training data containing a ``SalePrice`` column.
        The frame is not modified.
    random_state : int, optional
        Seed passed to the gradient boosting estimator so that the selection
        can be reproduced. The default (None) leaves it unseeded.

    Returns
    -------
    pandas.Index
        Names of the selected feature columns. They are also printed.
    """
    # Keeping only numeric columns without null values. A filtered view is
    # built instead of dropping columns in place, so the caller's frame keeps
    # all of its columns.
    usable_cols = [
        col for col in df.columns
        if not (df[col].dtype == 'object'
                or df[col].dtype.name == 'category'
                or df[col].isnull().sum() > 0)
    ]
    usable = df[usable_cols]

    # Creating dataframes for features and target columns
    all_X = usable.drop(columns=['SalePrice'])
    all_y = usable['SalePrice']

    # Using RFECV around a Gradient Boosting Regressor to attain the optimal
    # set of features
    gb = GradientBoostingRegressor(random_state=random_state)
    selector = RFECV(gb)
    selector.fit(all_X, all_y)

    # Returning the optimal feature set
    optimized = all_X.columns[selector.get_support()]
    print("The optimal feature set ")
    print(optimized)
    return optimized

Creating a function to find the best model and hyperparameters via Random Search.  For the sake of brevity, only three types of algorithms are considered: Gradient Boosting Regressor, Random Forest Regressor, and K Neighbors Regressor.  

In [6]:

def select_model_random(df, features, n_iter=25, cv=4, random_state=None):
    """Tune three regressors with a randomized hyperparameter search.

    A ``RandomizedSearchCV`` (scored with R^2) is run for a
    ``GradientBoostingRegressor``, a ``KNeighborsRegressor`` and a
    ``RandomForestRegressor``. The best parameters and score of each search
    are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data containing ``features`` and a ``SalePrice`` column.
    features : list-like of str
        Names of the columns used as predictors.
    n_iter : int, default 25
        Number of parameter settings sampled per model.
    cv : int, default 4
        Number of cross-validation folds.
    random_state : int, optional
        Seed for the parameter sampling and for every estimator that exposes
        a ``random_state`` parameter. The default (None) leaves the search
        unseeded.

    Returns
    -------
    list of dict
        One dict per model, in the order listed above, with the keys
        ``name``, ``estimator``, ``hyperparameters``, ``best_params``,
        ``best_score`` and ``best_estimator``.
    """
    # Creating dataframes for features and target columns
    all_X = df[features]
    all_y = df['SalePrice']

    # Search space shared by the two tree ensembles.
    # NOTE(review): 'ls', 'lad', 'mse', 'mae' and 'auto' are option names of
    # the scikit-learn release this notebook was run with; newer releases have
    # renamed or removed some of them - confirm against the installed version.
    tree_space = {
        "n_estimators": np.linspace(5, 500, 100, dtype='int'),
        "criterion": ["friedman_mse", "mse", "mae"],
        "max_depth": np.linspace(2, 11, 10, dtype='int'),
        "max_features": ["log2", "sqrt", "auto"],
        "min_samples_leaf": np.linspace(1, 10, 10, dtype='int'),
        "min_samples_split": np.linspace(2, 11, 10, dtype='int'),
    }

    # Creating a list of models with hyperparameters
    models = [
        {
            "name": "GradientBoostingRegressor",
            "estimator": GradientBoostingRegressor(),
            "hyperparameters": dict(
                tree_space,
                subsample=np.linspace(0.1, 1, 11),
                learning_rate=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
                loss=['ls', 'lad', 'huber', 'quantile'],
            ),
        },
        {
            "name": "KNeighborsRegressor",
            "estimator": KNeighborsRegressor(),
            "hyperparameters": {
                "n_neighbors": np.linspace(1, 100, 50, dtype="int"),
                "weights": ["distance", "uniform"],
                "algorithm": ["ball_tree", "kd_tree", "brute"],
                "p": [1, 2],
            },
        },
        {
            "name": "RandomForestRegressor",
            "estimator": RandomForestRegressor(),
            "hyperparameters": dict(tree_space),
        },
    ]

    # Finding the best configuration for each type of model in the list above
    for m in models:
        print(m["name"])
        estimator = m['estimator']
        # Seed the estimator as well when it supports it (KNN does not).
        if random_state is not None and 'random_state' in estimator.get_params():
            estimator.set_params(random_state=random_state)
        rand = RandomizedSearchCV(estimator,
                                  param_distributions=m['hyperparameters'],
                                  n_iter=n_iter, cv=cv, scoring='r2',
                                  random_state=random_state)
        rand.fit(all_X, all_y)
        m['best_params'] = rand.best_params_
        m['best_score'] = rand.best_score_
        m['best_estimator'] = rand.best_estimator_
        print("Best set of parameters: ")
        print(m['best_params'])
        print("Best score:")
        print(m['best_score'])
        print("")
        print("")

    return models
        

Once the functions are defined, the training data is run through each function to attain the optimal set of features, and then the optimal model (including hyperparameters).  

In [7]:
# Rebinds `training` to the cleaned frame. transform_features drops the `Id`
# column, so re-running this cell without first re-running the cell that
# loads train.csv raises a KeyError.
training = transform_features(training)

In [8]:
# Run feature selection on the transformed training data; select_feature
# prints the selected column names and returns them.
best_features = select_feature(training)
print('........')




The optimal feature set 
Index(['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'ExterQual', 'ExterCond', 'BsmtQual',
       ...
       'Heating_Wall', 'CentralAir_N', 'CentralAir_Y', 'GarageType_2Types',
       'GarageType_Attchd', 'GarageType_Basment', 'GarageType_BuiltIn',
       'GarageType_CarPort', 'GarageType_Detchd', 'years_until_remod'],
      dtype='object', length=163)
........


In [9]:
# Run the randomized hyperparameter search for each candidate model using the
# selected features; the result is a list with one dict per model.
best_models = select_model_random(training,best_features)
print('End')

GradientBoostingRegressor
Best set of parameters: 
{'subsample': 0.73, 'n_estimators': 395, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': 8, 'loss': 'lad', 'learning_rate': 0.1, 'criterion': 'mse'}
Best score:
0.8847401787614582


KNeighborsRegressor
Best set of parameters: 
{'weights': 'distance', 'p': 1, 'n_neighbors': 9, 'algorithm': 'ball_tree'}
Best score:
0.7361411549108609


RandomForestRegressor
Best set of parameters: 
{'n_estimators': 155, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10, 'criterion': 'friedman_mse'}
Best score:
0.8538967796461534


End


## Testing the model and features

Once the features and models are selected, they will be used to attempt predicting housing prices from a test set.  

In [14]:
# Transforming the test data
holdout = pd.read_csv("test.csv")
# Column names of the transformed training frame; transform_features uses them
# to add any column the test frame is missing (filled with 0).
train_columns = training.columns
# Keep the Id values before transform_features drops that column; they are
# needed for the submission file.
ids = holdout['Id']
holdout = transform_features(holdout, True, train_columns)

In [13]:
# Creating a function that generates a CSV file for submission to Kaggle for results.

def save_submission_file(model, features, filename='submission.csv',
                         holdout_df=None, id_values=None):
    """Predict sale prices for the holdout set and write a submission CSV.

    The first 20 rows of the submission are printed, then the whole table
    (columns ``Id`` and ``SalePrice``) is written to ``filename`` without
    the index.

    Parameters
    ----------
    model : dict
        Model entry whose ``'best_estimator'`` value has a ``predict`` method.
    features : list-like of str
        Columns of the holdout frame passed to ``predict``.
    filename : str, default 'submission.csv'
        Path of the CSV file to write.
    holdout_df : pandas.DataFrame, optional
        Transformed holdout data. Defaults to the notebook-level ``holdout``.
    id_values : sequence, optional
        Ids matching the rows of the holdout data. Defaults to the
        notebook-level ``ids``.
    """
    # Fall back to the notebook globals so existing three-argument calls keep
    # working; passing the data explicitly avoids depending on cell order.
    if holdout_df is None:
        holdout_df = holdout
    if id_values is None:
        id_values = ids

    holdout_predictions = model['best_estimator'].predict(holdout_df[features])
    submission = pd.DataFrame({"Id": id_values, "SalePrice": holdout_predictions})
    print(submission.head(20))
    submission.to_csv(filename, index=False)
    
# Index 2 is the RandomForestRegressor entry (third item of the list built in
# select_model_random).
# NOTE(review): in the recorded run the GradientBoostingRegressor (index 0)
# had the higher cross-validated score - confirm this is the intended model.
save_submission_file(best_models[2],best_features,'submission_four.csv')

      Id      SalePrice
0   1461  139068.401339
1   1462  162781.113521
2   1463  194768.071781
3   1464  190327.682474
4   1465  201901.567461
5   1466  191994.439161
6   1467  182035.050176
7   1468  185146.280245
8   1469  197242.497533
9   1470  139214.931936
10  1471  223724.134298
11  1472  106200.265665
12  1473  108398.746791
13  1474  150429.203614
14  1475  133243.809046
15  1476  338420.016595
16  1477  251189.782705
17  1478  302354.496851
18  1479  308345.220491
19  1480  381217.164759
