In [None]:
import numpy as np
from numpy import array
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, f_regression, SelectPercentile
from sklearn.tree import export_graphviz
from sklearn import tree
from category_encoders import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
import os

In [None]:
# Read the Ames housing sales records from the local datasets directory.
data = pd.read_csv(filepath_or_buffer='datasets/AmesHousing.csv')

In [None]:
# Raise the row display limit so long per-column listings are not truncated.
pd.set_option('display.max_rows', 4000)
# Uncomment to list which columns contain missing values:
# print(data.isna().any())

# Columns whose missing entries are replaced by the 'NA' category label.
fill_with_na_label = [
    'Alley',
    'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2',
    'Fireplace Qu',
    'Garage Type', 'Garage Finish', 'Garage Qual', 'Garage Cond',
    'Pool QC', 'Fence', 'Misc Feature',
]

# Columns whose missing entries are replaced by 0.
fill_with_zero = [
    'Lot Frontage', 'Mas Vnr Area',
    'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
    'Bsmt Full Bath', 'Bsmt Half Bath',
    'Garage Yr Blt', 'Garage Cars', 'Garage Area',
]

# Per-column replacement values; the last two columns have their own labels.
values = {
    **dict.fromkeys(fill_with_na_label, 'NA'),
    **dict.fromkeys(fill_with_zero, 0),
    'Mas Vnr Type': 'None',
    'Electrical': 'Mix',
}

data = data.fillna(values)

In [None]:
# Nominal (unordered) categorical columns that are expanded into indicator columns.
nominal_features = [
    'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config',
    'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style',
    'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
    'Foundation', 'Heating', 'Central Air', 'Garage Type', 'Misc Feature',
    'Sale Type', 'Sale Condition',
]

# One-hot encode the nominal columns, dropping the first level of each feature.
data_ohe = pd.get_dummies(data=data, columns=nominal_features, drop_first=True)

In [None]:
# ordinal encoding of ordinal features

# Levels of the scales that are shared by several columns, listed best to worst.
quality_levels = ('Ex', 'Gd', 'TA', 'Fa', 'Po')
quality_levels_or_absent = quality_levels + ('NA',)
basement_finish_levels = ('GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA')

# (column, levels) pairs; within each tuple the levels run from best to worst.
ordinal_levels_by_column = [
    ('Lot Shape', ('Reg', 'IR1', 'IR2', 'IR3')),
    ('Utilities', ('AllPub', 'NoSewr', 'NoSeWa', 'ELO')),
    ('Land Slope', ('Gtl', 'Mod', 'Sev')),
    ('Exter Qual', quality_levels),
    ('Exter Cond', quality_levels),
    ('Bsmt Qual', quality_levels_or_absent),
    ('Bsmt Cond', quality_levels_or_absent),
    ('Bsmt Exposure', ('Gd', 'Av', 'Mn', 'No', 'NA')),
    ('BsmtFin Type 1', basement_finish_levels),
    ('BsmtFin Type 2', basement_finish_levels),
    ('Heating QC', quality_levels),
    ('Electrical', ('SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix')),
    ('Kitchen Qual', quality_levels),
    ('Functional', ('Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal')),
    ('Fireplace Qu', quality_levels_or_absent),
    ('Garage Finish', ('Fin', 'RFn', 'Unf', 'NA')),
    ('Garage Qual', quality_levels_or_absent),
    ('Garage Cond', quality_levels_or_absent),
    ('Paved Drive', ('Y', 'P', 'N')),
    ('Pool QC', ('Ex', 'Gd', 'TA', 'Fa', 'NA')),
    ('Fence', ('GdPrv', 'MnPrv', 'GdWo', 'MnWw', 'NA')),
]

# The worst level of every scale is encoded as 0 and each better level is one
# higher, so the first (best) level receives len(levels) - 1.
ordinal_cols_mapping = [
    {
        'col': column,
        'mapping': {
            level: rank
            for rank, level in zip(range(len(levels) - 1, -1, -1), levels)
        },
    }
    for column, levels in ordinal_levels_by_column
]

encoder = OrdinalEncoder(mapping=ordinal_cols_mapping, return_df=True)
data_ohe = encoder.fit_transform(data_ohe)

data_ohe.describe()

In [None]:
# dropping unique identifiers of the sale
data_ohe = data_ohe.drop(columns=['Order', 'PID'])

# Derive the age features that replace the raw year columns. They must exist
# before the year columns are dropped, because the scaling and heatmap cells
# select 'Years Since Built', 'Years Since Upgrade' and
# 'Garage Years Since Built' by name.
# Ages are measured relative to the year of sale and floored at 0 so that a
# year recorded later than the sale year cannot yield a negative age.
sale_year = data_ohe['Yr Sold']

# Missing 'Garage Yr Blt' values were filled with 0 earlier; for those rows the
# garage age is set to 0 instead of the meaningless "sale year minus 0".
has_garage_year = data_ohe['Garage Yr Blt'] > 0

data_ohe = data_ohe.assign(**{
    'Years Since Built': (sale_year - data_ohe['Year Built']).clip(lower=0),
    'Years Since Upgrade': (sale_year - data_ohe['Year Remod/Add']).clip(lower=0),
    'Garage Years Since Built': (sale_year - data_ohe['Garage Yr Blt'])
        .where(has_garage_year, 0)
        .clip(lower=0),
})

# dropping the raw year columns now that the derived age columns replace them
data_ohe = data_ohe.drop(columns=['Year Built', 'Year Remod/Add', 'Garage Yr Blt'])

print(data_ohe.head())

In [None]:
# Separate the target column from the predictor columns.
target_column = 'SalePrice'
y = data_ohe[target_column]
X = data_ohe.drop(columns=[target_column])

# Hold out a test set using the default split proportions and a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Report the (rows, columns) shape of the training and test predictors.
for split in (X_train, X_test):
    print(split.shape)

In [None]:
# scaling of continuous, discrete and ordinal features
# (the nominal features were one-hot encoded earlier and are left unscaled)
numerical_features = [
    'Lot Frontage', 'Lot Area', 'Years Since Built', 'Years Since Upgrade',
    'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
    '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area',
    'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath',
    'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces',
    'Garage Years Since Built', 'Garage Cars', 'Garage Area',
    'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch',
    'Pool Area', 'Misc Val', 'Mo Sold', 'Yr Sold',
]

# 'Exter Cond' is ordinally encoded together with 'Exter Qual', so it is scaled
# with the other ordinal columns instead of being left on its raw 0-4 range.
ordinal_features = [
    'Lot Shape', 'Utilities', 'Land Slope', 'Exter Qual', 'Exter Cond',
    'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2',
    'Heating QC', 'Electrical', 'Kitchen Qual', 'Functional', 'Fireplace Qu',
    'Garage Finish', 'Garage Qual', 'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence',
    'Overall Qual', 'Overall Cond',
]

features_to_scale = numerical_features + ordinal_features

# Take explicit copies before assigning into the split frames; writing into the
# frames returned by train_test_split can raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = MinMaxScaler()

# Fit the scaling ranges on the training rows only, then reuse them for the
# test rows so no information from the test set leaks into the transform.
X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])


In [None]:
# List the predictor column names of the training frame.
training_columns = X_train.columns
print(training_columns)

In [None]:
# correlation heatmap
# Pearson correlations between the target and every numerical/ordinal feature.
numerical_features_with_target = ['SalePrice', *numerical_features, *ordinal_features]
corr = data_ohe.loc[:, numerical_features_with_target].corr(method='pearson')

# Draw onto an explicitly created, large axes so the many labels stay legible.
f, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr, square=True, ax=ax)

In [None]:
# selecting best features
print(X.shape)

# Score each feature against the target with f_regression and keep the top five.
# The selector is fitted on the training rows only and then applied to both splits.
k_best = SelectKBest(score_func=f_regression, k=5)
X_train_k_best = k_best.fit(X_train, y_train).transform(X_train)
X_test_k_best = k_best.transform(X_test)

print(X_train_k_best.shape)

# Names of the columns the selector kept.
selected_mask = k_best.get_support()
print(X.columns[selected_mask])


In [None]:
# Fit a linear regression on the selected features.
lin_reg = LinearRegression()
lin_reg.fit(X_train_k_best, y_train)

# Score the fitted model on the held-out rows and on the rows it was trained on.
test_score = lin_reg.score(X_test_k_best, y_test)
train_score = lin_reg.score(X_train_k_best, y_train)
print('Test score: {}'.format(test_score))
print('Train score: {}'.format(train_score))

# 5-fold cross-validation of a fresh model on the training rows.
# NOTE(review): the label below says "accuracy", but the value is the estimator's
# default regression score (R^2 per the scikit-learn docs) - consider renaming.
scores = cross_val_score(LinearRegression(), X_train_k_best, y_train, cv=5)
print('Mean cross-validation accuracy: {}'.format(scores.mean()))

In [None]:
# Exhaustively try neighbour counts 1..24 with 5-fold cross-validation,
# using only the selected features.
param_grid_knn = {'n_neighbors': np.arange(1, 25)}
grid_knn = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid_knn,
    cv=5,
)
grid_knn.fit(X_train_k_best, y_train)

print('Best cross-validation score: {}'.format(grid_knn.best_score_))
print('Best parameters:', grid_knn.best_params_)

In [None]:
# Number of trees in random forest
n_estimators = [100, 200, 300]
# Number of features to consider at every split
max_features = [10, 20, 30, 40]
# Maximum number of levels in tree (None lets the trees grow without a depth limit)
max_depth = [3, 5, 10, 20, None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 4]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

# Sample 20 parameter combinations from the grid and score each with 2-fold CV
# on the full (unselected) training features.
# Both the forest and the search are seeded (same seed as the train/test split)
# so that "Restart & Run All" reproduces the same best score and parameters.
ran_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=random_grid,
    n_iter=20,
    cv=2,
    random_state=42,
)

ran_search.fit(X_train, y_train)

print(ran_search.best_score_)
print(ran_search.best_params_)

In [None]:
# visualize the best tree with modified depth

# Start from the parameters the randomized search actually selected, instead of
# values copied by hand from an earlier run, and cap the depth at 3 so that the
# rendered tree stays small enough to read.
best_forest_params = {**ran_search.best_params_, 'max_depth': 3}
best_forest = RandomForestRegressor(random_state=42, **best_forest_params)
best_forest.fit(X_train, y_train)

# Export the first tree of the forest as DOT source. The feature names are
# passed as a plain list because some scikit-learn releases reject a pandas Index.
rfr_file = export_graphviz(
    best_forest.estimators_[0],
    feature_names=list(X.columns),
    filled=True,
    rounded=True,
    out_file=None,
)
rfr_graph = pydotplus.graph_from_dot_data(rfr_file)
Image(rfr_graph.create_png())

# for saving a png
# rfr_graph.write_png("random_forest_regressor.png")