In [1]:
# The purpose of this file is to reflect the pre-processing steps that need to be
# applied to the test data set to confirm it to the data state that the linear model expects.
import janitor as jn
import pandas as pd
%pylab inline
from joblib import load
from sklearn.linear_model import LinearRegression

Populating the interactive namespace from numpy and matplotlib


In [2]:
def import_and_subset_cols(test_data_file, column_list):
    try:
        initial_df = pd.read_csv(filepath_or_buffer=test_data_file, usecols=column_list, index_col=0)
    except OSError as io_error:
        raise("You have provided a file path reference that doesn't exist. Please check the file path.")

    if not column_list:
        raise("You gave me an empty list you Pyscho :-).")

    return jn.clean_names(initial_df) # makes sure that the column names are lower case 

In [3]:
def feature_creation(cleaned_df):
    # adjust basement sqft to cap effect of outliers that I have high basement sf but that do not appear to be 
    # reflected in an increase in price 
    cleaned_df["totalbsmtsf"] = cleaned_df["totalbsmtsf"].apply(lambda x: 3000 if x > 3000 else x)

    # Combined square footage metrics for building sf and outside sf (e.g., porch space)
    bldg_sqft = cleaned_df[["totalbsmtsf", "1stflrsf", "2ndflrsf"]].sum(axis = "columns")
    outside_sf = cleaned_df[["wooddecksf", "openporchsf", "3ssnporch", "screenporch", "enclosedporch"]].sum(axis="columns")
    lot_sf = cleaned_df["lotarea"] - cleaned_df['1stflrsf'] - outside_sf
    lot_sf = lot_sf.apply(lambda x: 12500 if x > 12500 else x)

    # Combining above-basement and basement baths
    total_baths = cleaned_df["fullbath"].fillna(0)  + cleaned_df["halfbath"].fillna(0)/ 2 + cleaned_df["bsmtfullbath"].fillna(0) \
        + cleaned_df["bsmthalfbath"].fillna(0) / 2

    # Various Dummifications
    # 0=1 flat if the building type is a single family home
    sgl_famly_hm = cleaned_df["bldgtype"].apply(lambda x: 0 if x == '1Fam' else 1)

    # top 3 neighborhoods based upon median home price sales and general spread of prices based upon boxplot
    top_3_nbrhd = cleaned_df["neighborhood"].isin(["NridgHt", "NoRidge", "StoneBr"]).map({False: 0, True: 1})

    # bottom 5 neighborhoods based upon median home and boxplot inspection
    btm_5_nbrhd = cleaned_df["neighborhood"].isin(["MeadowV", "IDOTRR", "BrDale", "OldTown", "Edwards"]).map({False: 0, True: 1})

    # Fireplaces that are Excellent, Good or Typical/TA 
    good_frplc = cleaned_df["fireplacequ"].isin(["Ex", "Gd", "TA"]).map({False: 0, True: 1})

    # remodel age was general found to be more individually correlated with SalePrice than homeage than total home age
    remodelage = cleaned_df["yrsold"] - cleaned_df["yearremodadd"]
    remodelage = remodelage.apply(lambda x: 0 if abs(x) < 5 else x - 5)

    # Various measures where higher amenity ratings that were associated with higher home prices
    # these are being combined into a single "positive amentities count" feature 

    # 1 Excellent Heating (important for a cold place :-))
    excl_heating = cleaned_df["heatingqc"].isin(["Ex"]).map({False: 0, True: 1})

    # 2 basement has GLQ (Good Living Quarter) in either 
    bsmt_gd_lvg = (cleaned_df["bsmtfintype1"].isin(["GLQ"]) | cleaned_df["bsmtfintype2"].isin(["GLQ"])).map({False: 0, True:1})

    # 3 Good, Excellent and "Typical"/TA fireplaces
    good_frplc = cleaned_df["fireplacequ"].isin(["Ex", "Gd", "TA"]).map({False: 0, True: 1})

    # 4 Good and Excellent Kitchens being combined together 
    ktch_groups = cleaned_df["kitchenqual"].map({"TA": "ktch_okay", "Fa": "ktch_okay", "Gd": "ktch_good", "Ex": "kitch_topnotch"})
    ktch_dummies = pd.get_dummies(data=ktch_groups).drop("ktch_okay", axis="columns")

    # 5 Excellent Basement Quality
    excl_bsmt = cleaned_df["bsmtqual"].isin(["Ex"]).map({False: 0, True:1})
    excl_bsmt.sum()

    good_ament_ct = pd.concat([excl_heating, excl_bsmt, ktch_dummies, bsmt_gd_lvg, good_frplc], axis = "columns").sum(axis="columns")

    # Various measures where lower amenity ratings were associated with lower higher prices (versus the average/highly rated) 
    # these are being combined into a single "negative amentities count" feature

    # 1 No fireplace
    no_fireplace = cleaned_df["fireplaces"] == 0
    no_fireplace = no_fireplace.map({False: 0, True: 1})

    # 2 No Central AC
    no_central_ac = cleaned_df['centralair'].isin(['N']).map({False: 0, True:1}) 

    # Electirical aside from standard circuitbreaker
    bad_electrical = cleaned_df['electrical'].isin(['Mix', 'FuseP', 'FuseF', 'FuseA']).map({False: 0, True:1})

    bad_ament_ct = pd.concat([no_central_ac, no_fireplace, bad_electrical], axis="columns").sum(axis="columns")

    # remaining features

    #1 credit for having garage space for two or more cars
    two_plus_cr_garg = cleaned_df["garagecars"].apply(lambda x: 1 if x >= 2 else 0)

    # houses with a 4 or less overall condition showed on average lower sale price then
    # houses with an overall condition rating of 5 or higher 
    neg_ovrll_cond = cleaned_df["overallcond"].apply(lambda x: 1 if x <= 4 else 0)

    adj_lot_area = lot_sf.apply(lambda x: 25000 if x > 25000 else x)

    abnormal_sale = (cleaned_df["salecondition"] == "Abnorml").map({False: 0, True: 1}).fillna(0)

    adj_ovr_qual = cleaned_df["overallqual"].apply(lambda x: 0 if x <=3 else x - 3)


    # bringing it all together

    list_of_features = [
                        abnormal_sale, adj_lot_area, adj_ovr_qual, bad_ament_ct, bldg_sqft, btm_5_nbrhd, 
                        good_ament_ct, neg_ovrll_cond, outside_sf, remodelage, sgl_famly_hm, top_3_nbrhd,
                        total_baths, two_plus_cr_garg,
                        ]

    features_df = pd.concat(list_of_features, axis="columns")


    features_df.columns = [
                            'abnormal_sale', 'adj_lot_area', 'adj_ovr_qual', 'bad_ament_ct', 'bldg_sqft', 'btm_5_nbrhd', 
                           'good_ament_ct', 'neg_ovrll_cond', 'outside_sf', 'remodelage', 'sgl_famly_hm', 'top_3_nbrhd',
                           'total_baths', 'two_plus_cr_garg',
                           ]

    return features_df

In [4]:
#test_data_file_path = r".\data\test.csv"
test_data_file_path = r"C:\Users\jason\OneDrive\Documents\Jason\NYC Data Science Academy\projects\machine_learning\ghub_work_area\data\test.csv"
# model_file_path = r".\model_files\lr_log_model.joblib"
model_file_path = r"C:\Users\jason\OneDrive\Documents\Jason\NYC Data Science Academy\projects\machine_learning\ghub_work_area\model_files\lr_log_model.joblib"

import_list = ['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Alley','LotShape', 'Neighborhood', 'Condition1',
              'Condition2', 'BldgType', 'HouseStyle','OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
              'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
              'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2','BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'CentralAir',
              'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
              'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces',
              'FireplaceQu', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch','ScreenPorch', 'YrSold',
              'SaleType', 'SaleCondition','Electrical',"HeatingQC","Fireplaces", "FireplaceQu", "BsmtQual", "BsmtFinType1",
              "BsmtFinType2", 'LotFrontage', 'LotArea', 'GarageCars', 'OverallCond',]


imported_df = import_and_subset_cols(test_data_file=test_data_file_path, column_list=import_list)
features_df = feature_creation(imported_df)

lr_model = load(model_file_path)

log_saleprice_predict = lr_model.predict(features_df)
log_saleprice_predict

array([11.6423242 , 11.94902678, 12.0749711 , ..., 11.96465725,
       11.74963601, 12.27604616])

In [5]:
predict_df = pd.DataFrame(
    np.exp(log_saleprice_predict),
    index=imported_df.index 
)

predict_df = predict_df.reset_index()

In [7]:
output_filepath = r"../predictions/linear_predictions.csv"
# output_filepath = r"C:\Users\jason\OneDrive\Documents\Jason\NYC Data Science Academy\projects\machine_learning\ghub_work_area\predictions\linear_predictions.csv"
predict_df.columns = ["Id", "SalePrice"]
predict_df.to_csv(output_filepath,index=False)

In [8]:
predict_df.shape


(1459, 2)

In [9]:
features_df.index



Int64Index([1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470,
            ...
            2910, 2911, 2912, 2913, 2914, 2915, 2916, 2917, 2918, 2919],
           dtype='int64', name='Id', length=1459)

In [10]:
imported_df.loc[2550, ] 

mssubclass            20
mszoning              RL
lotfrontage          128
lotarea            39290
alley                NaN
lotshape             IR1
neighborhood     Edwards
condition1          Norm
condition2          Norm
bldgtype            1Fam
housestyle        1Story
overallqual           10
overallcond            5
yearbuilt           2008
yearremodadd        2009
exterior1st      CemntBd
exterior2nd      CmentBd
masvnrtype         Stone
masvnrarea          1224
bsmtqual              Ex
bsmtcond              TA
bsmtexposure          Gd
bsmtfintype1         GLQ
bsmtfinsf1          4010
bsmtfintype2         Unf
bsmtfinsf2             0
bsmtunfsf           1085
totalbsmtsf         3000
heatingqc             Ex
centralair             Y
electrical         SBrkr
1stflrsf            5095
2ndflrsf               0
lowqualfinsf           0
grlivarea           5095
bsmtfullbath           1
bsmthalfbath           1
fullbath               2
halfbath               1
bedroomabvgr           2


In [11]:
features_df.loc[2550, ]


abnormal_sale           0.0
adj_lot_area        12500.0
adj_ovr_qual            7.0
bad_ament_ct            0.0
bldg_sqft            8095.0
btm_5_nbrhd             1.0
good_ament_ct           5.0
neg_ovrll_cond          0.0
outside_sf           1030.0
remodelage              0.0
sgl_famly_hm            0.0
top_3_nbrhd             0.0
total_baths             4.0
two_plus_cr_garg        1.0
Name: 2550, dtype: float64

In [12]:
imported_df.iloc[0, :]


mssubclass            20
mszoning              RH
lotfrontage           80
lotarea            11622
alley                NaN
lotshape             Reg
neighborhood       NAmes
condition1         Feedr
condition2          Norm
bldgtype            1Fam
housestyle        1Story
overallqual            5
overallcond            6
yearbuilt           1961
yearremodadd        1961
exterior1st      VinylSd
exterior2nd      VinylSd
masvnrtype          None
masvnrarea             0
bsmtqual              TA
bsmtcond              TA
bsmtexposure          No
bsmtfintype1         Rec
bsmtfinsf1           468
bsmtfintype2         LwQ
bsmtfinsf2           144
bsmtunfsf            270
totalbsmtsf          882
heatingqc             TA
centralair             Y
electrical         SBrkr
1stflrsf             896
2ndflrsf               0
lowqualfinsf           0
grlivarea            896
bsmtfullbath           0
bsmthalfbath           0
fullbath               1
halfbath               0
bedroomabvgr           2


In [13]:
predict_df["SalePrice"].plot.hist()


<matplotlib.axes._subplots.AxesSubplot at 0x1f8768316c8>

In [14]:
features_df["bldg_sqft"].plot.hist()

<matplotlib.axes._subplots.AxesSubplot at 0x1f8768316c8>