In [1]:
import numpy as np
import pandas as pd
import scipy as stats
from scipy.stats import skew
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.metrics import r2_score

In [2]:
ames = pd.read_csv('./datasets/train.csv')
test = pd.read_csv('./datasets/test.csv')

ames.columns = ames.columns.str.lower().str.replace(" ","_")
test.columns = test.columns.str.lower().str.replace(" ","_")

### Perform all transformations on the test data that you did to clean training data

In [3]:
ames.shape, test.shape

((2051, 81), (878, 80))

In [4]:
test.mas_vnr_type.value_counts()

None       534
BrkFace    250
Stone       80
BrkCmn      12
CBlock       1
Name: mas_vnr_type, dtype: int64

In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               878 non-null    int64  
 1   pid              878 non-null    int64  
 2   ms_subclass      878 non-null    int64  
 3   ms_zoning        878 non-null    object 
 4   lot_frontage     718 non-null    float64
 5   lot_area         878 non-null    int64  
 6   street           878 non-null    object 
 7   alley            58 non-null     object 
 8   lot_shape        878 non-null    object 
 9   land_contour     878 non-null    object 
 10  utilities        878 non-null    object 
 11  lot_config       878 non-null    object 
 12  land_slope       878 non-null    object 
 13  neighborhood     878 non-null    object 
 14  condition_1      878 non-null    object 
 15  condition_2      878 non-null    object 
 16  bldg_type        878 non-null    object 
 17  house_style     

In [6]:
#cols needing transform from (ordinal) object to numeric
nans_to_zeros = ['bsmt_qual','bsmt_cond', 'fireplace_qu','garage_qual', 'garage_cond',
                 'kitchen_qual', 'exter_qual','exter_cond','heating_qc','garage_finish', 
                 'bsmt_exposure','bsmtfin_type_1','bsmtfin_type_2', 'pool_qc', 'fence']

nans_as_nans = ['lot_shape','utilities','land_slope', 'electrical','functional']

In [7]:
to_num_masterdict={'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5, #nans_to_zeros cols.
'Unf':1, 'RFn':2, 'Fin':3, #garage_finish
'MnWw':1, 'GdWo':2, 'MnPrv':3, 'GdPrv':4, #fence
'No':1, 'Mn':2, 'Av':3, 'Gd':4,#bsmt_exposure
'Unf':1, 'LwQ':2,'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6, #bsmtfin_type_1 and 2
'IR3':1, 'IR2':2, 'IR1':3, 'Reg':4, #lot_shape
'ELO':1, 'NoSeWa':2, 'NoSewr':3, 'Allpub':4, #utilities
'Sev':1, 'Mod':2, 'Gtl':3, #land_slope
'Sal':0, 'Sev':1, 'Maj2':2, 'Maj1':3, 'Mod':4, 'Min2':5, 'Min1':6, 'Typ':7, #functional
'Mix':1, 'FuseP':2, 'FuseF':3, 'FuseA':4, 'SBrkr':5} #electrical

#### Reclassify: ordinals to numeric, some numerics to categorical, some to binary, nans to 0s

In [8]:
#conversion function
def to_nums_w_zeros(df, targetcols):
    for i in targetcols:
        df[i] = df[i].map(to_num_masterdict)
        df[i] = df[i].fillna(0)
        #print(df[targetcols].value_counts()) #this line to check work, from source1

In [9]:
to_nums_w_zeros(ames, nans_to_zeros)
to_nums_w_zeros(test, nans_to_zeros)

In [10]:
to_nums_w_zeros(ames, nans_as_nans)
to_nums_w_zeros(test, nans_as_nans)

In [11]:
to_obj_masterdict={20:'SC20', 30:'SC30', 40:'SC40', 45:'SC45', 50:'SC50', 60:'SC60',
                 70:'SC70', 75:'SC75', 80:'SC80', 85:'SC85', 90:'SC90', 120:'SC120',
                  150:'SC150', 160:'SC160', 180:'SC180', 190:'SC190', #ms_subclass
                  1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul',
                  8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'} #months

In [12]:
should_be_objs= ['ms_subclass','mo_sold']
def num_to_obj(df, targetcols):
    for i in targetcols:
        df[i] = df[i].map(to_obj_masterdict)

In [13]:
num_to_obj(ames, should_be_objs)
num_to_obj(test, should_be_objs)

In [14]:
def nans2none_nominal(df, targetcols):
    for i in targetcols:
        df[i] = df[i].fillna('None')

In [15]:
nominal_nans = ['alley','misc_feature','garage_type','mas_vnr_type']
nans2none_nominal(ames, nominal_nans)
nans2none_nominal(test, nominal_nans)

In [16]:
fill0_continuous_vars = ['mas_vnr_area','bsmt_half_bath','bsmt_full_bath','bsmtfin_sf_1',
                         'bsmtfin_sf_2','total_bsmt_sf','bsmt_unf_sf','garage_cars','garage_area']

def discrete_fillnas(df, targetcols):
    for i in targetcols:
        df[i] = df[i].fillna(0)

In [17]:
discrete_fillnas(ames, fill0_continuous_vars)
discrete_fillnas(test, fill0_continuous_vars)

In [18]:
ames['central_air'] = ames['central_air'].map({'Y':1, 'N':0})
ames['paved_drive'] = ames['paved_drive'].map({'Y':1, 'P':1, 'N':0})
ames['mas_vnr_type'] = ames['mas_vnr_type'].map({'BrkFace':1, 'Stone':1, 'BrkCmn':1,
                                                'CBlock':1, 'None':0})

test['central_air'] = test['central_air'].map({'Y':1, 'N':0})
test['paved_drive'] = test['paved_drive'].map({'Y':1, 'P':1, 'N':0})
test['mas_vnr_type'] = test['mas_vnr_type'].map({'BrkFace':1, 'Stone':1, 'BrkCmn':1,
                                                'CBlock':1, 'None':0})

#### Drop columns

In [19]:
cols_to_drop=['pid','alley','misc_feature','misc_val','garage_yr_blt','lot_frontage',
              'fence', 'pool_qc','pool_area','mas_vnr_area','bsmtfin_type_2','bsmt_unf_sf',
              'fireplaces','garage_cars','garage_cond','3ssn_porch','screen_porch']

In [20]:
ames.drop(columns=cols_to_drop,inplace=True)
test.drop(columns=cols_to_drop,inplace=True)

#### Log transform skewed numerical vars

In [21]:
#log transform saleprice for ames only
ames['saleprice'] = np.log1p(ames['saleprice'])

In [22]:
#log transform all numerical vars with skew >.5
#no need to drop saleprice in test (already DNE)
numerical = ames.select_dtypes(exclude = ["object"]).columns
categorical = ames.select_dtypes(include = ["object"]).columns
ames_cats= ames[categorical]

In [23]:
ames_nums = ames[numerical].drop(columns=['saleprice'])

In [24]:
t_numerical = test.select_dtypes(exclude = ["object"]).columns
t_categorical = test.select_dtypes(include = ["object"]).columns
test_nums= test[t_numerical]
test_cats= test[t_categorical]

In [25]:
#find skewed numeric features.  Code from source 3, and 4
#"general rule of thumb, abv(skew)>.5 is 'moderately skewed'" - source 3
allskew = ames_nums.apply(lambda x: skew(x))
allskew = allskew[abs(allskew)>0.5]

t_allskew = test_nums.apply(lambda x: skew(x))
t_allskew = t_allskew[abs(t_allskew)>0.5] 

In [26]:
allskew_features = allskew.index #source 3
ames_nums[allskew_features] = np.log1p(ames_nums[allskew_features])

In [27]:
t_allskew_features = t_allskew.index
test_nums[t_allskew_features] = np.log1p(test_nums[t_allskew_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [41]:
test_nums.head()

Unnamed: 0,id,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_type,...,functional,fireplace_qu,garage_finish,garage_area,garage_qual,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,yr_sold
0,2658,9.120744,1.609438,0.0,1.386294,6,8,7.555382,1950,0,...,2.079442,0.0,1.0,440,0.693147,0.693147,0.0,4.110874,4.727388,2006
1,2718,9.176059,1.386294,0.0,1.386294,5,4,7.589842,1977,0,...,2.079442,0.0,3.0,580,1.386294,0.693147,5.141664,0.0,0.0,2006
2,2414,9.747126,1.386294,0.0,1.386294,7,5,7.604396,2006,0,...,2.079442,4.0,2.0,426,1.386294,0.693147,4.615121,3.218876,0.0,2006
3,1989,9.050289,1.609438,0.0,1.386294,5,6,7.562162,2006,0,...,2.079442,0.0,1.0,480,1.098612,0.0,0.0,0.0,5.220356,2007
4,625,9.159152,1.386294,0.0,1.386294,6,5,7.582738,1963,1,...,2.079442,4.0,2.0,514,1.386294,0.693147,0.0,4.343805,0.0,2009


#### Dummify

In [28]:
ames_cats = pd.get_dummies(ames_cats, drop_first=True)
test_cats = pd.get_dummies(test_cats, drop_first=True)

In [29]:
train_log_dum = pd.concat([ames_nums, ames_cats, ames['saleprice']], axis=1)
test_log_dum = pd.concat([test_nums, test_cats], axis=1)

### FIT: X = vars with > .3 correlation

In [30]:
train_corr = train_log_dum.corr()
cols_corrs_over3 = train_corr['saleprice'].sort_values(ascending = False).loc[train_corr['saleprice']>.3].drop('saleprice').index

y = train_log_dum['saleprice']
X = train_log_dum[cols_corrs_over3]
X_test_sub = test_log_dum[cols_corrs_over3]

In [31]:
lr = LinearRegression()

In [32]:
lr.fit(X,y)

LinearRegression()

In [33]:
test_preds = lr.predict(X_test_sub)

In [34]:
unlog_predictions = np.expm1(test_preds)

In [35]:
kaggle_submission = {
    'Id' : test['id'],
    'SalePrice': unlog_predictions
}
kaggle_submission= pd.DataFrame(kaggle_submission)

In [36]:
kaggle_submission

Unnamed: 0,Id,SalePrice
0,2658,1.149955e+06
1,2718,9.639579e+06
2,2414,1.194941e+06
3,1989,8.437692e+05
4,625,3.359335e+06
...,...,...
873,1662,1.760065e+06
874,1234,2.580141e+06
875,1373,1.254005e+06
876,1672,7.198005e+05


In [37]:
np.expm1(train_log_dum.saleprice.mean())

166774.4350451321

In [38]:
kaggle_submission.SalePrice.mean()

3623311.457420508

In [39]:
#kaggle_submission1.to_csv('./datasets/kag_submission1_corr_over_3.csv', index=False)