In [1]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

 
# Load the training and test tables of the house-prices dataset from the input directory.
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [2]:
# Goal: predict the sale price of each house (the SalePrice column).

# Summarise every column's dtype and non-null count to see where data is missing.
train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [3]:
# Preview the first five rows of the training data.
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


**Part 1: Feature Engineering**

In [4]:
# Drop features whose values are missing in more than half of the training rows.
missing_counts = train.isnull().sum()
half_of_rows = train.shape[0] / 2
majority_missing = missing_counts[missing_counts > half_of_rows].index.tolist()

# The same columns are removed from both frames so their feature sets stay aligned.
reduced_train = train.drop(columns=majority_missing)
reduced_test = test.drop(columns=majority_missing)



In [5]:
# Per-column missing-value counts for the reduced test set, largest first.
test_nan = reduced_test.isnull().sum().sort_values(ascending = False) 
# Number of test columns that still contain at least one missing value.
len(test_nan[test_nan>=1])


29

In [6]:
# Stack train and test into a single frame so missing values can be handled for both at once.
# NOTE(review): any statistic computed on df_all afterwards also sees the test rows — confirm this is acceptable.

df_all = pd.concat([reduced_train,reduced_test ],ignore_index=True)
df_all.shape, reduced_train.shape, reduced_test.shape # NOTE: the first 1460 rows come from the training data, the last 1459 from the test data

((2919, 77), (1460, 77), (1459, 76))

In [7]:
# Plan: fill missing categorical values with the column mode and missing numerical values with the column mean.
# First look at the dtypes that distinguish the two kinds of column.
df_all["Electrical"].dtype, df_all["SalePrice"].dtype # categorical (string) columns have dtype('O'); SalePrice shows as dtype('float64') here

(dtype('O'), dtype('float64'))

In [8]:
# Missing-value count per column of the combined frame; show only the columns that have any.
has_null = df_all.isnull().sum()
has_null[has_null>0]

MSZoning           4
LotFrontage      486
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        24
MasVnrArea        23
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinSF1         1
BsmtFinType2      80
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
Electrical         1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageYrBlt      159
GarageFinish     159
GarageCars         1
GarageArea         1
GarageQual       159
GarageCond       159
SaleType           1
SalePrice       1459
dtype: int64

In [9]:
# Names of the categorical columns (same dtype as "Electrical") that contain missing values.
str_missing = [
    name
    for name in df_all
    if df_all[name].dtype == df_all["Electrical"].dtype
    and df_all[name].isnull().sum() > 0
]
str_missing

['MSZoning',
 'Utilities',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'SaleType']

In [10]:
# According to the data description, NA in FireplaceQu means the house does not have that feature,
# so it is encoded as its own 'None' category rather than imputed.
df_all['FireplaceQu'] = df_all['FireplaceQu'].fillna('None')

In [11]:
 # Fill the missing values of every categorical (object-dtype) column with that column's most frequent value (mode).

for col in df_all.columns:
    # Only columns sharing the dtype of "Electrical" are handled in this pass.
    if df_all[col].dtype != df_all["Electrical"].dtype:
        continue
    # Columns without gaps are left untouched.
    if df_all[col].isnull().sum() == 0:
        continue
    most_frequent = df_all[col].mode()[0]
    df_all[col] = df_all[col].fillna(most_frequent)
            


In [12]:

# Fill gaps in the columns that share GarageArea's dtype with the column mean.
# SalePrice is skipped: it is the target and is missing for every test row.
numeric_dtype = df_all['GarageArea'].dtype
for col in df_all.columns:
    if col == 'SalePrice' or df_all[col].dtype != numeric_dtype:
        continue
    if df_all[col].isnull().sum() > 0:
        df_all[col] = df_all[col].fillna(df_all[col].mean())
            

In [13]:
def categorical(data):
    """Return the names of the categorical (object-dtype) columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list
        Column labels, in column order, whose dtype is ``object``.
    """
    # Inspect `data` itself rather than the notebook-global df_all, so the
    # helper works on any frame and does not require an "Electrical" column.
    return [col for col in data if data[col].dtype == object]

In [14]:
# Now handle the categorical variables: one-hot encode every object-dtype column.
# drop_first=True drops the first level of each variable, leaving k-1 indicator columns for k levels.
cat_var = categorical(df_all)
df_all = pd.get_dummies(df_all, columns = cat_var, drop_first = True)
        

In [15]:
# df_all was built as concat([reduced_train, reduced_test]), so its first
# len(reduced_train) rows are the training data and the remaining rows are the test data.
# Deriving the boundary from the frame avoids a hard-coded row count.
n_train = len(reduced_train)
train_final = df_all.iloc[:n_train, :]
test_final = df_all.iloc[n_train:, :]
# The test data originally had no SalePrice column; it is the value being estimated.
test_final = test_final.drop('SalePrice', axis=1)


In [16]:
# Check that no column in either final frame still has missing values (every count should be 0).
train_final.isnull().sum().sort_values(ascending = True), test_final.isnull().sum().sort_values(ascending = True)

(Id                       0
 ExterQual_Gd             0
 ExterQual_TA             0
 ExterCond_Fa             0
 ExterCond_Gd             0
                         ..
 Condition1_RRAe          0
 Condition1_RRAn          0
 Condition1_RRNe          0
 Neighborhood_Sawyer      0
 SaleCondition_Partial    0
 Length: 239, dtype: int64,
 Id                       0
 ExterQual_TA             0
 ExterCond_Fa             0
 ExterCond_Gd             0
 ExterCond_Po             0
                         ..
 Condition1_RRAn          0
 Condition1_RRNe          0
 Condition1_RRNn          0
 Neighborhood_SawyerW     0
 SaleCondition_Partial    0
 Length: 238, dtype: int64)

**Part 2: Fit the model** 

In [17]:
#from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb


In [18]:
# Separate predictors (X) from the target (Y) and hold out part of the labelled rows for evaluation.
# NOTE(review): only SalePrice is dropped, so the Id column remains among the predictors — confirm this is intended.
X = train_final.drop(columns='SalePrice')
Y = train_final['SalePrice']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

# XGBoost regressor with an explicit learning rate; all other hyperparameters keep their defaults.
model_xgb = xgb.XGBRegressor(learning_rate=0.11)
model_xgb.fit(X_train, Y_train)


XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.11, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)

In [19]:
# Score the fitted model on the held-out split using R^2.
xgb_pred = model_xgb.predict(X_test)
r2_score(Y_test, xgb_pred)

0.8626743872704462

In [20]:

# Use the fitted model to predict SalePrice for the prepared test rows.
predicted_prices = model_xgb.predict(test_final)
# Print the predictions as a quick sanity check that the values look sensible.
print(predicted_prices)

[125772.65 153846.16 186881.25 ... 162324.19 109705.82 231796.45]


In [21]:
# Pair each Id from the original test file with its predicted SalePrice.
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predicted_prices})
# Any filename would work; 'submission.csv' is used here. index=False keeps the row index out of the file.
my_submission.to_csv('submission.csv', index=False)