In [150]:
import pandas as pd
from sklearn.model_selection import train_test_split
X_full = pd.read_csv("./home-data-for-ml-course/train.csv", index_col="Id")
X_test_full = pd.read_csv("./home-data-for-ml-course/test.csv", index_col="Id")

# remove rows with no y values
X_full.dropna(axis=0,subset=['SalePrice'],inplace=True)
y = X_full['SalePrice']
# drop SalePrice column from predictors
X_full.drop(['SalePrice'],axis=1,inplace=True)

# keep only numerical
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

# split training data set
X_train, X_valid, y_train, y_valid = train_test_split(X,y,
                                                    train_size=0.8, test_size=0.2,
                                                    random_state=0)


In [151]:
print(X_train.shape)
X_train.describe()

(1168, 36)


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1168.0,956.0,1168.0,1168.0,1168.0,1168.0,1168.0,1162.0,1168.0,1168.0,...,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0
mean,56.605308,69.614017,10589.672945,6.086473,5.572774,1970.890411,1984.692637,103.481067,439.890411,45.571918,...,473.632705,94.498288,48.044521,23.02226,3.218322,14.528253,2.118151,50.936644,6.30137,2007.819349
std,42.172322,22.946069,10704.180793,1.367472,1.116908,30.407486,20.684612,182.676225,435.106803,156.229962,...,209.44232,127.312017,68.619199,63.153093,27.916593,54.009608,36.482294,550.380636,2.725977,1.335971
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,20.0,59.0,7589.5,5.0,5.0,1953.75,1966.0,0.0,0.0,0.0,...,336.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0
50%,50.0,69.0,9512.5,6.0,5.0,1972.0,1993.0,0.0,379.5,0.0,...,477.5,0.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,167.75,716.0,0.0,...,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,2260.0,1120.0,...,1390.0,736.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0


In [152]:
# Number of missing values in each column of training data
missing_val_count_by_column = (X_train.isnull().sum())
print(type(missing_val_count_by_column))
print(missing_val_count_by_column[missing_val_count_by_column > 0])

<class 'pandas.core.series.Series'>
LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64


In [153]:
num_cols_with_missing_val = (X_train.isnull().sum() > 0).sum()
num_cols_with_missing_val 

3

In [154]:
import numpy as np
test_series = pd.Series([0,np.nan,None,"",1])
print(test_series)
print("*"*10)
print(test_series.isnull())
print("*"*10)
print(test_series.isna())

0       0
1     NaN
2    None
3        
4       1
dtype: object
**********
0    False
1     True
2     True
3    False
4    False
dtype: bool
**********
0    False
1     True
2     True
3    False
4    False
dtype: bool


In [155]:
tot_missing = X_train.isnull().sum().sum()
tot_missing

276

In [156]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid,y_train,y_valid):
    model = RandomForestRegressor(n_estimators=100,random_state=0)
    model.fit(X_train,y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

### Test dropping columns with missing values

In [157]:
# test drop columns with missing values
cols_with_missing_vals = [col for col in X_train.columns
                          if X_train[col].isnull().any()]

X_train_drop_cols = X_train.drop(cols_with_missing_vals, axis=1)
X_valid_drop_cols = X_valid.drop(cols_with_missing_vals, axis=1)

print(X_train_drop_cols.shape, y_train.shape)
print("MAE (Drop columns with missing values):")
print(score_dataset(X_train_drop_cols, X_valid_drop_cols, y_train, y_valid))


(1168, 33) (1168,)
MAE (Drop columns with missing values):
17837.82570776256


### Test dropping rows with missing values

In [158]:
def drop_rows_with_missing_vals(X_train,X_valid, y_train,y_valid):
    # which rows have missing values
    X_train_rows_to_drop = X_train[X_train.isnull().any(axis=1)].index.values
    X_valid_rows_to_drop = X_valid[X_valid.isnull().any(axis=1)].index.values

    # which rows to keep
    training_rows_to_keep = [r for r in X_train.index.values
                            if r not in X_train_rows_to_drop]
    validation_rows_to_keep = [r for r in X_valid.index.values
                                if r not in X_valid_rows_to_drop]

    # remove rows form features and predictors
    final_x_train = X_train.loc[training_rows_to_keep]
    final_y_train = y_train.loc[training_rows_to_keep]

    final_x_valid = X_valid.loc[validation_rows_to_keep]
    final_y_valid = y_valid.loc[validation_rows_to_keep]
    return final_x_train, final_x_valid, final_y_train,final_y_valid

In [159]:
print("MAE (Drop rows with missing values):")
X_train_drop_rows, X_valid_drop_rows, y_train_drop_rows,y_valid_drop_rows  = drop_rows_with_missing_vals(
    X_train,X_valid,y_train,y_valid
)
print(score_dataset(X_train_drop_rows, X_valid_drop_rows, y_train_drop_rows, y_valid_drop_rows))

MAE (Drop rows with missing values):
19872.120358744392


### Test imputing values

In [160]:
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Fill in the lines below: imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

print(imputed_X_train.shape, y_train.shape,imputed_X_valid.shape, y_valid.shape)

(1168, 36) (1168,) (292, 36) (292,)


In [161]:
print("MAE (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

MAE (Imputation):
18062.894611872147


### Generate Test Predictions

- preprocess training and validation features (deal with missing values)
- train and evaluate random forest model
- preprocess test data and generate predictions


In [162]:
imputer = SimpleImputer(strategy='mean')
final_X_train = pd.DataFrame(imputer.fit_transform(X_train.copy()))
final_X_valid = pd.DataFrame(imputer.fit_transform(X_valid.copy()))

# Imputation removed column names; put them back
final_X_train.columns = X_train.columns
final_X_valid.columns = X_valid.columns

print(final_X_train.shape, final_X_valid.shape,y_train.shape,y_valid.shape)



(1168, 36) (292, 36) (1168,) (292,)


In [163]:
print(final_X_train.isnull().sum(axis=0).sum())
print(final_X_valid.isnull().sum(axis=0).sum())
print(y_train.isnull().sum())
print(y_valid.isnull().sum())

0
0
0
0


In [164]:
final_model = RandomForestRegressor(n_estimators=100,random_state=0)
final_model.fit(final_X_train,y_train)
predictions =  final_model.predict(final_X_valid)
print(mean_absolute_error(y_valid, predictions))

18056.85163242009


In [165]:
#preprocess test data
X_test_copy = X_test.copy()
# Fill in the line below: preprocess test data
final_X_test = pd.DataFrame(imputer.fit_transform(X_test_copy))
final_X_test.columns = X_test_copy.columns
print(final_X_test.isnull().sum().sum())
print(final_X_test.shape, X_test.shape)

0
(1459, 36) (1459, 36)


In [166]:
# Get model predictions on test data ...
final_predictions = final_model.predict(final_X_test)
print(type(final_predictions))
final_predictions

<class 'numpy.ndarray'>


array([125245.5 , 155237.  , 180755.22, ..., 154283.87, 107723.5 ,
       228591.59])

In [167]:
# Save test predictions to file
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': final_predictions})
output.to_csv('submission-dealt-with-missing-vals.csv', index=False)