In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


**Guojing Wu** *| 2019-07-25*

<a href = "https://www.kaggle.com/learn/intermediate-machine-learning"> Kaggle: Intermediate Machine Learning </a>

# Objectives

* missing value

* pipelines

* CV

* XGBoost

* leakage

# Missing value (numerical column only)

## drop columns

DataFrame.drop()

## imputation

Fill in with (mean) value

## extension to imputation

Sometimes rows with missing value may be unique, so instead of simply doing imputation, we also add a column indicate whether there is missing value or not

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split

# read the data
X_full = pd.read_csv('drive/My Drive/self_ML/Intermediate ML/train.csv', index_col='Id')
X_test_full = pd.read_csv('drive/My Drive/self_ML/Intermediate ML/test.csv', index_col='Id')

In [0]:
# remove column without outcome
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(axis=1, labels=['SalePrice'], inplace=True)

In [0]:
# to make things simple, I'm only gonna use numerical data
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

In [0]:
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [32]:
X_train.shape

(1168, 36)

In [33]:
X_train_missing = X_train.isnull().sum()
X_train_missing[X_train_missing > 0]

LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64

Based on the above, we could tell that these three columns have few missing, it would be rediculous to just remove the entire column, so we use imputation.

In [0]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# just a function use simple RF and return the MAE
def score_function(X_train, X_val, y_train, y_val):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return mean_absolute_error(preds, y_val)

In [35]:
# simply just drop columns
cols_missing = [cols for cols in X_train.columns
                if X_train[cols].isnull().any()]
reduced_X_train = X_train.drop(cols_missing, axis=1)
reduced_X_val = X_val.drop(cols_missing, axis=1)

score_function(reduced_X_train, reduced_X_val, y_train, y_val)

17837.82570776256

In [36]:
# imputation
from sklearn.impute import SimpleImputer
# mean here performs worse than drop, so use median
my_imputer = SimpleImputer(strategy='median')
# fit and transform
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
# just transform
imputed_X_val = pd.DataFrame(my_imputer.transform(X_val))
# add column name back
imputed_X_train.columns = X_train.columns
imputed_X_val.columns = X_val.columns

score_function(imputed_X_train, imputed_X_val, y_train, y_val)

17791.59899543379

In [37]:
# some other imputation
final_imputer = SimpleImputer(strategy='median', add_indicator=True)
final_X_train = pd.DataFrame(final_imputer.fit_transform(X_train))
final_X_val = pd.DataFrame(final_imputer.transform(X_val))
final_cols_name = list(X_train.columns) + [names + '_ind' for names in cols_missing]
final_X_train.columns = final_cols_name
final_X_val.columns = final_cols_name
score_function(final_X_train, final_X_val, y_train, y_val)

18063.910194063923

In [0]:
# get a final version for submit
final_model = RandomForestRegressor(n_estimators=100, random_state=0)
final_model.fit(imputed_X_train, y_train)
final_X_test = pd.DataFrame(my_imputer.transform(X_test))
final_X_test.columns = X_test.columns
preds_test = final_model.predict(final_X_test)

In [0]:
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
output.to_csv('drive/My Drive/self_ML/Intermediate ML/submission.csv', index=False)

# categorical columns

## drop column

In [0]:
# Read the data
X = pd.read_csv('drive/My Drive/self_ML/Intermediate ML/train.csv', index_col='Id') 
X_test = pd.read_csv('drive/My Drive/self_ML/Intermediate ML/test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)

# To keep things simple, we'll drop columns with missing values
cols_with_missing = [col for col in X.columns if X[col].isnull().any()] 
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [41]:
# first approach, drop column with categorical data
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])
score_function(drop_X_train, drop_X_valid, y_train, y_valid)

17837.82570776256

## label encoding

But before we do anything, first look at the column 'Condition2'. We could see that on train and valid dataset, 'Condition2' have different level. Simple approach is to remove this column (more complicate solution out there).

In [42]:
X_train['Condition2'].unique()

array(['Norm', 'PosA', 'Feedr', 'PosN', 'Artery', 'RRAe'], dtype=object)

In [43]:
X_valid['Condition2'].unique()

array(['Norm', 'RRAn', 'RRNn', 'Artery', 'Feedr', 'PosN'], dtype=object)

In [46]:
# find the bad column
object_cols = [cols for cols in X_train.columns
               if X_train[cols].dtype == 'object']
bad_label_cols = [cols for cols in object_cols
            if set(X_train[cols]) != set(X_valid[cols])]
good_label_cols = [cols for cols in object_cols
            if set(X_train[cols]) == set(X_valid[cols])]

print('good:', good_label_cols)
print('\n bad:', bad_label_cols)

good: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'BldgType', 'HouseStyle', 'ExterQual', 'CentralAir', 'KitchenQual', 'PavedDrive', 'SaleCondition']

 bad: ['Utilities', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'Functional', 'SaleType']


In [47]:
from sklearn.preprocessing import LabelEncoder

# remove the bad column
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)

# label-encoding the good column
label_encoder = LabelEncoder()
for cols in good_label_cols:
    label_X_train[cols] = label_encoder.fit_transform(X_train[cols])
    label_X_valid[cols] = label_encoder.transform(X_valid[cols])

score_function(label_X_train, label_X_valid, y_train, y_valid)

17575.291883561644