In [1]:
import pandas as pd
# Load the raw training and test sets.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Drop houses where the target is NA *before* extracting the target, so that
# `target` stays row-aligned with every predictor frame derived from
# `train_data` below. (Extracting it first would keep the NA rows in `target`.)
train_data = train_data.dropna(axis=0, subset=['SalePrice'])
target = train_data.SalePrice

# All training columns except the SalePrice column.
predictors = train_data.drop(['SalePrice'], axis=1)

# Columns containing at least one missing value, computed separately for the
# training and the test set.
columns_with_missing = train_data.columns[train_data.isnull().any()].tolist()
columns_with_missing_test = test_data.columns[test_data.isnull().any()].tolist()

# Remove the Id column, the target (training set only) and every column that
# has missing values in the respective dataset.
train_predictors = train_data.drop(['Id', 'SalePrice'] + columns_with_missing, axis=1)
test_predictors = test_data.drop(['Id'] + columns_with_missing_test, axis=1)

# "cardinality" means the number of unique values in a column.
# We use it as our only way to select categorical columns.

def _is_low_cardinality_text(column):
    """Return True for an object-dtype column with fewer than 10 unique values."""
    return column.dtype == "object" and column.nunique() < 10


def _is_numeric(column):
    """Return True for a column whose dtype is int64 or float64."""
    return column.dtype in ['int64', 'float64']


# Training set: low-cardinality text columns first, then the numeric columns.
cardinality_cols = [name for name in train_predictors.columns
                    if _is_low_cardinality_text(train_predictors[name])]
numerical_cols = [name for name in train_predictors.columns
                  if _is_numeric(train_predictors[name])]
my_columns = cardinality_cols + numerical_cols

# Test set: the same selection rule, evaluated on the test frame itself.
cardinality_cols_test = [name for name in test_predictors.columns
                         if _is_low_cardinality_text(test_predictors[name])]
numerical_cols_test = [name for name in test_predictors.columns
                       if _is_numeric(test_predictors[name])]
my_columns_test = cardinality_cols_test + numerical_cols_test

# Restrict each frame to its selected columns.
training_predictors = train_predictors[my_columns]
testing_predictors = test_predictors[my_columns_test]

In [2]:
# One-hot encode the categorical columns of each dataset independently.
one_hot_encoded_training_predictors, one_hot_encoded_test_predictors = (
    pd.get_dummies(frame) for frame in (training_predictors, testing_predictors)
)

# Align the two encoded frames along the column axis. The inner join keeps
# only the columns present in both, so they show up in the same order in
# final_train and final_test.
final_train, final_test = one_hot_encoded_training_predictors.align(
    one_hot_encoded_test_predictors, join='inner', axis=1
)

**We are using a modelling process that uses an imputer to fill in missing values, followed by a RandomForestRegressor.**
**These steps can be bundled together in a pipeline.**

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement and also
# defaults to mean imputation. Fall back to the old class on older installs.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Two-step pipeline: fill in missing values, then fit a random forest.
my_pipeline = make_pipeline(Imputer(), RandomForestRegressor())

In [16]:
# Fit the pipeline on the aligned training features and the target, then
# predict for the aligned test features (fit() returns the pipeline itself).
predictions = my_pipeline.fit(final_train, target).predict(final_test)

In [5]:
# Pair every test-set Id with its predicted SalePrice.
my_submission = test_data[['Id']].assign(SalePrice=predictions)
# Any filename would do; the DataFrame index is left out of the CSV.
my_submission.to_csv('submission_pipeline.csv', index=False)

**Cross Validation**

In [17]:
from sklearn.model_selection import cross_val_score
# Cross-validate the whole pipeline. The scorer reports *negative* mean
# absolute error, so every entry of `scores` is <= 0.
scores = cross_val_score(
    estimator=my_pipeline,
    X=final_train,
    y=target,
    scoring='neg_mean_absolute_error',
)
print(scores)

[-20101.31129363 -20007.02176591 -22654.40308642]


In [18]:
# Flip the sign of the averaged negative-MAE scores to report a positive error.
# NOTE: '%2f' sets a minimum field width of 2, not a precision, so the value
# is printed with the default six decimal places.
print('Mean Absolute Error %2f' % (-scores.mean(),))


Mean Absolute Error 20920.912049
