In [131]:
import pandas as pd
# Load the raw training and test tables.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Drop houses where the target is NA *before* extracting the target, so that
# `target` and every frame derived from `train_data` contain the same rows.
# (Taking the target first would leave it longer than the predictors whenever
# a row is dropped.)
train_data = train_data.dropna(axis=0, subset=['SalePrice'])
target = train_data.SalePrice

# Every remaining column except the target.
predictors = train_data.drop(columns=['SalePrice'])

# Columns holding at least one missing value, computed per dataset.
columns_with_missing = [col for col in train_data.columns
                        if train_data[col].isnull().any()]
columns_with_missing_test = [col for col in test_data.columns
                             if test_data[col].isnull().any()]

# Remove the identifier, the target (train only) and all incomplete columns.
train_predictors = train_data.drop(columns=['Id', 'SalePrice'] + columns_with_missing)
test_predictors = test_data.drop(columns=['Id'] + columns_with_missing_test)

# "cardinality" means the number of unique values in a column.
# We use it as our only way to select categorical columns.

def _split_feature_columns(frame):
    """Return two lists of column names taken from ``frame``, in column order.

    The first list holds object-dtype columns with fewer than 10 distinct
    values; the second holds columns whose dtype is int64 or float64.
    """
    low_cardinality, numeric = [], []
    for cname in frame.columns:
        column = frame[cname]
        if column.dtype == "object" and column.nunique() < 10:
            low_cardinality.append(cname)
        if column.dtype in ['int64', 'float64']:
            numeric.append(cname)
    return low_cardinality, numeric


# Training set: selected categorical columns first, then the numeric ones.
cardinality_cols, numerical_cols = _split_feature_columns(train_predictors)
my_columns = cardinality_cols + numerical_cols

# Test set: same rule, applied independently to the test columns.
cardinality_cols_test, numerical_cols_test = _split_feature_columns(test_predictors)
my_columns_test = cardinality_cols_test + numerical_cols_test

training_predictors = train_predictors[my_columns]
testing_predictors = test_predictors[my_columns_test]


In [130]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# One-hot encode the training predictors with pandas' dummy-variable encoder.
one_hot_encoded_training_predictors = pd.get_dummies(data=training_predictors)

def get_mae(X, y, n_estimators=50, random_state=0):
    """Return the cross-validated mean absolute error of a random forest.

    Parameters
    ----------
    X : features passed to ``cross_val_score``.
    y : target values passed to ``cross_val_score``.
    n_estimators : int, number of trees in the forest (default 50).
    random_state : seed for the forest, so repeated calls on the same data
        give the same score; pass ``None`` for an unseeded forest.

    Returns
    -------
    The mean over the cross-validation folds of the MAE, as a positive number.
    """
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  random_state=random_state)
    # sklearn's convention is "higher is better", so the scorer returns the
    # negated MAE; flip the sign to report a positive error.
    neg_mae_per_fold = cross_val_score(model, X, y,
                                       scoring='neg_mean_absolute_error')
    return -1 * neg_mae_per_fold.mean()

# Baseline: the same predictors with every object-dtype (categorical) column removed.
predictors_without_categoricals = training_predictors.select_dtypes(exclude=['object'])

# Score both feature sets with the same cross-validated model.
mae_without_categoricals = get_mae(predictors_without_categoricals, target)
mae_one_hot_encoded = get_mae(one_hot_encoded_training_predictors, target)

# int() truncates the scores to whole numbers for display.
print(f'Mean Absolute Error when Dropping Categoricals: {int(mae_without_categoricals)}')
print(f'Mean Absolute Error with One-Hot Encoding: {int(mae_one_hot_encoded)}')

Mean Absolute Error when Dropping Categoricals: 18355
Mean Abslute Error with One-Hot Encoding: 17899


**Apply to Multiple Files**

In [132]:
# One-hot encode each dataset separately, then align them on their columns.
# The inner join keeps only the columns present in both frames, so
# final_train and final_test end up with the same columns in the same order.
one_hot_encoded_training_predictors = pd.get_dummies(training_predictors)
one_hot_encoded_test_predictors = pd.get_dummies(testing_predictors)

final_train, final_test = one_hot_encoded_training_predictors.align(
    one_hot_encoded_test_predictors,
    join='inner',
    axis=1,
)

**Submission**

In [133]:
# Seed the forest so the fitted model, and therefore the submission built
# from its predictions, can be regenerated exactly.
my_model = RandomForestRegressor(random_state=0)
my_model.fit(final_train, target)

predicted_prices = my_model.predict(final_test)
# Print the predicted prices as a sanity check that they look sensible.
print(predicted_prices)

[133860. 150330. 171550. ... 135045. 134900. 235128.]


In [134]:
# Pair each test-set Id with its predicted price and write the result to CSV.
# The filename is arbitrary; the row index is left out of the file.
my_submission = pd.DataFrame(data={
    'Id': test_data['Id'],
    'SalePrice': predicted_prices,
})
my_submission.to_csv(path_or_buf='submission_ONE.csv', index=False)