# Machine Learning Part 1 tutorial


In [1]:
import pandas as pd

main_file_path = '../input/train.csv'  # path to the Iowa housing training data
data = pd.read_csv(main_file_path)

# Average the single column directly. data.mean() on the whole frame also tries
# to average any non-numeric columns, which recent pandas releases reject
# (numeric_only now defaults to False); it also computes every column's mean
# only to throw all but one away.
print(data['LotArea'].mean())


# Select predictors for model

In [2]:
# Columns fed to the models as predictors.
columns_of_interest = [
    'LotArea', 'LotFrontage', 'YearBuilt', '1stFlrSF', '2ndFlrSF',
    'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'MSSubClass', 'YrSold',
]

# Predictor frame and the prediction target.
X = data[columns_of_interest]
y = data['SalePrice']

# Show every column available in the file, then summarise the chosen predictors.
print(data.columns)
X.describe()


# Impute missing values

In [3]:
from sklearn.impute import SimpleImputer

# Keep the un-imputed predictors around for reference.
X_original = X.copy()

# Replace missing entries with the column mean. SimpleImputer's default
# missing_values is np.nan, which is how pandas marks gaps; the string 'NaN'
# was the sentinel of the older Imputer class and is rejected by SimpleImputer
# when the data is numeric.
my_imputer = SimpleImputer(strategy='mean')

# fit_transform returns a bare array, so restore the column labels and the
# original row index (keeps X aligned with y by label as well as by position).
X = pd.DataFrame(my_imputer.fit_transform(X), columns=X.columns, index=X.index)

# make new columns indicating what will be imputed
# cols_with_missing = (col for col in X.columns() 
#                                 if X[c].isnull().any())
# for col in cols_with_missing:
#     X[col + '_was_missing'] = X[col].isnull()

# Imputation
#my_imputer = Imputer()
#X = my_imputer.fit_transform(X)


# Create DecisionTree

In [4]:
from sklearn.tree import DecisionTreeRegressor

# Define model.
# Seeded so that the fitted tree, and every number derived from it, is the
# same on each run; this matches the random_state=0 used by get_mae later in
# the notebook.
model = DecisionTreeRegressor(random_state=0)

# Fit model on all rows of the imputed predictors.
model.fit(X, y)

In [5]:
# Show the first five rows of predictors, then the model's price estimate for each.
print("Making predictions for the following 5 houses:", X.head(), sep="\n")
print("The predictions are", model.predict(X.head()), sep="\n")

# Test model accuracy

In [6]:
from sklearn.metrics import mean_absolute_error

# In-sample error: the model is scored on the same X it was fitted on above.
predicted_home_prices = model.predict(X)
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)

# Repeat the model, splitting off a validation dataset

In [7]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation parts, for both predictors and
# target. The split is drawn from a random number generator; supplying a
# numeric random_state gives the same split every time this cell runs.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Define model. The tree is seeded too: with only the split fixed, the fitted
# tree (and therefore the MAE printed below) could still change between runs.
melbourne_model = DecisionTreeRegressor(random_state=0)
# Fit model on the training part only.
melbourne_model.fit(train_X, train_y)

# Get predicted prices on the held-out validation data and report the error.
val_predictions = melbourne_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

## Determine best MAE based on max leaf nodes

In [8]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Fit a size-limited decision tree and return its validation MAE.

    Args:
        max_leaf_nodes: value passed to DecisionTreeRegressor(max_leaf_nodes=...).
        predictors_train: predictors used to fit the tree.
        predictors_val: predictors the fitted tree is asked to predict.
        targ_train: target values matching ``predictors_train``.
        targ_val: target values matching ``predictors_val``.

    Returns:
        Mean absolute error between ``targ_val`` and the tree's predictions.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(predictors_train, targ_train)
    return mean_absolute_error(targ_val, tree.predict(predictors_val))

In [9]:
# Sweep the leaf budget over several orders of magnitude and report the
# validation MAE obtained for each setting.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(
        max_leaf_nodes,
        predictors_train=train_X,
        predictors_val=val_X,
        targ_train=train_y,
        targ_val=val_y,
    )
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae))

# Create RandomForest model

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# A random forest draws bootstrap samples and feature subsets at random, so it
# is seeded here to make the validation MAE repeatable from run to run.
forest_model = RandomForestRegressor(random_state=0)
forest_model.fit(train_X, train_y)

# Score on the held-out validation rows.
melb_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, melb_preds))

# Test model for Kaggle submission

In [116]:
# Read the test data
test = pd.read_csv('../input/test.csv')

# Treat the test data in the same way as training data. In this case, pull same columns.
test_X = test[columns_of_interest]

# Learn the fill-in values (column means) from the training predictors only,
# then apply those same values to the test set. Re-fitting the imputer on the
# test data would fill its gaps with test-set means, i.e. preprocess it
# differently from the data forest_model was trained on. train_X itself is
# left untouched: it was already imputed before the split.
my_imputer.fit(train_X)
test_X = pd.DataFrame(my_imputer.transform(test_X),
                      columns=test_X.columns, index=test_X.index)

# Use the model to make predictions
predicted_prices = forest_model.predict(test_X)
# We will look at the predicted prices to ensure we have something sensible.
print(predicted_prices)

# XGBoost

In [131]:
from xgboost import XGBRegressor

data = pd.read_csv('../input/train.csv')
# Rows without a target cannot be used for training. Reassign instead of
# mutating in place so the cell reads as a plain raw -> cleaned step.
data = data.dropna(axis=0, subset=['SalePrice'])
y = data.SalePrice
# Predictors: every column except the target, restricted to non-object dtypes.
X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])
# Seeded like the earlier split so the early-stopping point and the resulting
# model are the same on every run.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25,
                                                    random_state=0)

xgb_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
# verbose=False keeps the per-round evaluation log quiet. Training stops once
# the score on the held-out eval_set has not improved for 5 rounds.
# NOTE(review): newer xgboost releases expect early_stopping_rounds on the
# XGBRegressor constructor rather than on fit() - confirm against the
# installed version.
xgb_model.fit(train_X, train_y, early_stopping_rounds=5,
              eval_set=[(test_X, test_y)], verbose=False)

#my_imputer = SimpleImputer(missing_values='NaN', strategy='mean')
#train_X = pd.DataFrame(my_imputer.fit_transform(train_X),columns=train_X.columns)  

# make predictions
#xgb_predictions = xgb_model.predict(test_X)

#from sklearn.metrics import mean_absolute_error
#print("Mean Absolute Error : " + str(mean_absolute_error(xgb_predictions, val_y)))


In [132]:
test = pd.read_csv('../input/test.csv')
print(test.shape)

# Treat the test data in the same way as the training data: keep only the
# non-object columns. (The earlier `test_X = test` assignment was overwritten
# immediately and has been dropped, together with the duplicated shape print
# that bracketed a disabled imputation step.)
test_X = test.select_dtypes(exclude=['object'])
print(test_X.shape)

# Use the model to make predictions
xgb_predicted_prices = xgb_model.predict(test_X)
print(xgb_predicted_prices)


# One hot encoding

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

def get_mae(X, y):
    """Return the cross-validated mean absolute error of a 50-tree random forest.

    NOTE: this two-argument definition replaces the five-argument ``get_mae``
    defined earlier in the notebook; after this cell runs, the earlier
    max_leaf_nodes sweep cannot be re-run without re-executing its own
    definition cell first.

    Args:
        X: predictor frame passed to cross_val_score.
        y: target values matching ``X``.

    Returns:
        Mean of the per-fold MAE scores, as a positive number.
    """
    # sklearn reports this score negated (higher is better); flip the sign back.
    # The forest is seeded so the two MAE figures compared below are repeatable.
    return -1 * cross_val_score(RandomForestRegressor(n_estimators=50, random_state=0),
                                X, y,
                                scoring='neg_mean_absolute_error').mean()

# NOTE(review): train_X / test_X are whatever the previously executed cells
# left behind. If they come from the XGBoost cells they are already limited to
# non-object columns, so get_dummies has nothing to encode and both MAE
# figures describe the same features; they may also still contain NaNs, which
# older scikit-learn forests reject - confirm the intended inputs.
one_hot_encoded_training_predictors = pd.get_dummies(train_X)
predictors_without_categoricals = train_X.select_dtypes(exclude=['object'])

mae_without_categoricals = get_mae(predictors_without_categoricals, train_y)

mae_one_hot_encoded = get_mae(one_hot_encoded_training_predictors, train_y)

print('Mean Absolute Error when Dropping Categoricals: ' + str(int(mae_without_categoricals)))
print('Mean Abslute Error with One-Hot Encoding: ' + str(int(mae_one_hot_encoded)))

one_hot_encoded_test_predictors = pd.get_dummies(test_X)
print(test_X.head())
print(one_hot_encoded_test_predictors.head())

# Give the test frame exactly the training columns, in training order. A
# column that only exists on the training side (e.g. a dummy for a category
# never seen in the test data) is created filled with 0 instead of NaN, and
# test-only columns are dropped. Values already present are left untouched.
final_train = one_hot_encoded_training_predictors
final_test = one_hot_encoded_test_predictors.reindex(columns=final_train.columns,
                                                     fill_value=0)

# Fit on the same aligned frame whose column layout final_test now follows.
forest_model = RandomForestRegressor(random_state=0)
forest_model.fit(final_train, train_y)

# Use the model to make predictions
predicted_prices = forest_model.predict(final_test)
# We will look at the predicted prices to ensure we have something sensible.
print(predicted_prices)

In [133]:
# Pair each test Id with its XGBoost price prediction.
my_submission = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': xgb_predicted_prices,
})
# Any filename would do; 'submission.csv' is used here. index=False leaves the
# row index out of the written file.
my_submission.to_csv('submission.csv', index=False)