### Import Libraries

In [33]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


### Fetch CSV HomeData

In [3]:
home_data = pd.read_csv('train.csv')
 

In [4]:
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']

X = home_data[features]

X.head()


Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
0,8450,2003,856,854,2,3,8
1,9600,1976,1262,0,2,3,6
2,11250,2001,920,866,2,3,6
3,9550,1915,961,756,1,3,7
4,14260,2000,1145,1053,2,4,9


In [5]:
y = home_data.SalePrice

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

### Define the Model

In [34]:
rf_model = RandomForestRegressor(max_leaf_nodes=250, random_state=1)
dt_model = DecisionTreeRegressor(random_state=1)
gb_model = GradientBoostingRegressor(random_state=1)

rf_model.fit(train_X, train_y)
dt_model.fit(train_X, train_y)
gb_model.fit(train_X, train_y)


rf_val_predictions = rf_model.predict(val_X)
dt_val_predictions = dt_model.predict(val_X)
gb_val_predictions = gb_model.predict(val_X)

rf_val_rmse = mean_squared_error(rf_val_predictions, val_y)
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)
dt_val_mae = mean_absolute_error(dt_val_predictions, val_y)
dt_val_rmse = mean_squared_error(dt_val_predictions, val_y)
gb_val_mae = mean_absolute_error(gb_val_predictions, val_y)
gb_val_msle = mean_squared_log_error(gb_val_predictions, val_y)


print('The Mean Squared Error for rf is: {}'.format(rf_val_rmse))
print('The Mean Squared Error for dt is: {}'.format(dt_val_rmse))
print('The Mean Absolute Error for rf is: {:,.0f}'.format(rf_val_mae))
print('The Mean Absolute Error for dt is: {:,.0f}'.format(dt_val_mae))
print('The Mean Absolute Error for gb is: {:,.0f}'.format(gb_val_mae))
print('The Mean Squared Log Error for gb is: {:,.0f}'.format(gb_val_msle))



The Mean Squared Error for rf is: 981037924.8676301
The Mean Squared Error for dt is: 1745663966.7561643
The Mean Absolute Error for rf is: 21,795
The Mean Absolute Error for dt is: 29,653
The Mean Absolute Error for gb is: 22,110
The Mean Squared Log Error for gb is: 0


## Answer -> Given predictions from train data predict the test data  

In [31]:
test_data = pd.read_csv('test.csv')

test_X = test_data[features]

test_X.head()

test_rf_val_predictions = rf_model.predict(test_X)
test_dt_val_predictions = dt_model.predict(test_X)


# from sklearn.metrics import mean_absolute_error

# mean_squared_error(rf_val_predictions, val_y)

rf_df = pd.DataFrame({'Id': test_data.Id, 'SalePrice': test_rf_val_predictions})

dt_df = pd.DataFrame({'Id': test_data.Id, 'SalePrice': test_dt_val_predictions})


rf_df.to_csv('tim_rf_submission.csv', index=False)
dt_df.to_csv('tim_dt_submission.csv', index=False)



In [32]:
rf_df

Unnamed: 0,Id,SalePrice
0,1461,114136.586224
1,1462,152320.364185
2,1463,182271.300718
3,1464,180336.993583
4,1465,182106.165174
...,...,...
1454,2915,86144.719858
1455,2916,87576.137999
1456,2917,162279.293850
1457,2918,138829.047275


In [12]:
dt_df

Unnamed: 0,Id,SalePrice
0,1461,84000.0
1,1462,155000.0
2,1463,181000.0
3,1464,181000.0
4,1465,180000.0
...,...,...
1454,2915,75000.0
1455,2916,75000.0
1456,2917,134432.0
1457,2918,147000.0


In [29]:

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = RandomForestRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)


for max_leaf_nodes in [100, 250, 300, 400, 500, 600, 700]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 100  		 Mean Absolute Error:  22040
Max leaf nodes: 250  		 Mean Absolute Error:  21794
Max leaf nodes: 300  		 Mean Absolute Error:  21835
Max leaf nodes: 400  		 Mean Absolute Error:  21886
Max leaf nodes: 500  		 Mean Absolute Error:  21898
Max leaf nodes: 600  		 Mean Absolute Error:  21887
Max leaf nodes: 700  		 Mean Absolute Error:  21890
