## Testing decision tree and random forest algorithms on the housing_price_iowa data

### Testing the data without specifying the number of leaves

In [5]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Relative path of the Iowa housing-price CSV file
iowa_file_path = 'housing_price_iowa.csv'

# Load the dataset into a DataFrame and preview its first rows
home_data = pd.read_csv(filepath_or_buffer=iowa_file_path)
home_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# Target (dependent variable): the sale price of each home
y = home_data['SalePrice']

# Features (independent variables) used to predict the target
feature_columns = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = home_data[feature_columns]

# Split the data into train and test sets (random_state is fixed so the split is repeatable)
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=1)

# Decision tree regressor with no max_leaf_nodes given, fitted on the training data
deci_tree_reg = DecisionTreeRegressor(random_state=1)
deci_tree_reg.fit(X=train_X, y=train_y)

# Predict the test data with the fitted model and report the mean absolute error
y_pred_test = deci_tree_reg.predict(test_X)
mae = mean_absolute_error(y_true=test_y, y_pred=y_pred_test)
print("mean_absolute_error when we did not specifying max_leaf_nodes: {}".format(mae))

mean_absolute_error when we did not specifying max_leaf_nodes: 29652.931506849316


In [15]:
# Put the actual and the decision-tree-predicted sale prices side by side
deci_tree_y_pred_df = pd.DataFrame(
    {
        'Expected y': test_y,
        'Predicted y': y_pred_test,
    }
)
deci_tree_y_pred_df.head(n=20)

Unnamed: 0,Expected y,Predicted y
258,231500,186500.0
267,179500,184000.0
288,122000,130000.0
649,84500,92000.0
1233,142000,164500.0
167,325624,220000.0
926,285000,335000.0
831,151000,144152.0
1237,195000,215000.0
426,275000,262000.0


### Testing the data with different numbers of leaves

In [11]:
#Calculating the error 
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a size-limited decision tree and return its mean absolute error.

    Parameters
    ----------
    max_leaf_nodes
        Passed through as the ``max_leaf_nodes`` argument of
        ``DecisionTreeRegressor``.
    train_X, train_y
        Features and target the tree is fitted on.
    val_X, val_y
        Features and target the fitted tree is scored on.

    Returns
    -------
    The mean absolute error between ``val_y`` and the tree's predictions
    for ``val_X``.
    """
    # Decision tree regressor limited to the requested number of leaf nodes
    best_deci_tree_reg = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    best_deci_tree_reg.fit(train_X, train_y)

    # Predict and score on the validation data passed in by the caller,
    # not on whatever test_X / test_y happen to exist as notebook globals.
    best_y_pred = best_deci_tree_reg.predict(val_X)
    mae = mean_absolute_error(val_y, best_y_pred)

    return mae

In [12]:
# Candidate tree sizes (maximum number of leaf nodes) to compare
max_leaf_nodes = [5, 25, 50, 100, 200, 250, 500]
scores = []

# Score every candidate size and record its mean absolute error
for max_leaf_node in max_leaf_nodes:
    val_mae = get_mae(
        max_leaf_node,
        train_X=train_X,
        val_X=test_X,
        train_y=train_y,
        val_y=test_y,
    )
    scores.append(val_mae)
    print("mean_absolute_error for nodes {} is {}".format(max_leaf_node, val_mae))

# Position of the lowest error (the first one on ties) picks the best tree size
best_tree_score_index = min(range(len(scores)), key=scores.__getitem__)
print("\nBest_tree_size is {} ".format(max_leaf_nodes[best_tree_score_index]))

mean_absolute_error for nodes 5 is 35044.51299744237
mean_absolute_error for nodes 25 is 29016.41319191076
mean_absolute_error for nodes 50 is 27405.930473214907
mean_absolute_error for nodes 100 is 27282.50803885739
mean_absolute_error for nodes 200 is 28135.69164341533
mean_absolute_error for nodes 250 is 27893.822225701646
mean_absolute_error for nodes 500 is 29454.18598068598

Best_tree_size is 100 


## Random forest 
The random forest uses many trees, and it makes a prediction by averaging the predictions of each component tree. 

In [13]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor (random_state is fixed so results are repeatable)
rf_reg = RandomForestRegressor(random_state=1)

# Fit the forest on the training data
rf_reg.fit(X=train_X, y=train_y)

# Predict sale prices for the test data
rf_predictions = rf_reg.predict(test_X)

# Mean absolute error of the random forest on the test data
rf_val_mae = mean_absolute_error(y_true=test_y, y_pred=rf_predictions)

print("mean_absolute_error for Random Forest Model: {}".format(rf_val_mae))


mean_absolute_error for Random Forest Model: 22883.425753424657


In [16]:
#checking the performance of the model 
rf_y_pred_df = pd.DataFrame({'Expected y':test_y, 'Predicted y':rf_predictions })
rf_y_pred_df.head(20)

Unnamed: 0,Expected y,Predicted y
258,231500,190400.0
267,179500,147170.0
288,122000,133830.0
649,84500,85740.0
1233,142000,153550.0
167,325624,267125.3
926,285000,335364.3
831,151000,149489.3
1237,195000,224362.5
426,275000,217190.0


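## Random forest seems to be the best model for this dataset

Its mean absolute error on the test set (about 22,883) is lower than that of the best decision tree (about 27,283 with 100 leaf nodes).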
