In [3]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor


# Load the training set, then separate the model inputs from the target column.
data = pd.read_csv("train.csv")
features = ["LotArea", "YearBuilt", "1stFlrSF", "2ndFlrSF", "FullBath", "BedroomAbvGr"]
X, y = data[features], data["SalePrice"]

# fit() returns the estimator itself, so construction and training are chained.
model = DecisionTreeRegressor(random_state=1).fit(X, y)

# In-sample predictions: the tree is queried on the same rows it was fitted on.
predictions = model.predict(X)

# Show the first five predictions next to the first five true target values.
print("Predictions: ", predictions[:5])
print("Actual:      ", y.head().values)


Predictions:  [208500. 181500. 223500. 140000. 250000.]
Actual:       [208500 181500 223500 140000 250000]


Validating the Model (Mean Absolute Error, train_test_split)


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

# Split the features and target into a training part and a validation part;
# random_state=1 fixes the shuffle so the split is the same on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Rebind `model` to a fresh tree fitted on the training part only.
model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)

# Measure the mean absolute error on the validation part.
val_predictions = model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print(" Validation MAE :", val_mae)


 Validation MAE : 28548.76712328767


Tuning Model Complexity – Handling Overfitting & Underfitting

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error


# Re-read the training data so this cell does not depend on earlier cells.
data = pd.read_csv("train.csv")

# Model inputs (six numeric columns) and the target to predict.
features = ["LotArea", "YearBuilt", "1stFlrSF", "2ndFlrSF", "FullBath", "BedroomAbvGr"]
X, y = data[features], data["SalePrice"]

# Fixed-seed train/validation split shared by every candidate tree size below.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Step 4: Function to get MAE for different tree sizes
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error for one tree size.

    A new ``DecisionTreeRegressor`` limited to ``max_leaf_nodes`` leaves
    (with ``random_state=1``) is fitted on ``train_X`` / ``train_y``.
    Its predictions for ``val_X`` are then compared against ``val_y``
    with ``mean_absolute_error`` and that score is returned.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

# Step 5: Try different values for max_leaf_nodes
# Candidate leaf-count limits to compare, and a mapping of limit -> validation MAE.
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
mae_values = {}

print("Tuning tree sizes...\n")
for max_leaf_nodes in candidate_max_leaf_nodes:
    # Score this size once, keeping the result both locally and in the mapping.
    mae = mae_values[max_leaf_nodes] = get_mae(
        max_leaf_nodes, train_X, val_X, train_y, val_y
    )
    print(f"Max leaf nodes: {max_leaf_nodes} \t MAE: {mae:.2f}")

# Step 6: Choose the best tree size (the key whose recorded MAE is smallest)
best_tree_size = min(mae_values, key=lambda size: mae_values[size])
print(f"\n✅ Best tree size: {best_tree_size} (Lowest MAE: {mae_values[best_tree_size]:.2f})")

# Step 7: Train final model on full data (X, y rather than the training split)
final_model = DecisionTreeRegressor(
    max_leaf_nodes=best_tree_size, random_state=1
).fit(X, y)

print("\n🎯 Final model trained on full data.")


Tuning tree sizes...

Max leaf nodes: 5 	 MAE: 35044.51
Max leaf nodes: 25 	 MAE: 28903.02
Max leaf nodes: 50 	 MAE: 26642.62
Max leaf nodes: 100 	 MAE: 25410.89
Max leaf nodes: 250 	 MAE: 26346.04
Max leaf nodes: 500 	 MAE: 27223.88

✅ Best tree size: 100 (Lowest MAE: 25410.89)

🎯 Final model trained on full data.
