In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [6]:
# Load the housing CSV into a DataFrame.
path = '/content/sample_data/california_housing_train.csv'
# dropna() returns a new frame rather than modifying in place, so its result
# must be kept for the rows with missing values to actually be removed.
houses_data = pd.read_csv(path).dropna(axis=0)
houses_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.40,19.0,7650.0,1901.0,1129.0,463.0,1.8200,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.9250,65500.0
...,...,...,...,...,...,...,...,...,...
16995,-124.26,40.58,52.0,2217.0,394.0,907.0,369.0,2.3571,111400.0
16996,-124.27,40.69,36.0,2349.0,528.0,1194.0,465.0,2.5179,79000.0
16997,-124.30,41.84,17.0,2677.0,531.0,1244.0,456.0,3.0313,103600.0
16998,-124.30,41.80,19.0,2672.0,552.0,1298.0,478.0,1.9797,85800.0


In [7]:
# Prediction target: the median house value column.
y = houses_data['median_house_value']

# Columns used as model inputs.
features = [
    'housing_median_age', 'total_rooms', 'total_bedrooms',
    'population', 'households', 'median_income',
]
X = houses_data[features]

In [8]:
# Hold out a validation split; random_state is fixed so the split is the same on every run.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    random_state=1,
)

**Using a Decision Tree**

In [9]:
# Baseline: a decision tree with default settings (seeded), fitted on the training split.
model = DecisionTreeRegressor(random_state=1)
model.fit(X=train_X, y=train_y)

In [10]:
# Mean absolute error of the baseline tree on the validation split.
predictions = model.predict(val_X)
# The documented signature is mean_absolute_error(y_true, y_pred): ground truth first.
mae = mean_absolute_error(val_y, predictions)
mae

65730.78164705883

In [11]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree built with ``max_leaf_nodes``.

    Parameters
    ----------
    max_leaf_nodes
        Value passed to ``DecisionTreeRegressor(max_leaf_nodes=...)``.
    train_X, train_y
        Features and target used to fit the tree.
    val_X, val_y
        Features and target used to score the fitted tree.

    Returns
    -------
    The mean absolute error between ``val_y`` and the tree's predictions
    for ``val_X``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    tree.fit(train_X, train_y)
    val_predictions = tree.predict(val_X)
    # Ground truth goes first: mean_absolute_error(y_true, y_pred).
    return mean_absolute_error(val_y, val_predictions)

In [15]:
# Candidate max_leaf_nodes values to compare.
max_leaf_nodes_arr = [5, 25, 50, 100, 250, 500]

# Validation MAE for each candidate, keyed by its max_leaf_nodes value.
scores = {
    leaf_count: get_mae(leaf_count, train_X, val_X, train_y, val_y)
    for leaf_count in max_leaf_nodes_arr
}
# The best size is the key whose MAE is smallest.
best_tree_size = min(scores, key=lambda size: scores[size])
best_tree_size

250

In [16]:
# Refit with the max_leaf_nodes value chosen by the search (best_tree_size)
# rather than a hard-coded copy of it, so this cell stays correct if the
# candidate list or the data changes.
improved_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)
improved_model.fit(train_X, train_y)
predictions = improved_model.predict(val_X)
# Ground truth goes first: mean_absolute_error(y_true, y_pred).
mae = mean_absolute_error(val_y, predictions)
mae

54508.24020421963

**Using a Random Forest**

In [None]:
# Fit a random forest with default settings (seeded) on the same training split.
# RandomForestRegressor is imported in the notebook's top import cell.
new_model = RandomForestRegressor(random_state=1)
new_model.fit(train_X, train_y)

In [None]:
# Mean absolute error of the random forest on the validation split.
new_predictions = new_model.predict(val_X)
# Ground truth goes first: mean_absolute_error(y_true, y_pred).
new_mae = mean_absolute_error(val_y, new_predictions)
new_mae

47622.26805647058