In [1]:
import pandas as pd
import numpy as np
from random import randint
from sklearn.datasets import load_boston

Step 1: Getting Data Ready

In [2]:
# Loading boston dataset into a Pandas DataFrame
boston_dataset = load_boston()
boston_df = pd.DataFrame(boston_dataset["data"], columns=boston_dataset["feature_names"])
boston_df["target"] = pd.Series(boston_dataset["target"])


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [3]:
# Printing head of DataFrame to make sure the data has been loaded correctly
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [4]:
# Creating X & Y, where:
# X: Feature columns in DataFrame (whatever is needed for predicting)
# Y: Column to be predicted
X = boston_df.drop('target',axis = 1)
Y = boston_df['target']

In [5]:
# Splitting dataset into test & train sets using train_test_split method:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y)

In [6]:
# Looking at shapes to ensure that the dataset was split properly:
X_train.shape,Y_train.shape,X_test.shape,Y_test.shape

((379, 13), (379,), (127, 13), (127,))

In [7]:
# Looking at labels in X_test set:
print (X_test.index)

Int64Index([364, 155, 322, 256,  49,  88, 147, 457, 193, 449,
            ...
             77,  53,  55, 382, 196,  64, 493, 478, 202,  69],
           dtype='int64', length=127)


Step 2: Choosing the Model
Done by following the Model Selection Map. In this example, it's a Regression problem.
<br>This step involves trying different models.
<br>RandomForestRegressor from sklearn.ensemble is used.

In [8]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

Step 3: Fitting Model to Data & Making Predictions

In [9]:
# Fitting data to model using fit () method on train sets
model.fit(X_train, Y_train)

In [10]:
# Making predictions using predict() method:
Y_pred = model.predict(X_test)

In [11]:
# Getting a random index from the X_test set
random_index = X_test.index[randint(0,X_test.shape[0])]

In [12]:
# Making a prediction on a single sample (has to be array)
X_test.loc[random_index]
model.predict(np.array(X_test.loc[random_index]).reshape(1, -1))
# (-1) makes NumPy figure out how many columns this array has



array([19.123])

In [13]:
# Printing the random element from X_test DF
X_test.loc[random_index]

CRIM         0.21977
ZN           0.00000
INDUS        6.91000
CHAS         0.00000
NOX          0.44800
RM           5.60200
AGE         62.00000
DIS          6.08770
RAD          3.00000
TAX        233.00000
PTRATIO     17.90000
B          396.90000
LSTAT       16.20000
Name: 49, dtype: float64

In [14]:
# Printing the random element from Y_test Series
Y_test.loc[random_index]

19.4

In [15]:
# Printing the random element from boston_df
print(boston_df.loc[random_index])

CRIM         0.21977
ZN           0.00000
INDUS        6.91000
CHAS         0.00000
NOX          0.44800
RM           5.60200
AGE         62.00000
DIS          6.08770
RAD          3.00000
TAX        233.00000
PTRATIO     17.90000
B          396.90000
LSTAT       16.20000
target      19.40000
Name: 49, dtype: float64


Step 4: Evaluating Model

In [16]:
# Perform .score on train set first:
model.score(X_train, Y_train)

0.9821393581492547

In [17]:
# Perform .score on test set:
model.score(X_test, Y_test)

0.8609444184500319

Step 5: Improving Model


In [18]:
# Try different numbers of estimators with cross-validation and no cross-validation

from sklearn.model_selection import cross_val_score

for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    model = RandomForestRegressor(n_estimators=i).fit(X_train, Y_train)
    print(f"Model accruacy on test set: {model.score(X_test, Y_test)}")
    print(f"Cross-validation score: {np.mean(cross_val_score(model, X, Y, cv=5)) * 100}%")
    print("")

Trying model with 10 estimators...
Model accruacy on test set: 0.8551500523556478
Cross-validation score: 62.676640626052496%

Trying model with 20 estimators...
Model accruacy on test set: 0.8531991027189121
Cross-validation score: 63.11125748931736%

Trying model with 30 estimators...
Model accruacy on test set: 0.8453236834993643
Cross-validation score: 62.198481438456476%

Trying model with 40 estimators...
Model accruacy on test set: 0.854997781144057
Cross-validation score: 62.65882789101224%

Trying model with 50 estimators...
Model accruacy on test set: 0.8589790129912277
Cross-validation score: 60.77470887142453%

Trying model with 60 estimators...
Model accruacy on test set: 0.8492097532613858
Cross-validation score: 60.33489428153602%

Trying model with 70 estimators...
Model accruacy on test set: 0.8589825538700473
Cross-validation score: 61.11766119895833%

Trying model with 80 estimators...
Model accruacy on test set: 0.8504067960308324
Cross-validation score: 61.47367433

Step 6: Saving Trained Model

In [19]:
import pickle

# Save trained model to file
pickle.dump(model, open("random_forest_model_2.pkl", "wb"))