## Import the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Import the dataset

In [5]:
import os
import boto3
import re

region = boto3.Session().region_name
s3 = boto3.client("s3")

# S3 bucket for training data.
# Feel free to specify a different bucket and prefix.
data_bucket = f"sagemaker-example-files-prod-{region}"
data_prefix = "datasets/tabular/uci_abalone"


In [7]:
LOCAL_PATH = "data"
FILE_TRAIN = "abalone_dataset1_train.csv"
FILE_TEST = "abalone_dataset1_test.csv"
FILE_VALIDATION = "abalone_dataset1_validation.csv"

# downloading the train, test, and validation files from data_bucket
s3.download_file(data_bucket, f"{data_prefix}/train_csv/{FILE_TRAIN}", f"{LOCAL_PATH}/{FILE_TRAIN}")
s3.download_file(data_bucket, f"{data_prefix}/test_csv/{FILE_TEST}", f"{LOCAL_PATH}/{FILE_TEST}")
s3.download_file(data_bucket, f"{data_prefix}/validation_csv/{FILE_VALIDATION}", f"{LOCAL_PATH}/{FILE_VALIDATION}")

In [8]:
import pandas as pd  # Read in csv and store in a pandas dataframe

df = pd.read_csv(
    FILE_TRAIN,
    sep=",",
    encoding="latin1",
    names=[
        "age",
        "sex",
        "Length",
        "Diameter",
        "Height",
        "Whole.weight",
        "Shucked.weight",
        "Viscera.weight",
        "Shell.weight",
    ],
)
print(df.head(5))

   age  sex  Length  Diameter  Height  Whole.weight  Shucked.weight  \
0    8    2   0.615     0.480   0.160        1.2525          0.5850   
1   16    2   0.630     0.500   0.155        1.0050          0.3670   
2    6    3   0.295     0.220   0.070        0.1260          0.0515   
3    6    3   0.315     0.235   0.075        0.1485          0.0585   
4   10    2   0.695     0.550   0.185        1.6790          0.8050   

   Viscera.weight  Shell.weight  
0          0.2595        0.3300  
1          0.1990        0.3600  
2          0.0275        0.0350  
3          0.0375        0.0425  
4          0.4015        0.3965  


In [9]:
features = df.iloc[:,1:]
features.head()

Unnamed: 0,sex,Length,Diameter,Height,Whole.weight,Shucked.weight,Viscera.weight,Shell.weight
0,2,0.615,0.48,0.16,1.2525,0.585,0.2595,0.33
1,2,0.63,0.5,0.155,1.005,0.367,0.199,0.36
2,3,0.295,0.22,0.07,0.126,0.0515,0.0275,0.035
3,3,0.315,0.235,0.075,0.1485,0.0585,0.0375,0.0425
4,2,0.695,0.55,0.185,1.679,0.805,0.4015,0.3965


In [12]:
labels = df.iloc[:,0]
labels.head()

0     8
1    16
2     6
3     6
4    10
Name: age, dtype: int64

In [13]:
X = features.values
y = labels.values

## Splitting the dataset into the Training and Test set 

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.10,random_state=42)

  from scipy.sparse import csr_matrix, issparse


## Training the Random Forest Regression model on the Training set

In [15]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=30)
regressor.fit(X_train,y_train)

## Predicting the Test set results

In [16]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(y_pred)

[10.2   7.57  9.3   8.53 11.37 10.77 10.37  4.77 12.7   7.47 10.57  4.63
 10.77  6.47 10.07 12.1   7.73  9.3   9.63  9.3   6.27  9.03 10.37  7.73
 10.47 10.1  10.23 10.87 10.97  7.67  7.17 10.8  10.93  6.8   8.9   9.37
  6.93  6.83  8.6  12.13 10.27  8.23  9.9   7.53  7.8  11.1   7.2  11.47
  8.53 10.8   9.83 10.03  4.73 10.47  8.03  9.53  8.17  8.37  9.8  10.
  7.57 12.6   7.2  10.83  6.87 12.27 11.9  10.67 11.    9.43 10.33  7.67
  9.17  8.8  13.23 10.43  5.47  8.47  7.43  9.9  10.27 15.53  8.5   8.2
 11.23  9.1  11.2   9.63  9.87 10.47 12.93 13.07 12.6   7.13 14.63  7.53
 13.5   7.87  8.97 12.67  5.73 10.33 10.07  8.5   9.77  7.1  11.83  9.4
  6.07 10.77  8.67  9.73  8.63  6.37 12.53  8.9   7.77 11.6  11.17 10.8
  5.6  11.53  9.77  9.8   9.3  10.83 10.37 10.    7.47  7.6   9.9   9.
  9.2  11.77 13.03 11.43 15.5  10.23 10.97  4.8   8.33 10.07  8.43  7.67
 10.07  8.1  10.4   7.   14.6  10.4  11.87 12.9   9.83 15.9  12.17  9.23
  8.67 11.7   8.97 10.53  8.93 10.97  9.27 11.87 11.87  8.

## Evaluating the Model Performance

In [17]:
from sklearn.metrics import explained_variance_score, r2_score, root_mean_squared_error
print(f"explained variance: {explained_variance_score(y_pred,y_test):.4}")
print(f"r2 score: {r2_score(y_pred,y_test):.4}")
print(f"root mean squared error: {root_mean_squared_error(y_pred,y_test):.4}")

explained variance: 0.114
r2 score: 0.1129
root mean squared error: 2.213


In [18]:
y_test

array([10, 14,  8,  9, 17, 12, 11,  6, 12,  7, 10,  6,  9,  7, 10, 11,  8,
       11,  8,  6,  5, 11,  9,  9, 10, 11,  8,  9, 10,  6, 11, 16,  9,  7,
        9,  8,  7,  6,  9, 11, 10,  9, 15,  8,  8, 14,  7, 15,  7, 10, 10,
        8,  3, 14,  8, 10,  8, 13, 10,  9,  5, 11,  5, 11, 12, 13,  9, 11,
       12,  8, 10,  8,  9,  7, 16, 11,  6,  9,  7,  9, 10, 20,  7,  7,  9,
        9, 11, 12, 18, 12, 11, 11, 13,  7, 13,  7, 13,  8, 10, 15,  6, 10,
        9,  8,  9,  8, 20,  8,  6,  9,  9, 11,  8,  7, 12,  8,  7, 12, 10,
        9,  6, 11,  9,  9,  9, 14, 10, 11,  8,  7,  9,  8,  9, 10, 13, 13,
        6, 12, 12,  5,  5, 10,  7,  7,  9,  8,  8,  6, 20, 11, 11, 11, 16,
       16, 17,  9, 10, 13,  7,  9,  8, 12,  8, 12, 17, 10,  5,  9,  9,  8,
        9, 12, 12, 10,  9, 12,  7, 11,  6,  9,  8,  7, 13,  7, 10, 17,  9,
       13, 10, 21,  9,  8,  8,  8, 13, 10,  5, 13,  8,  9,  9,  9,  7, 11,
        6, 11, 11, 21, 11, 12,  7,  5, 18,  6, 12,  9, 11,  8,  9,  5, 10,
        5,  8, 14,  5,  7