In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib

In [2]:
abalone = pd.read_csv("C:/Users/vineet/Desktop/Abaloneedited.csv")

In [3]:
abalone.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
Sex                2000 non-null object
Height             2000 non-null float64
Diameter           2000 non-null float64
Length             2000 non-null float64
WholeWeight        2000 non-null float64
VisceraWeight      2000 non-null float64
Shuckled Weight    2000 non-null float64
ShellWeight        2000 non-null float64
Rings              2000 non-null int64
dtypes: float64(7), int64(1), object(1)
memory usage: 140.7+ KB


In [4]:
## No missing values, "sex" is only categorical attribute

In [5]:
abalone.head()

Unnamed: 0,Sex,Height,Diameter,Length,WholeWeight,VisceraWeight,Shuckled Weight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [6]:
# No difference between M/F so we group M and F together and separate from I
sex_to_int = {"I":0, "M":1, "F":1}
abalone["Sex"].replace(to_replace=sex_to_int, inplace=True)

In [7]:
abalone.head()

Unnamed: 0,Sex,Height,Diameter,Length,WholeWeight,VisceraWeight,Shuckled Weight,ShellWeight,Rings
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [8]:
# Age = Rings + 1.5
# We can get remove rings attribute from dataframe since new label is age 
abalone["age"] = abalone["Rings"] + 1.5
abalone.drop("Rings", axis=1, inplace=True)

In [9]:
abalone.head()

Unnamed: 0,Sex,Height,Diameter,Length,WholeWeight,VisceraWeight,Shuckled Weight,ShellWeight,age
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,16.5
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,8.5
2,1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,10.5
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,11.5
4,0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,8.5


In [10]:
X = np.array(abalone.drop("age", axis=1)) # Numerical data
y = np.array(abalone["age"]) # Label (age)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)


In [12]:
# May add more transformations, perhaps experimenting with different attribute combinations
pipeline = Pipeline([
    ("std_scaler", StandardScaler()) # feature scaling
    ])

X_train_preprocessed = pipeline.fit_transform(X_train)
X_test_preprocessed = pipeline.fit_transform(X_test)


In [13]:
# Fine-tuning
          

hyperparameters = {"n_estimators":[10, 15, 30],
                        "max_features":[2, 3, 4, 5],
                        "max_depth":[5, 10, 15, 20, 30],
                        "bootstrap":[True, False],
                        "warm_start":[True, False]}

In [14]:
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
rand_forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(rand_forest_reg, hyperparameters, cv=10,scoring="neg_mean_squared_error")

grid_search.fit(X_train_preprocessed, y_train)

model = grid_search.best_estimator_

joblib.dump(model, "abalone_model.pkl")

model = joblib.load("abalone_model.pkl")


In [15]:
predictions = model.predict(X_test_preprocessed)


In [16]:
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

RMSE: 2.393065567535393


In [17]:
r2 = model.score(X_test_preprocessed, y_test)
r = np.sqrt(r2)
print("Correlation:", r)

Correlation: 0.6729398316233758


In [18]:
print("Predictions:", predictions[:50])
print("Labels:", y_test[:50])

Predictions: [12.45394405 10.19799746  7.72359418 13.14486528 10.20318099  8.98092947
 12.56077731 12.27494706  7.32607143 12.81417989 16.23512129  9.69293413
 12.78458206 12.5667273  12.25345941 12.19306268 10.95047492 13.77141531
 10.95679987 12.30897314 12.65274357  7.34132395 11.21494968 13.14551634
 11.38068452 11.52845274 10.71858633 14.26455379 11.29969552  9.28295266
  9.49886236 11.94597611 14.70628018 11.55121177 10.5750575   8.42499688
 11.94527521  7.58330065 11.91263101 13.13743416 10.94448887 13.77649989
 13.28353306 11.19196049  9.97463988 16.99248695 12.15781306 13.22281408
 10.30630685 13.81214286]
Labels: [10.5  9.5  8.5  9.5  9.5  7.5 21.5 12.5  7.5 10.5 17.5 10.5 19.5 10.5
 15.5 10.5 16.5 15.5 10.5 10.5 12.5  6.5 12.5 16.5 11.5 12.5 10.5 10.5
 11.5  8.5  9.5 11.5 19.5 10.5  9.5  7.5 10.5  7.5 10.5 11.5  9.5 17.5
 11.5  9.5  9.5 16.5 11.5 13.5  9.5 12.5]
