In [None]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Load the abalone CSV from its hosted location into a DataFrame and preview
# the first rows.
ABALONE_CSV_URL = 'https://raw.githubusercontent.com/dsrscientist/dataset1/master/abalone.csv'
df = pd.read_csv(ABALONE_CSV_URL)
df.head()

# Exploratory Data Analysis (EDA)

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape


There are 4177 rows and 9 columns in dataset

In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

There are no null values in the dataset

In [None]:
# Summary statistics for the dataset's columns.
df.describe()


We can see that there is not much difference between the mean and the 50% value (the 2nd quartile, i.e. the median).

In [None]:
# Column names, dtypes and non-null counts.
df.info()

We can see that all columns are of float datatype except 'Sex', which is of object datatype, and 'Rings', which is of integer datatype.

In [None]:
# Data visualisation: scatter plot of shell length against ring count,
# using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(21, 7))
ax.scatter(df['Length'], df['Rings'])
ax.set_xlabel("Length of Abalone(mm)")
ax.set_ylabel("Number of rings")
plt.show()

We can observe some kind of positive relationship in this

In [None]:
# Scatter plot of shell height against ring count, using the explicit
# figure/axes interface.
fig, ax = plt.subplots(figsize=(21, 7))
ax.scatter(df['Height'], df['Rings'])
ax.set_xlabel("Height of Abalone(mm)")
ax.set_ylabel("Number of rings")
plt.show()

We can see 2 outliers in this scatter plot. Relationship is positive

# Feature Engineering

In [None]:
# One-hot encoding: replace the categorical sex column with one indicator
# column per category.
# dtype=int is passed explicitly because pandas >= 2.0 produces boolean
# (True/False) dummy columns by default; the integer 0/1 encoding is what this
# notebook describes and is stable across pandas versions.
encoded_df = pd.get_dummies(df, dtype=int)
encoded_df.head()

We can see that new columns have been added to our dataset. Each row gets a '1' in the column matching its sex: if the abalone is male then Sex_M is 1, if it is female then Sex_F is 1, and if it is an infant then Sex_I is 1. The other two sex columns are 0.

# Model building

In [None]:
# Partition by row position: the first TRAIN_ROW_COUNT rows are used for
# training/testing, the remaining trailing rows are held out for validation.
TRAIN_ROW_COUNT = 4099
training_set = encoded_df.iloc[:TRAIN_ROW_COUNT]
validation_set = encoded_df.iloc[TRAIN_ROW_COUNT:]

In [None]:
# Shape of the training partition as (rows, columns).
training_set.shape

Our training set has 4099 rows and all 11 columns

In [None]:
# Shape of the held-out validation partition as (rows, columns).
validation_set.shape

Our validation set has 78 rows and 11 columns

In [None]:
# Features for the linear models are shell length and height; the target is
# the ring count, shaped as an (n, 1) column vector.
feature_cols = ['Length', 'Height']

x = training_set[feature_cols]
y = training_set['Rings'].to_numpy().reshape(-1, 1)

x_val = validation_set[feature_cols]
y_val = validation_set['Rings'].to_numpy().reshape(-1, 1)

The validation set will give us an idea of how well our model predicts on unseen data.

In [None]:
# Split the training partition into train and test subsets, holding out 10%
# for testing with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Multiple linear regression on the selected features.
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)

# Test RMSE.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
lin_reg_pred = lin_reg.predict(x_test)
test_rmse = np.sqrt(mean_squared_error(y_test, lin_reg_pred))

print("Test RMSE=", test_rmse)

# Validation RMSE on the held-out validation rows.
lin_reg_pred_val = lin_reg.predict(x_val)
val_rmse = np.sqrt(mean_squared_error(y_val, lin_reg_pred_val))
print("Validation RMSE=", val_rmse)

So our Test RMSE is 2.57 and validation RMSE is 1.57

In [None]:
# Multiple linear regression with regularization (Ridge regression):
# 5-fold grid search over alpha, scored by (negated) RMSE.
ridge = Ridge()
parameters = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20],
}
ridge_regressor = GridSearchCV(
    estimator=ridge,
    param_grid=parameters,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
ridge_regressor.fit(x_train, y_train)

# best_score_ holds the negated RMSE, so flip the sign for display.
print(ridge_regressor.best_params_)
print(-ridge_regressor.best_score_)

The grid search reports alpha = 1 as the best parameter. Let's retrain our model with it.

In [None]:
# Refit Ridge with the alpha selected by the grid search.
ridge_reg = Ridge(alpha=1)
ridge_reg.fit(x_train, y_train)

# Test RMSE.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
ridge_reg_pred = ridge_reg.predict(x_test)
test_rmse = np.sqrt(mean_squared_error(y_test, ridge_reg_pred))

print("Test RMSE=", test_rmse)

# Validation RMSE on the held-out validation rows.
ridge_reg_pred_val = ridge_reg.predict(x_val)
val_rmse = np.sqrt(mean_squared_error(y_val, ridge_reg_pred_val))
print("Validation RMSE=", val_rmse)


So our test RMSE is 2.58 and our validation RMSE is 1.56. On the validation set this model (ridge regression) is slightly better than the previous model (linear regression, 1.57), although its test RMSE is marginally higher (2.58 vs 2.57).

# Multiple linear regression lasso

In [None]:
# Lasso regression: 5-fold grid search over alpha, scored by (negated) RMSE.
lasso = Lasso()
parameters = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20],
}
lasso_regressor = GridSearchCV(
    estimator=lasso,
    param_grid=parameters,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
lasso_regressor.fit(x_train, y_train)

# best_score_ holds the negated RMSE, so flip the sign for display.
print(lasso_regressor.best_params_)
print(-lasso_regressor.best_score_)

In [None]:
# Refit Lasso with a fixed alpha of 0.01.
lasso_reg = Lasso(alpha=0.01)
lasso_reg.fit(x_train, y_train)

# Test RMSE.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
lasso_reg_pred = lasso_reg.predict(x_test)
test_rmse = np.sqrt(mean_squared_error(y_test, lasso_reg_pred))

print("Test RMSE=", test_rmse)

# Validation RMSE on the held-out validation rows.
lasso_reg_pred_val = lasso_reg.predict(x_val)
val_rmse = np.sqrt(mean_squared_error(y_val, lasso_reg_pred_val))
print("Validation RMSE=", val_rmse)


Now, let's try some other models which are tree based. In these models, we will be considering only the 'Length', 'Height' and 'Sex' of the abalone as features, and the target variable will be 'Rings'.

In [None]:
# Tree based model: keep every column except the target and the dropped
# measurements below as features.
dropped_cols = ['Rings', 'Diameter', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight']

x = training_set.drop(dropped_cols, axis=1)
y = training_set['Rings'].values.reshape(-1, 1)

x_val = validation_set.drop(dropped_cols, axis=1)
# The validation target must come from validation_set so that it lines up
# row-for-row with x_val; taking it from training_set gives a length mismatch
# when the validation RMSE is computed.
y_val = validation_set['Rings'].values.reshape(-1, 1)


In [None]:
# Split the tree-model features/target into train and test subsets, holding
# out 10% for testing with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=42,
)

# Random forest regressor

In [None]:
# Random forest: 5-fold grid search over the number of trees.
# random_state is fixed so the search is reproducible across runs.
rf = RandomForestRegressor(random_state=42)
# 100, 200, ..., 1000 built directly as integers.
parameters = {'n_estimators': list(range(100, 1001, 100))}
# Scored by RMSE (not MSE) so the printed score is on the same scale as the
# Ridge/Lasso searches and the RMSE values reported elsewhere.
rf_regressor = GridSearchCV(rf, parameters, scoring='neg_root_mean_squared_error', cv=5)
rf_regressor.fit(x_train, y_train.ravel())

# best_score_ holds the negated RMSE, so flip the sign for display.
print(rf_regressor.best_params_)
print(-rf_regressor.best_score_)

In [None]:
# Refit the random forest with 300 trees; random_state is fixed so the
# reported RMSE values are reproducible.
rf_reg = RandomForestRegressor(n_estimators=300, random_state=42)
rf_reg.fit(x_train, y_train.ravel())

# Test RMSE.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rf_reg_pred = rf_reg.predict(x_test)
test_rmse = np.sqrt(mean_squared_error(y_test, rf_reg_pred))

print("Test RMSE=", test_rmse)

# Validation RMSE on the held-out validation rows.
rf_reg_pred_val = rf_reg.predict(x_val)
val_rmse = np.sqrt(mean_squared_error(y_val, rf_reg_pred_val))
print("Validation RMSE=", val_rmse)


In [None]:
# Bagging regressor

In [None]:
# Bagging: 5-fold grid search over ensemble size and per-estimator sample
# fraction. random_state is fixed so the search is reproducible across runs.
bg = BaggingRegressor(random_state=42)
parameters = {
    # 100, 200, ..., 1000 built directly as integers.
    'n_estimators': list(range(100, 1001, 100)),
    # 0.1, 0.2, ..., 1.0. linspace + round avoids the floating-point drift of
    # np.arange with a fractional step (e.g. 0.30000000000000004), which could
    # also push the last value past the valid upper bound of 1.0.
    'max_samples': np.round(np.linspace(0.1, 1.0, 10), 1).tolist(),
}
# Scored by RMSE (not MSE) so the printed score is on the same scale as the
# Ridge/Lasso searches and the RMSE values reported elsewhere.
bg_regressor = GridSearchCV(bg, parameters, scoring='neg_root_mean_squared_error', cv=5)
bg_regressor.fit(x_train, y_train.ravel())

# best_score_ holds the negated RMSE, so flip the sign for display.
print(bg_regressor.best_params_)
print(-bg_regressor.best_score_)

In [None]:
# Refit the bagging ensemble with fixed hyperparameters; random_state is set
# so the reported RMSE values are reproducible.
# NOTE(review): n_estimators=670 is not one of the values in the grid searched
# for bg_regressor (100, 200, ..., 1000) - confirm where it came from, or use
# bg_regressor.best_params_ instead.
bg_reg = BaggingRegressor(n_estimators=670, max_samples=0.1, random_state=42)
bg_reg.fit(x_train, y_train.ravel())

# Test RMSE.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
bg_reg_pred = bg_reg.predict(x_test)
test_rmse = np.sqrt(mean_squared_error(y_test, bg_reg_pred))

print("Test RMSE=", test_rmse)

# Validation RMSE on the held-out validation rows.
bg_reg_pred_val = bg_reg.predict(x_val)
val_rmse = np.sqrt(mean_squared_error(y_val, bg_reg_pred_val))
print("Validation RMSE=", val_rmse)


In [None]:
# Summary table of the RMSE recorded for each model (values are hard-coded
# here, not recomputed from the fitted models).
model_names = ['Linear', 'Ridge', 'LAsso', 'random Forest', 'bagging']
rmse_scores = [
    1.5661783782311625,
    1.5640813253014876,
    1.5563831232308813,
    2.2334,
    1.5097,
]
model_eval = dict(model=model_names, RMSE=rmse_scores)

df_val = pd.DataFrame(model_eval)
df_val

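BaggingRegressor is working better as compared to the other regression models: it has the lowest RMSE in the table above (1.51), while RandomForestRegressor has the highest (2.23).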