In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
columns = ["Sex", "Length", "Diameter", "Height", "Whole weight", "Shucked weight", "Viscera weight", "Shell weight", "Rings" ]
df = pd.read_csv(r"C:\Users\HP\Desktop\datasets\abalone.data.csv", names = columns )

In [5]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [6]:
df = pd.get_dummies(df, columns = ["Sex"])

In [7]:
df.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,1,0


In [8]:
df.isnull().sum()

Length            0
Diameter          0
Height            0
Whole weight      0
Shucked weight    0
Viscera weight    0
Shell weight      0
Rings             0
Sex_F             0
Sex_I             0
Sex_M             0
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Length          4177 non-null   float64
 1   Diameter        4177 non-null   float64
 2   Height          4177 non-null   float64
 3   Whole weight    4177 non-null   float64
 4   Shucked weight  4177 non-null   float64
 5   Viscera weight  4177 non-null   float64
 6   Shell weight    4177 non-null   float64
 7   Rings           4177 non-null   int64  
 8   Sex_F           4177 non-null   uint8  
 9   Sex_I           4177 non-null   uint8  
 10  Sex_M           4177 non-null   uint8  
dtypes: float64(7), int64(1), uint8(3)
memory usage: 273.4 KB


In [10]:
df1 = df.copy()
df2 = df.copy()
target = df['Rings']
df.drop(columns = {'Rings'}, inplace = True)

In [11]:
df.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,1,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,0,1,0


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [13]:
xtrain,xtest,ytrain,ytest = train_test_split(df,target,test_size = 0.3, random_state = 1)

In [14]:
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

In [15]:
lr = LinearRegression()
lr.fit(xtrain,ytrain)
yhat_lr = lr.predict(xtest)

In [16]:
from sklearn.metrics import mean_squared_error

In [17]:
print(mean_squared_error(yhat_lr,ytest ))

4.830342365148264


In [18]:
from xgboost import XGBRegressor

In [19]:
xgbr = XGBRegressor()

In [20]:
xgbr.fit(xtrain,ytrain)
yhat_xgbr = xgbr.predict(xtest)
print(mean_squared_error(yhat_xgbr,ytest))

5.039948116403518


In [31]:
print(r2_score(ytest,yhat_xgbr))

0.49877808869237095


In [21]:
feature_imp = pd.DataFrame(xgbr.feature_importances_)
columns = pd.DataFrame(df.columns)
pd.concat([columns,feature_imp], axis = 1)

Unnamed: 0,0,0.1
0,Length,0.026994
1,Diameter,0.043428
2,Height,0.044605
3,Whole weight,0.055975
4,Shucked weight,0.094647
5,Viscera weight,0.050685
6,Shell weight,0.44613
7,Sex_F,0.054494
8,Sex_I,0.143749
9,Sex_M,0.039292


In [22]:
df_test2 = df.copy()
df_test2.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,1,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,0,1,0


In [23]:
df_test2.drop(columns = {'Sex_F', 'Sex_I', 'Sex_M', 'Length'}, axis = 1, inplace = True)

In [24]:
xtrain2,xtest2,ytrain2,ytest2 = train_test_split(df_test2, target, test_size = 0.2, random_state = 0)

In [25]:
xgbr2 = XGBRegressor()
xgbr2.fit(xtrain2,ytrain2)
yhat_xgbr2 = xgbr2.predict(xtest2)

In [26]:
print(mean_squared_error(ytest2,yhat_xgbr2))

5.389653096139662


In [81]:
#rms = np.sqrt(mean_squared_error(yhat, y_test))
#rms
print("predicted values", yhat)
print("actial values", [y_test])

predicted values [13.2265625  9.109375  10.328125  ...  9.2734375 18.7890625 10.9609375]
actial values [668     13.0
1580     8.0
3784    11.0
463      5.0
2615    12.0
        ... 
1052    12.0
3439     8.0
1174     9.0
2210    18.0
2408    15.0
Name: Rings, Length: 1254, dtype: float64]


In [1]:
from sklearn.metrics import r2_score

In [27]:
print(r2_score(ytest,yhat_lr))

0.5196233420241054


In [28]:
from sklearn.model_selection import cross_val_score

In [29]:
lr1 = LinearRegression()
cv_score = cross_val_score(lr1, df,target, cv=5)
print(cv_score)

[0.42528943 0.1894657  0.49042783 0.5182331  0.45001915]


In [None]:
cv_score.