In [77]:
#importing library
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso

In [78]:
#loading the data
df=pd.read_csv('../input/80-cereals/cereal.csv')
df

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.00,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.50,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Triples,G,C,110,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75,39.106174
73,Trix,G,C,110,1,1,140,0.0,13.0,12,25,25,2,1.0,1.00,27.753301
74,Wheat Chex,R,C,100,3,1,230,3.0,17.0,3,115,25,1,1.0,0.67,49.787445
75,Wheaties,G,C,100,3,1,200,3.0,17.0,3,110,25,1,1.0,1.00,51.592193


In [79]:
#checking for null values
df.isna().sum()

name        0
mfr         0
type        0
calories    0
protein     0
fat         0
sodium      0
fiber       0
carbo       0
sugars      0
potass      0
vitamins    0
shelf       0
weight      0
cups        0
rating      0
dtype: int64

In [80]:
#getting information about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      77 non-null     object 
 1   mfr       77 non-null     object 
 2   type      77 non-null     object 
 3   calories  77 non-null     int64  
 4   protein   77 non-null     int64  
 5   fat       77 non-null     int64  
 6   sodium    77 non-null     int64  
 7   fiber     77 non-null     float64
 8   carbo     77 non-null     float64
 9   sugars    77 non-null     int64  
 10  potass    77 non-null     int64  
 11  vitamins  77 non-null     int64  
 12  shelf     77 non-null     int64  
 13  weight    77 non-null     float64
 14  cups      77 non-null     float64
 15  rating    77 non-null     float64
dtypes: float64(5), int64(8), object(3)
memory usage: 9.8+ KB


In [81]:
#replacing -1 with Nan because -1 is invalid values in dataset
df=df.replace(-1,np.NaN)

In [82]:
#Now checking for null values
df.isna().sum()
#replacing null values with mean of the column
for column in ['carbo','sugars','potass']:
    df[column]=df[column].fillna(df[column].mean())

In [83]:
#dropping the name colum as it is not provided any value to the feature set
df.drop('name',axis=1,inplace=True)

# Preprocessing

In [84]:
#checking for unique values in categorical  values
{column:list(df[column].unique()) for column in ['mfr','type']}

{'mfr': ['N', 'Q', 'K', 'R', 'G', 'P', 'A'], 'type': ['C', 'H']}

In [85]:
#now replacing type column with numerical values
df['type']=df['type'].apply(lambda x:0 if x=='C' else 1)

In [86]:
#now since mfr is a categorical column with not having any order
#so applying one hot encoding in the column
dummies=pd.get_dummies(df['mfr'])
#now concating dummies column to original dataframe
df=pd.concat([df,dummies],axis=1)
#droping the original column from the dataset
df=df.drop('mfr',axis=1)

In [87]:
df

Unnamed: 0,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,...,weight,cups,rating,A,G,K,N,P,Q,R
0,0,70,4,1,130,10.0,5.0,6.0,280.000000,25,...,1.0,0.33,68.402973,0,0,0,1,0,0,0
1,0,120,3,5,15,2.0,8.0,8.0,135.000000,0,...,1.0,1.00,33.983679,0,0,0,0,0,1,0
2,0,70,4,1,260,9.0,7.0,5.0,320.000000,25,...,1.0,0.33,59.425505,0,0,1,0,0,0,0
3,0,50,4,0,140,14.0,8.0,0.0,330.000000,25,...,1.0,0.50,93.704912,0,0,1,0,0,0,0
4,0,110,2,2,200,1.0,14.0,8.0,98.666667,25,...,1.0,0.75,34.384843,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,0,110,2,1,250,0.0,21.0,3.0,60.000000,25,...,1.0,0.75,39.106174,0,1,0,0,0,0,0
73,0,110,1,1,140,0.0,13.0,12.0,25.000000,25,...,1.0,1.00,27.753301,0,1,0,0,0,0,0
74,0,100,3,1,230,3.0,17.0,3.0,115.000000,25,...,1.0,0.67,49.787445,0,0,0,0,0,0,1
75,0,100,3,1,200,3.0,17.0,3.0,110.000000,25,...,1.0,1.00,51.592193,0,1,0,0,0,0,0


In [88]:
#now storing the target and feature column to the variables
#target column
y=df['rating']
#feature dataset
x=df.drop('rating',axis=1)

In [89]:
#now scaling the feature dataset
scaler=StandardScaler()

pd.DataFrame(scaler.fit_transform(x),columns=x.columns,index=x.index)

Unnamed: 0,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,A,G,K,N,P,Q,R
0,-0.201347,-1.905397,1.337319,-0.012988,-0.356306,3.314439,-2.542013,-0.237495,2.627053,-0.14627,0.957813,-0.198067,-2.123870,-0.114708,-0.632456,-0.652630,3.439961,-0.363803,-0.340503,-0.340503
1,-0.201347,0.677623,0.417912,3.987349,-1.737087,-0.064172,-1.764055,0.225316,0.526376,-1.27255,0.957813,-0.198067,0.774053,-0.114708,-0.632456,-0.652630,-0.290701,-0.363803,2.936835,-0.340503
2,-0.201347,-1.905397,1.337319,-0.012988,1.204578,2.892113,-2.023374,-0.468901,3.206550,-0.14627,0.957813,-0.198067,-2.123870,-0.114708,-0.632456,1.532262,-0.290701,-0.363803,-0.340503,-0.340503
3,-0.201347,-2.938605,1.337319,-1.013072,-0.236238,5.003745,-1.764055,-1.625929,3.351425,-0.14627,0.957813,-0.198067,-1.388576,-0.114708,-0.632456,1.532262,-0.290701,-0.363803,-0.340503,-0.340503
4,-0.201347,0.161019,-0.501495,0.987096,0.484170,-0.486498,-0.208138,0.225316,0.000000,-0.14627,0.957813,-0.198067,-0.307262,-0.114708,-0.632456,-0.652630,-0.290701,-0.363803,-0.340503,2.936835
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,-0.201347,0.161019,-0.501495,-0.012988,1.084510,-0.908824,1.607098,-0.931712,-0.560180,-0.14627,0.957813,-0.198067,-0.307262,-0.114708,1.581139,-0.652630,-0.290701,-0.363803,-0.340503,-0.340503
73,-0.201347,0.161019,-1.420902,-0.012988,-0.236238,-0.908824,-0.467457,1.150938,-1.067240,-0.14627,-0.251230,-0.198067,0.774053,-0.114708,1.581139,-0.652630,-0.290701,-0.363803,-0.340503,-0.340503
74,-0.201347,-0.355585,0.417912,-0.012988,0.844374,0.358155,0.569820,-0.931712,0.236628,-0.14627,-1.460273,-0.198067,-0.653283,-0.114708,-0.632456,-0.652630,-0.290701,-0.363803,-0.340503,2.936835
75,-0.201347,-0.355585,0.417912,-0.012988,0.484170,0.358155,0.569820,-0.931712,0.164191,-0.14627,-1.460273,-0.198067,0.774053,-0.114708,1.581139,-0.652630,-0.290701,-0.363803,-0.340503,-0.340503


In [90]:
#spliting the dataset into training and testing dataset
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=1)

In [91]:
#training the model
model=LinearRegression()
l1_model=Ridge(alpha=1.0)
l2_model=Lasso(alpha=1.0)

In [92]:
#fitting the feature dataset into model
model.fit(x_train,y_train)
l1_model.fit(x_train,y_train)
l2_model.fit(x_train,y_train)

Lasso()

In [93]:
#checking the score for each model
model_score=model.score(x_test,y_test)
l1_score=l1_model.score(x_test,y_test)
l2_score=l2_model.score(x_test,y_test)

In [94]:
print('R^2 Score with Linear Regression:{:.2f}'.format(model_score))
print('R^2 Score with Ridge :{:.2f}'.format(l1_score))
print('R^2 Score with Lasso :{:.2f}'.format(l2_score))



R^2 Score with Linear Regression:0.94
R^2 Score with Ridge :0.96
R^2 Score with Lasso :0.95
