In [1]:
#load the dataset into pandas dataframe
#Data Collection
import pandas as pd
data=pd.read_csv("calories.csv")

In [2]:
#show the first 5 rows in te dataset
data.head()


Unnamed: 0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,male,68,190,94,29,105,40.8,231
1,female,20,166,60,14,94,40.3,66
2,male,69,179,79,5,88,38.7,26
3,female,34,179,71,13,100,40.5,71
4,female,27,154,58,10,81,39.8,35


In [3]:
#Data processing:Data Formatting
#convert gender column to numerical values
data['Gender']=data['Gender'].replace({'male':0,'female':1}).astype(int)

In [4]:
#getting the number of rows and  columns
data.shape

(15000, 8)

In [5]:
#getting some information about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      15000 non-null  int32  
 1   Age         15000 non-null  int64  
 2   Height      15000 non-null  int64  
 3   Weight      15000 non-null  int64  
 4   Duration    15000 non-null  int64  
 5   Heart_Rate  15000 non-null  int64  
 6   Body_Temp   15000 non-null  float64
 7   Calories    15000 non-null  int64  
dtypes: float64(1), int32(1), int64(6)
memory usage: 879.0 KB


In [6]:
data.describe()

Unnamed: 0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,0.503533,42.7898,174.465133,74.966867,15.5306,95.518533,40.025453,89.539533
std,0.500004,16.980264,14.258114,15.035657,8.319203,9.583328,0.77923,62.456978
min,0.0,20.0,123.0,36.0,1.0,67.0,37.1,1.0
25%,0.0,28.0,164.0,63.0,8.0,88.0,39.6,35.0
50%,1.0,39.0,175.0,74.0,16.0,96.0,40.2,79.0
75%,1.0,56.0,185.0,87.0,23.0,103.0,40.6,138.0
max,1.0,79.0,222.0,132.0,30.0,128.0,41.5,314.0


In [7]:
#Data processing:Data Cleaning
#chech for null values in dataset
data.isnull().sum()

Gender        0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64

In [8]:
#Data processing:Data Cleaning
#check for duplicate rows in dataset
data.duplicated().sum()

1

In [9]:
data.loc[data.duplicated(keep=False),:]

Unnamed: 0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
8266,1,28,164,59,9,90,39.5,40
12852,1,28,164,59,9,90,39.5,40


In [10]:
#drp the duplicated row
data=data.drop_duplicates()

In [11]:
#check if there is still duplicated rows
data.duplicated().sum()

0

In [12]:
#number of rows after deletion of duplicated
data.shape

(14999, 8)

In [13]:

#step 1:calcualte BMI
data['BMI']=data['Weight']/(data['Height'] **2)

#step 2:drop the columns height and weight
data=data.drop(['Height','Weight'],axis=1)

data.head()

Unnamed: 0,Gender,Age,Duration,Heart_Rate,Body_Temp,Calories,BMI
0,0,68,29,105,40.8,231,0.002604
1,1,20,14,94,40.3,66,0.002177
2,0,69,5,88,38.7,26,0.002466
3,1,34,13,100,40.5,71,0.002216
4,1,27,10,81,39.8,35,0.002446


In [14]:
#Exploratory Data Analysis
#bodytemperature vs duration
import plotly.express as px
fig1=px.scatter(data,x='Duration',y='Body_Temp',color='Gender',hover_data=['Age','Calories'],title='Body temperature vs duration',
                labels={'Body_Temp':'Body Temperature','Duration':'Duration of exercise'} 
)
fig1.update_traces(marker_size=7)
fig1.show()

In [15]:
#Feature Engineering
#normalize our data 
data_max=data.max()
data=data.divide(data_max)
data.head()

Unnamed: 0,Gender,Age,Duration,Heart_Rate,Body_Temp,Calories,BMI
0,0.0,0.860759,0.966667,0.820312,0.983133,0.735669,0.895734
1,1.0,0.253165,0.466667,0.734375,0.971084,0.210191,0.74902
2,0.0,0.873418,0.166667,0.6875,0.93253,0.082803,0.848163
3,1.0,0.43038,0.433333,0.78125,0.975904,0.226115,0.762273
4,1.0,0.341772,0.333333,0.632812,0.959036,0.111465,0.841289


In [16]:
#seperating features(variables) and target
x=data.drop('Calories',axis=1)
y=data['Calories']

In [17]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
print(y_train)

9838     0.633758
7689     0.652866
6557     0.156051
6872     0.429936
820      0.073248
           ...   
5191     0.480892
13419    0.222930
5390     0.130573
860      0.181529
7270     0.187898
Name: Calories, Length: 11999, dtype: float64


In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
#create a linear regression model
linear_model=LinearRegression()

#train our model
linear_model.fit(x_train,y_train)
#make predictions on test set
y_pred_linear=linear_model.predict(x_test)


#evaluate model performance
mae=mean_absolute_error(y_test,y_pred_linear)
mse=mean_squared_error(y_test,y_pred_linear)
score=r2_score(y_test,y_pred_linear)
#print the evaluation metrics of the linear regression
print("linear regression")
print(f'mean absolute error {mae:.4f}')
print(f'mean squared error {mse:.4f}')
print(f'r2 score {score:.4f}')

linear regression
mean absolute error 0.0267
mean squared error 0.0013
r2 score 0.9660


In [19]:
sample=0
#second row from x_train
sample_features=x_train.iloc[sample:sample+1]

#predicted normalized calories
predicted_normalized=linear_model.predict(sample_features)

#convert predicted normalized calories to original scale
predicted_cal=predicted_normalized[0]*data_max['Calories']

#actual_calories from y_train 
actual_calories=y_train.iloc[0:1]*data_max['Calories']

print(f'actual calories are: {actual_calories.values[0]:.2f}') # show 2 digits after the decimal point
print(f'predicted calories are: {predicted_cal:.2f}')

actual calories are: 199.00
predicted calories are: 183.39


In [21]:
#we need the joblib to load the model to linearmodel.pkl(pickle file) and then to load this model from this file
import joblib
joblib.dump(linear_model,"linearmodel.pkl")

['linearmodel.pkl']