In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBRegressor

In [3]:
calories = pd.read_csv('calories.csv')
exercise = pd.read_csv('exercise.csv')

In [4]:
calories.shape

(15000, 2)

In [5]:
exercise.shape

(15000, 8)

In [6]:
calories.head()

Unnamed: 0,User_ID,Calories
0,14733363,231.0
1,14861698,66.0
2,11179863,26.0
3,16180408,71.0
4,17771927,35.0


In [7]:
exercise.head()

Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,14733363,male,68,190.0,94.0,29.0,105.0,40.8
1,14861698,female,20,166.0,60.0,14.0,94.0,40.3
2,11179863,male,69,179.0,79.0,5.0,88.0,38.7
3,16180408,female,34,179.0,71.0,13.0,100.0,40.5
4,17771927,female,27,154.0,58.0,10.0,81.0,39.8


In [8]:
data = pd.merge(calories, exercise, on='User_ID')

In [9]:
data.head()

Unnamed: 0,User_ID,Calories,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,14733363,231.0,male,68,190.0,94.0,29.0,105.0,40.8
1,14861698,66.0,female,20,166.0,60.0,14.0,94.0,40.3
2,11179863,26.0,male,69,179.0,79.0,5.0,88.0,38.7
3,16180408,71.0,female,34,179.0,71.0,13.0,100.0,40.5
4,17771927,35.0,female,27,154.0,58.0,10.0,81.0,39.8


In [10]:
data.shape

(15000, 9)

In [11]:
from sklearn.preprocessing import OneHotEncoder


In [13]:
gender_encoder = OneHotEncoder()
gender_data = data[['Gender']] # select the 'gender' column

# fit and transform the 'gender' data using the encoder
gender_encoded = gender_encoder.fit_transform(gender_data)

# create a new dataframe with the encoded data
gender_encoded_df = pd.DataFrame(gender_encoded.toarray(), columns=gender_encoder.get_feature_names_out(['Gender']))

# concatenate the original dataframe and the encoded dataframe
data = pd.concat([data, gender_encoded_df], axis=1)

In [14]:
data.head()

Unnamed: 0,User_ID,Calories,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Gender_female,Gender_male
0,14733363,231.0,male,68,190.0,94.0,29.0,105.0,40.8,0.0,1.0
1,14861698,66.0,female,20,166.0,60.0,14.0,94.0,40.3,1.0,0.0
2,11179863,26.0,male,69,179.0,79.0,5.0,88.0,38.7,0.0,1.0
3,16180408,71.0,female,34,179.0,71.0,13.0,100.0,40.5,1.0,0.0
4,17771927,35.0,female,27,154.0,58.0,10.0,81.0,39.8,1.0,0.0


In [16]:
data = data.drop('Gender_female', axis=1)

In [17]:
data = data.drop('Gender_male', axis=1)

In [18]:
data.head()

Unnamed: 0,User_ID,Calories,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,14733363,231.0,male,68,190.0,94.0,29.0,105.0,40.8
1,14861698,66.0,female,20,166.0,60.0,14.0,94.0,40.3
2,11179863,26.0,male,69,179.0,79.0,5.0,88.0,38.7
3,16180408,71.0,female,34,179.0,71.0,13.0,100.0,40.5
4,17771927,35.0,female,27,154.0,58.0,10.0,81.0,39.8


In [20]:
data = data.drop('User_ID', axis=1)

In [21]:
data.head()

Unnamed: 0,Calories,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,231.0,male,68,190.0,94.0,29.0,105.0,40.8
1,66.0,female,20,166.0,60.0,14.0,94.0,40.3
2,26.0,male,69,179.0,79.0,5.0,88.0,38.7
3,71.0,female,34,179.0,71.0,13.0,100.0,40.5
4,35.0,female,27,154.0,58.0,10.0,81.0,39.8


In [22]:
from sklearn.preprocessing import LabelEncoder

In [23]:
le = LabelEncoder()

In [24]:
data['Gender'] = le.fit_transform(data['Gender'])

In [25]:
data.head()

Unnamed: 0,Calories,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,231.0,1,68,190.0,94.0,29.0,105.0,40.8
1,66.0,0,20,166.0,60.0,14.0,94.0,40.3
2,26.0,1,69,179.0,79.0,5.0,88.0,38.7
3,71.0,0,34,179.0,71.0,13.0,100.0,40.5
4,35.0,0,27,154.0,58.0,10.0,81.0,39.8


In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15000 entries, 0 to 14999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Calories    15000 non-null  float64
 1   Gender      15000 non-null  int64  
 2   Age         15000 non-null  int64  
 3   Height      15000 non-null  float64
 4   Weight      15000 non-null  float64
 5   Duration    15000 non-null  float64
 6   Heart_Rate  15000 non-null  float64
 7   Body_Temp   15000 non-null  float64
dtypes: float64(6), int64(2)
memory usage: 1.0 MB


In [27]:
X = data.drop('Calories', axis=1)
Y = data['Calories']

In [28]:
print(X)

       Gender  Age  Height  Weight  Duration  Heart_Rate  Body_Temp
0           1   68   190.0    94.0      29.0       105.0       40.8
1           0   20   166.0    60.0      14.0        94.0       40.3
2           1   69   179.0    79.0       5.0        88.0       38.7
3           0   34   179.0    71.0      13.0       100.0       40.5
4           0   27   154.0    58.0      10.0        81.0       39.8
...       ...  ...     ...     ...       ...         ...        ...
14995       0   20   193.0    86.0      11.0        92.0       40.4
14996       0   27   165.0    65.0       6.0        85.0       39.2
14997       0   43   159.0    58.0      16.0        90.0       40.1
14998       1   78   193.0    97.0       2.0        84.0       38.3
14999       1   63   173.0    79.0      18.0        92.0       40.5

[15000 rows x 7 columns]


In [29]:
print(Y)

0        231.0
1         66.0
2         26.0
3         71.0
4         35.0
         ...  
14995     45.0
14996     23.0
14997     75.0
14998     11.0
14999     98.0
Name: Calories, Length: 15000, dtype: float64


In [30]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [32]:
model = XGBRegressor()

In [33]:
model.fit(X_train, Y_train)

In [36]:
from sklearn.metrics import r2_score
X_train_prediction = model.predict(X_train)
training_data_accuracy = r2_score(X_train_prediction, Y_train)
training_data_accuracy

0.9995518451184217

In [37]:
X_test_prediction = model.predict(X_test)
testing_data_accuracy = r2_score(X_test_prediction, Y_test)
testing_data_accuracy

0.9988436889000072