In [10]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns


In [12]:
pip install xgboost

Collecting xgboostNote: you may need to restart the kernel to use updated packages.

  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 262.6 kB/s eta 0:07:56
   ---------------------------------------- 0.1/124.9 MB 409.6 kB/s eta 0:05:05
   ---------------------------------------- 0.2/124.9 MB 1.1 MB/s eta 0:01:49
   ---------------------------------------- 0.4/124.9 MB 1.5 MB/s eta 0:01:22
   ---------------------------------------- 0.6/124.9 MB 1.9 MB/s eta 0:01:05
   ---------------------------------------- 0.7/124.9 MB 2.2 MB/s eta 0:00:58
   ---------------------------------------- 0.9/124.9 MB 2.3 MB/s eta 0:00:54
   ---------------------------------------- 1.1/124.9 MB 2.5 MB/s eta 0:00:51
   -------------

In [14]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [15]:
# 1. Reading data from CSV
def read_csv(file_path):
   return pd.read_csv(file_path)

In [16]:
#2. Getting information and statistics about over dataset
def dataset_info_statistics(data):
   # Display general information about the dataset
    print("Dataset Information:")
    print(data.info())
    print("\n")

    # Display basic statistics for numerical columns
    print("Basic Statistics for Numerical Columns:")
    print(data.describe())
    print("\n")

#3.check for the null values in the dataset
def check_null(data):
  null_counts = data.isnull().sum()
  print("Null Values in the Dataset:")
  return null_counts

#4.check for duplicated rows in the dataset
def check_duplicates(data):
   return data.duplicated().any()
   #5. getting basic analysis for numerical and categorical columns
def plot_graph(data):

    numerical_columns = data.select_dtypes(include=np.number).columns

    for column in numerical_columns:
        plt.figure(figsize=(5,3))
        sns.distplot(data[column],kde=True)
        plt.title(f"Histogram for {column}")
        plt.xlabel(column)
        plt.ylabel("Frequency")
        plt.show()

    categorical_columns = data.select_dtypes(include='object').columns
    for column in categorical_columns:
        plt.figure(figsize=(5, 3))
        sns.countplot(data[column])
        plt.title(f'Countplot for {column}')
        plt.xlabel(column)
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.show()

#6. Seperate feature and target
def seperate_features_target(data,target_column):

    X = data.drop(columns=[target_column],axis=1)
    y = data[target_column]

    return X,y
#7. Train test split
def perform_train_test_split(X, y, test_size=0.20, random_state=42):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test

In [17]:
calories = read_csv('calories.csv')
exercise = read_csv('exercise.csv')

In [18]:
data = pd.merge(calories, exercise, on='User_ID')

In [20]:
data.head()

Unnamed: 0,User_ID,Calories,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,14733363,231.0,male,68,190.0,94.0,29.0,105.0,40.8
1,14861698,66.0,female,20,166.0,60.0,14.0,94.0,40.3
2,11179863,26.0,male,69,179.0,79.0,5.0,88.0,38.7
3,16180408,71.0,female,34,179.0,71.0,13.0,100.0,40.5
4,17771927,35.0,female,27,154.0,58.0,10.0,81.0,39.8


In [21]:
dataset_info_statistics(data)

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   User_ID     15000 non-null  int64  
 1   Calories    15000 non-null  float64
 2   Gender      15000 non-null  object 
 3   Age         15000 non-null  int64  
 4   Height      15000 non-null  float64
 5   Weight      15000 non-null  float64
 6   Duration    15000 non-null  float64
 7   Heart_Rate  15000 non-null  float64
 8   Body_Temp   15000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 1.0+ MB
None


Basic Statistics for Numerical Columns:
            User_ID      Calories           Age        Height        Weight  \
count  1.500000e+04  15000.000000  15000.000000  15000.000000  15000.000000   
mean   1.497736e+07     89.539533     42.789800    174.465133     74.966867   
std    2.872851e+06     62.456978     16.980264     14.258114     15.03565

In [22]:
check_null(data)

Null Values in the Dataset:


User_ID       0
Calories      0
Gender        0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
dtype: int64

In [23]:
data.columns

Index(['User_ID', 'Calories', 'Gender', 'Age', 'Height', 'Weight', 'Duration',
       'Heart_Rate', 'Body_Temp'],
      dtype='object')

In [24]:
X,y = seperate_features_target(data,'Calories')

In [25]:
X = X.drop(columns=['User_ID'])

In [26]:
X_train,X_test,y_train,y_test = perform_train_test_split(X, y, test_size=0.20, random_state=42)

In [27]:
preprocessor = ColumnTransformer(transformers=[
    ('ordinal',OrdinalEncoder(),['Gender']),
    ('num',StandardScaler(),['Age',
                            'Height',
                            'Weight',
                            'Duration',
                            'Heart_Rate',
                            'Body_Temp']),
],remainder='passthrough')

In [28]:
pipeline = Pipeline([("preprocessor",preprocessor),
                     ("model",LinearRegression())
                    ])

In [29]:
from sklearn import set_config

In [30]:
set_config(display='diagram')

In [31]:
pipeline

In [32]:
    pipeline.fit(X_train,y_train)

In [33]:
y_pred = pipeline.predict(X_test)

In [34]:
from sklearn.metrics import r2_score

In [35]:
r2_score(y_test,y_pred)

0.9672937151257295

In [36]:
from sklearn.model_selection import KFold

In [37]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [38]:
from sklearn.model_selection import cross_val_score

In [39]:
cv_results = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2')

In [40]:
cv_results.mean()

0.9671402283675841

In [41]:
from sklearn.metrics import mean_absolute_error

In [42]:
mean_absolute_error(y_test,y_pred)

8.441513553849704

In [43]:
def model_scorer(model_name,model):

    output=[]


    output.append(model_name)

    pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('model',model)])

    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    output.append(r2_score(y_test,y_pred))
    output.append(mean_absolute_error(y_test,y_pred))

    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2')
    output.append(cv_results.mean())

    return output

In [44]:
model_dict={
    'log':LinearRegression(),
    'RF':RandomForestRegressor(),
    'XGBR':XGBRegressor(),
}

In [45]:
model_output=[]
for model_name,model in model_dict.items():
    model_output.append(model_scorer(model_name,model))

In [47]:
model_output

[['log', 0.9672937151257295, 8.441513553849704, 0.9671402283675841],
 ['RF', 0.9982281748928734, 1.7101266666666664, 0.9979297049995388],
 ['XGBR', 0.9988678909361673, 1.4981198125282924, 0.9988510864545181]]

In [48]:
preprocessor = ColumnTransformer(transformers=[
    ('ordinal',OrdinalEncoder(),['Gender']),
    ('num',StandardScaler(),['Age',
                            'Height',
                            'Weight',
                            'Duration',
                            'Heart_Rate',
                            'Body_Temp']),

],remainder='passthrough')

In [49]:
pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('model',XGBRegressor())

])

In [50]:
pipeline.fit(X,y)

In [51]:
sample = pd.DataFrame({
   'Gender':'male',
    'Age':68,
    'Height':190.0,
    'Weight':94.0,
    'Duration':29.0,
    'Heart_Rate':105.0,
    'Body_Temp':40.8,
},index=[0])

In [52]:
pipeline.predict(sample)

array([231.0721], dtype=float32)

In [53]:
import pickle

In [54]:
with open('pipeline.pkl','wb') as f:
    pickle.dump(pipeline,f)

In [55]:
with open('pipeline.pkl','rb') as f:
    pipeline_saved = pickle.load(f)

In [56]:
result = pipeline_saved.predict(sample)

In [57]:
result

array([231.0721], dtype=float32)

In [59]:
!apt-get install xvfb # Install Xvfb on the system
!pip install xvfbwrapper
import pickle
import pandas as pd
from tkinter import *
import os
from xvfbwrapper import Xvfb # xvfbwrapper was not installed

# Start virtual display
vdisplay = Xvfb()
vdisplay.start()

def show_entry():

    with open('pipeline.pkl','rb') as f:
        pipeline = pickle.load(f)

    p1 = str(clicked.get())
    p2 = float(e2.get())
    p3 = float(e3.get())
    p4 = float(e4.get())
    p5 = float(e5.get())
    p6 = float(e6.get())
    p7 = float(e7.get())

    sample = pd.DataFrame({
    'Gender':[p1],
    'Age':[p2],
    'Height':[p3],
    'Weight':[p4],
    'Duration':[p5],
    'Heart_Rate':[p6],
    'Body_Temp':[p7],
},index=[0])

    result = pipeline.predict(sample)
    print(result)
    Label(master, text="Amount of Calories Burnt").grid(row=13)
    Label(master, text=result[0]).grid(row=14)


master =Tk()
master.title("Calories Burnt Prediction using Machine Learning")
label = Label(master,text = "Calories Burnt Prediction",bg = "black",
               fg = "white").grid(row=0,columnspan=2)

Label(master,text = "Select Gender").grid(row=1)
Label(master,text = "Enter Your Age").grid(row=2)
Label(master,text = "Enter Your Height").grid(row=3)
Label(master,text = "Enter Your Weight").grid(row=4)
Label(master,text = "Duration").grid(row=5)
Label(master,text = "Heart Rate").grid(row=6)
Label(master,text = "Body Temp").grid(row=7)

clicked = StringVar()
options = ['male', 'female']

e1 = OptionMenu(master , clicked , *options )
e1.configure(width=15)
e2 = Entry(master)
e3 = Entry(master)
e4 = Entry(master)
e5 = Entry(master)
e6 = Entry(master)
e7 = Entry(master)


e1.grid(row=1,column=1)
e2.grid(row=2,column=1)
e3.grid(row=3,column=1)
e4.grid(row=4,column=1)
e5.grid(row=5,column=1)
e6.grid(row=6,column=1)
e7.grid(row=7,column=1)



Button(master,text="Predict",command=show_entry).grid()

mainloop()

# Stop the virtual display when done
vdisplay.stop()

'apt-get' is not recognized as an internal or external command,
operable program or batch file.




ModuleNotFoundError: No module named 'fcntl'