In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression


In [None]:
# Read the raw dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="calories.csv")
df.head(n=5)


In [None]:
# Number of missing entries in each column (summing the boolean mask down the rows).
df.isnull().sum(axis=0)


In [None]:
# Impute missing values: the most frequent value for "Gender", the column
# median for each of the remaining columns listed below.
#
# Why not `df[col].fillna(..., inplace=True)`: that form calls an inplace
# method on a Series obtained by indexing the frame (chained assignment).
# pandas 2.x emits a FutureWarning for it, and with Copy-on-Write enabled
# (the default in pandas 3) the fill only touches a temporary object, so `df`
# would silently keep its NaNs. Filling through `DataFrame.fillna` with a
# column -> value mapping updates the frame itself on every pandas version.
#
# NOTE(review): the fill statistics are computed on the full dataset (before
# the train/test split) and the target column "Calories" is imputed as well.
# Behaviour is kept as-is here; confirm that this is intended.
median_filled_columns = [
    "Age",
    "Height",
    "Weight",
    "Duration",
    "Heart_Rate",
    "Body_Temp",
    "Calories",
]

fill_values = {"Gender": df["Gender"].mode()[0]}
fill_values.update(
    {column: df[column].median() for column in median_filled_columns}
)

# Columns that are not keys of `fill_values` are left untouched.
df = df.fillna(value=fill_values)


In [None]:
# Split the frame into the response (y) and the predictors (X).
# "User_ID" is removed together with the target (presumably a row identifier — verify).
target = "Calories"
y = df[target]
X = df.drop(labels=[target, "User_ID"], axis="columns")


In [None]:
# Columns routed to the one-hot encoder; every other column of X is treated as numeric.
categorical_features = ["Gender"]
numeric_features = list(
    filter(lambda column: column not in categorical_features, X.columns)
)


In [None]:
# Categorical branch: dense one-hot encoding; categories unseen during fit are
# ignored at transform time instead of raising.
cat_pipeline = Pipeline([
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Numeric branch: standardise each column with StandardScaler.
num_pipeline = Pipeline([("scaler", StandardScaler())])

# Apply each branch to its own column group; outputs are concatenated in the
# order listed here ("cat" first, then "num").
preprocessor = ColumnTransformer([
    ("cat", cat_pipeline, categorical_features),
    ("num", num_pipeline, numeric_features),
])


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [None]:
# End-to-end estimator: column preprocessing followed by ordinary least squares.
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
])


In [None]:
# Fit the preprocessing steps and the regressor on the training split only.
model.fit(X=X_train, y=y_train)


In [None]:
# Score the fitted pipeline on the held-out split.
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the MSE, i.e. error in the target's own units

# Displayed as the cell output: (RMSE, R^2).
(rmse, r2)


In [None]:
# Write the current state of `df` to disk, omitting the index column.
df.to_csv(path_or_buf="cleaned_calories.csv", index=False)