In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:

data = pd.read_csv('../INFOI/demo.csv')

print(data.head())

   subj_id  gender  age age_group  height_in  weight_lbs        bmi   race  \
0        1    male   23     20-25         72         165  22.375579  white   
1        2  female   45     40-45         69         141  20.819786  white   
2        3  female   29     25-30         73         154  20.315631  white   
3        4    male   43     40-45         71         185  25.799445  white   
4        5    male   47     45-50         75         238  29.744711  white   

   step_per_min  
0        108.65  
1        120.42  
2        113.40  
3        107.57  
4        105.79  


In [3]:
# Define the predictor variables (features) and the target variable
X = data[['age', 'gender', 'bmi']]
y = data['step_per_min']

In [4]:
# Define a preprocessing pipeline for the categorical and numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'bmi']),
        ('cat', OneHotEncoder(), ['gender'])
    ])


In [5]:
# Define the model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train the model
model.fit(X, y)


In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model on the training set
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Mean Squared Error: 681.4315304120028
R^2 Score: -0.032106406283171296


In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Update the model pipeline with RandomForestRegressor
model_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Evaluate using cross-validation
scores = cross_val_score(model_rf, X, y, cv=5, scoring='neg_mean_squared_error')
mean_mse = -scores.mean()

print(f"Mean Cross-Validated MSE for Random Forest: {mean_mse}")


Mean Cross-Validated MSE for Random Forest: 200.19984772985632


In [8]:
from sklearn.linear_model import Ridge

# Update the model pipeline with Ridge regression
model_ridge = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0))  # You can adjust the alpha parameter
])

# Train and evaluate the Ridge model
model_ridge.fit(X_train, y_train)
y_pred_ridge = model_ridge.predict(X_test)

ridge_mse = mean_squared_error(y_test, y_pred_ridge)
ridge_r2 = r2_score(y_test, y_pred_ridge)

print(f"Ridge MSE: {ridge_mse}")
print(f"Ridge R^2 Score: {ridge_r2}")


Ridge MSE: 680.3475219347292
Ridge R^2 Score: -0.03046455080110566


In [9]:
from sklearn.model_selection import GridSearchCV

# Set up parameter grid for Random Forest
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10]
}

# Define the pipeline with Random Forest and hyperparameter tuning
model_rf_tuned = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Grid search
grid_search = GridSearchCV(model_rf_tuned, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print(f"Best Parameters for Random Forest: {best_params}")
print(f"Best Cross-Validated MSE: {best_score}")


Best Parameters for Random Forest: {'regressor__max_depth': None, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 200}
Best Cross-Validated MSE: 192.263734314832


In [10]:
best_model = grid_search.best_estimator_
feature_importances = best_model.named_steps['regressor'].feature_importances_

# Get feature names after one-hot encoding
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()

# Display feature importances
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

              Feature  Importance
1            num__bmi    0.773824
0            num__age    0.171257
3    cat__gender_male    0.032095
2  cat__gender_female    0.022823


In [11]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Define a preprocessing pipeline, including polynomial features for BMI
# PolynomialFeatures(degree=2) will add `bmi^2` to the features
preprocessor_with_poly = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=2, include_bias=False))  # Generates `bmi^2`
        ]), ['age', 'bmi']),
        ('cat', OneHotEncoder(), ['gender'])
    ]
)

# Define the Random Forest model pipeline with the modified preprocessor
model_rf_poly = Pipeline(steps=[
    ('preprocessor', preprocessor_with_poly),
    ('regressor', RandomForestRegressor(n_estimators=200, max_depth=None, min_samples_split=5, random_state=42))
])

# Evaluate the model using cross-validation
scores_poly = cross_val_score(model_rf_poly, X, y, cv=5, scoring='neg_mean_squared_error')
mean_mse_poly = -scores_poly.mean()

print(f"Mean Cross-Validated MSE for Random Forest with BMI^2 term: {mean_mse_poly}")


Mean Cross-Validated MSE for Random Forest with BMI^2 term: 238.20217993389923


# Selected Model

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'bmi']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['gender'])  # Ignore unknown categories in 'gender'
    ]
)

# Define the Random Forest model pipeline with the best parameters
best_rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=200, max_depth=None, min_samples_split=5, random_state=42))
])

# Fit the model on the entire dataset
best_rf_model.fit(X, y)


In [15]:
new_participant = pd.DataFrame({
    'age': [30],
    'gender': ['Male'],
    'bmi': [25.0]
})

# Predict the step counts per minute for the new participant
predicted_step_count = best_rf_model.predict(new_participant)

print(f"Predicted Step Count per Minute for New Participant: {predicted_step_count[0]}")

Predicted Step Count per Minute for New Participant: 112.7067578373016
