# 基于健身房运动表现数据预测各类型运动影响

## 准备数据

导库

In [447]:
import pandas as pd
import numpy as np
import scipy.stats
import plotly.express as px
import plotly.graph_objects as go
from sklearn.neighbors import LocalOutlierFactor
from plotly.subplots import make_subplots
from scipy.stats import skewtest
from scipy.stats import skewnorm
from sklearn.model_selection import train_test_split
from scipy.spatial import distance
from sklearn.metrics import mean_squared_error

读取数据集

In [448]:
original_data = pd.read_csv('./data/gym_members_exercise_tracking.csv')
original_data.head()

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,56,Male,88.3,1.71,180,157,60,1.69,1313.0,Yoga,12.6,3.5,4,3,30.2
1,46,Female,74.9,1.53,179,151,66,1.3,883.0,HIIT,33.9,2.1,4,2,32.0
2,32,Female,68.1,1.66,167,122,54,1.11,677.0,Cardio,33.4,2.3,4,2,24.71
3,25,Male,53.2,1.7,190,164,56,0.59,532.0,Strength,28.8,2.1,3,1,18.41
4,38,Male,46.1,1.79,188,158,68,0.64,556.0,Strength,29.2,2.8,3,1,14.39


In [449]:
original_data['Workout_Type'].unique()

array(['Yoga', 'HIIT', 'Cardio', 'Strength'], dtype=object)

Yoga指的是 瑜伽  \
HIIT 指的是 高强度间歇训练 \
Cardio 指的是 有氧运动 \
Strength 指的是 力量训练 

以下是一个总结这四种运动对身体不同影响的表格：
| 运动类型 | 主要影响 | 其他影响 |
| --- | --- | --- |
| 瑜伽 | 提高柔韧性、平衡性和力量 | 减轻压力、提高心理福祉、改善呼吸和心血管健康、提高协调性 |
| 高强度间歇训练 | 提高心肺功能和代谢率 | 燃烧脂肪、减少体重、提高肌肉耐力、提高适应能力和运动表现 |
| 有氧运动 | 提高心肺功能 | 燃烧脂肪、减少体重、提高耐力、降低患慢性疾病的风险 |
| 力量训练 | 增加肌肉力量和耐力 | 提高代谢率、增加肌肉质量和骨密度、改善姿态和平衡性、减少受伤风险 |


对字符串类型映射转换，方便后期处理

In [None]:
gender_reflection = {
    'Male': 1,
    'Female': 2
}
workout_type_reflection = {
    'Yoga': 1,
    'HIIT': 2,
    'Strength': 3,
    'Cardio': 4
}
data = original_data.copy()
data['Gender'] = original_data['Gender'].map(gender_reflection)
data['Workout_Type'] = original_data['Workout_Type'].map(
    workout_type_reflection)
data.head()

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,56,1,88.3,1.71,180,157,60,1.69,1313.0,1,12.6,3.5,4,3,30.2
1,46,2,74.9,1.53,179,151,66,1.3,883.0,2,33.9,2.1,4,2,32.0
2,32,2,68.1,1.66,167,122,54,1.11,677.0,4,33.4,2.3,4,2,24.71
3,25,1,53.2,1.7,190,164,56,0.59,532.0,3,28.8,2.1,3,1,18.41
4,38,1,46.1,1.79,188,158,68,0.64,556.0,3,29.2,2.8,3,1,14.39


## 数据分析

分析各类锻炼的比重

In [451]:
series_workout_type = original_data['Workout_Type'].value_counts()
series_workout_type.dtype, series_workout_type

(dtype('int64'),
 Workout_Type
 Strength    258
 Cardio      255
 Yoga        239
 HIIT        221
 Name: count, dtype: int64)

In [None]:
def pie_pct_func(pct, allvals):
    absolute = int(pct/100.*sum(allvals))
    return "{:.1f}%\n({:d})".format(pct, absolute)


fig = px.pie(values=series_workout_type, names=series_workout_type.index,
             title='Workout Type Distribution',
             labels={'index': 'Workout Type', 'value': 'Count'},
             hole=0.1)
fig.update_layout(height=600, width=600)
fig.update_traces(textinfo='percent+label',
                  texttemplate='%{label}<br>%{percent:.1%}<br>(%{value})')
fig.show()

分析性别所占的比重

In [None]:
series_gender = original_data['Gender'].value_counts()
series_gender.dtype, series_gender

(dtype('int64'),
 Gender
 Male      511
 Female    462
 Name: count, dtype: int64)

In [None]:
import plotly.express as px

fig = px.pie(values=series_gender, names=series_gender.index,
             title='Gender Distribution',
             labels={'index': 'Gender', 'value': 'Count'},
             hole=0.1)
fig.update_layout(height=600, width=600)
fig.update_traces(textinfo='percent+label',
                  texttemplate='%{label}<br>%{percent:.1%}<br>(%{value})')
fig.show()

计算相关系数

In [455]:
data.head(1)

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,56,1,88.3,1.71,180,157,60,1.69,1313.0,1,12.6,3.5,4,3,30.2


In [None]:
# 选择特定的列
feature_columns = ['Age', 'Height (m)', 'Max_BPM', 'Avg_BPM',
                   'Resting_BPM', 'Calories_Burned', 'Session_Duration (hours)',
                   'Water_Intake (liters)', 'Workout_Frequency (days/week)', 'Workout_Type']
label_columns = ['Weight (kg)', 'BMI', 'Fat_Percentage', 'Experience_Level']
X = data[feature_columns]
y = data[label_columns]
correlation_results = pd.DataFrame()
# 逐列计算相关性并存储
for col in y.columns:
    correlation_results[col] = X.corrwith(y[col])
correlation_results

Unnamed: 0,Weight (kg),BMI,Fat_Percentage,Experience_Level
Age,-0.03634,-0.013691,0.00237,-0.018676
Height (m),0.365321,-0.159469,-0.235521,-0.010267
Max_BPM,0.057061,0.067105,-0.009056,0.000545
Avg_BPM,0.009717,0.021605,-0.007302,-0.000888
Resting_BPM,-0.032138,-0.032543,-0.016834,0.001758
Calories_Burned,0.095443,0.059761,-0.597615,0.694129
Session_Duration (hours),-0.013666,-0.006493,-0.58152,0.764768
Water_Intake (liters),0.394276,0.213697,-0.588683,0.304104
Workout_Frequency (days/week),-0.011769,0.001645,-0.53706,0.837079
Workout_Type,0.00282,0.031594,0.066787,-0.061118


In [457]:
correlation_results.sort_values(by='Weight (kg)', ascending=False)

Unnamed: 0,Weight (kg),BMI,Fat_Percentage,Experience_Level
Water_Intake (liters),0.394276,0.213697,-0.588683,0.304104
Height (m),0.365321,-0.159469,-0.235521,-0.010267
Calories_Burned,0.095443,0.059761,-0.597615,0.694129
Max_BPM,0.057061,0.067105,-0.009056,0.000545
Avg_BPM,0.009717,0.021605,-0.007302,-0.000888
Workout_Type,0.00282,0.031594,0.066787,-0.061118
Workout_Frequency (days/week),-0.011769,0.001645,-0.53706,0.837079
Session_Duration (hours),-0.013666,-0.006493,-0.58152,0.764768
Resting_BPM,-0.032138,-0.032543,-0.016834,0.001758
Age,-0.03634,-0.013691,0.00237,-0.018676


In [458]:
correlation_results.sort_values(by='BMI', ascending=False)

Unnamed: 0,Weight (kg),BMI,Fat_Percentage,Experience_Level
Water_Intake (liters),0.394276,0.213697,-0.588683,0.304104
Max_BPM,0.057061,0.067105,-0.009056,0.000545
Calories_Burned,0.095443,0.059761,-0.597615,0.694129
Workout_Type,0.00282,0.031594,0.066787,-0.061118
Avg_BPM,0.009717,0.021605,-0.007302,-0.000888
Workout_Frequency (days/week),-0.011769,0.001645,-0.53706,0.837079
Session_Duration (hours),-0.013666,-0.006493,-0.58152,0.764768
Age,-0.03634,-0.013691,0.00237,-0.018676
Resting_BPM,-0.032138,-0.032543,-0.016834,0.001758
Height (m),0.365321,-0.159469,-0.235521,-0.010267


In [459]:
correlation_results.sort_values(by='Fat_Percentage', ascending=False)

Unnamed: 0,Weight (kg),BMI,Fat_Percentage,Experience_Level
Workout_Type,0.00282,0.031594,0.066787,-0.061118
Age,-0.03634,-0.013691,0.00237,-0.018676
Avg_BPM,0.009717,0.021605,-0.007302,-0.000888
Max_BPM,0.057061,0.067105,-0.009056,0.000545
Resting_BPM,-0.032138,-0.032543,-0.016834,0.001758
Height (m),0.365321,-0.159469,-0.235521,-0.010267
Workout_Frequency (days/week),-0.011769,0.001645,-0.53706,0.837079
Session_Duration (hours),-0.013666,-0.006493,-0.58152,0.764768
Water_Intake (liters),0.394276,0.213697,-0.588683,0.304104
Calories_Burned,0.095443,0.059761,-0.597615,0.694129


In [460]:
correlation_results.sort_values(by='Experience_Level', ascending=False)

Unnamed: 0,Weight (kg),BMI,Fat_Percentage,Experience_Level
Workout_Frequency (days/week),-0.011769,0.001645,-0.53706,0.837079
Session_Duration (hours),-0.013666,-0.006493,-0.58152,0.764768
Calories_Burned,0.095443,0.059761,-0.597615,0.694129
Water_Intake (liters),0.394276,0.213697,-0.588683,0.304104
Resting_BPM,-0.032138,-0.032543,-0.016834,0.001758
Max_BPM,0.057061,0.067105,-0.009056,0.000545
Avg_BPM,0.009717,0.021605,-0.007302,-0.000888
Height (m),0.365321,-0.159469,-0.235521,-0.010267
Age,-0.03634,-0.013691,0.00237,-0.018676
Workout_Type,0.00282,0.031594,0.066787,-0.061118


In [None]:
fig = px.imshow(correlation_results,
                labels=dict(x="Features", y="Labels",
                            color="Correlation"),
                x=correlation_results.columns,
                text_auto=True,
                y=correlation_results.index,
                color_continuous_scale='RdBu',
                zmin=-1, zmax=1,
                aspect=True,
                title='Correlation Heatmap')

fig.update_layout(width=800, height=600)
fig.show()

### 去除噪声

每次运动时长区间与BMI的关系，绘图

In [None]:
bins = [0, 0.5, 1, 1.5, 2]
labels = ['0-0.5', '0.5-1', '1-1.5', '1.5-2']
session_duration_binned = pd.cut(
    data['Session_Duration (hours)'], bins=bins, labels=labels)
mean_bmi_per_bin = data.groupby(
    session_duration_binned, observed=False)['BMI'].mean()
fig = px.bar(mean_bmi_per_bin, x=mean_bmi_per_bin.index, y=mean_bmi_per_bin.values,
             labels={'x': 'Session Duration (hours)', 'y': 'Mean BMI'},
             title='Mean BMI for each Session Duration Bin')
fig.show()

每次运动时长与体重的关系绘图

In [None]:
mean_bmi_per_bin = data.groupby(
    'Workout_Frequency (days/week)', observed=False)['BMI'].mean()
fig = px.bar(mean_bmi_per_bin,
             x=mean_bmi_per_bin.index,
             y=mean_bmi_per_bin.values,
             labels={'x': 'Workout Frequency (days/week)', 'y': 'Mean BMI'},
             )
fig.show()

筛选喜欢有氧运动且偏肥胖的会员数据

In [None]:
obesity_data = data[data['BMI'] >= 24].loc[data['Workout_Type'] == 4]
mean_bmi_per_bin = obesity_data.groupby(
    'Workout_Frequency (days/week)', observed=False)['BMI'].mean()
fig = px.bar(mean_bmi_per_bin,
             x=mean_bmi_per_bin.index,
             y=mean_bmi_per_bin.values,
             labels={'x': 'Workout Frequency (days/week)', 'y': 'Mean BMI'},
             title='Mean BMI for Obese People doing Cardio')
fig.show()

In [None]:
# Define the bins for Session_Duration
bins = [0.5, 1, 1.5, 2]
labels = ['0.5-1', '1-1.5', '1.5-2']
# Create a new column for the binned Session_Duration
obesity_data['Session_Duration_Binned'] = pd.cut(
    obesity_data['Session_Duration (hours)'], bins=bins, labels=labels)
# Calculate the mean BMI for each bin
mean_bmi_per_bin = obesity_data.groupby(
    'Session_Duration_Binned', observed=False)['BMI'].mean()
fig = px.bar(mean_bmi_per_bin,
             x=mean_bmi_per_bin.index,
             y=mean_bmi_per_bin.values,
             labels={'x': 'Session Duration (hours)', 'y': 'Mean BMI'},
             title='Mean BMI for Obese People for each Session Duration Bin')
fig.show()

各个连续型随机变量的正态分布图

In [466]:
successive_variables = ['Height (m)', 'Max_BPM', 'Avg_BPM',
                        'Resting_BPM', 'Calories_Burned', 'Water_Intake (liters)',
                        'Water_Intake (liters)', 'BMI', 'Fat_Percentage'
                        ]
fig = make_subplots(rows=len(successive_variables) // 3 +
                    len(successive_variables) % 3, cols=3, subplot_titles=successive_variables)
for i, var in enumerate(successive_variables):
    x = data[var].dropna()
    kde = scipy.stats.gaussian_kde(x)
    x_range = np.linspace(x.min(), x.max(), 1000)
    y = kde(x_range)
    fig.add_trace(go.Scatter(x=x_range, y=y, fill='tozeroy',
                  name=var), row=i // 3 + 1, col=i % 3 + 1)

fig.update_layout(height=800, width=1200,
                  title_text="PDF of Successive Variables")
fig.show()

### 四分位距法检测心率相关特征的异常值

定义异常值检测方法

In [None]:
def IQR_outliers(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data < lower_bound) | (data > upper_bound)]


IQR_outliers(data['Resting_BPM']), IQR_outliers(
    data['Max_BPM']), IQR_outliers(data['Avg_BPM'])

(Series([], Name: Resting_BPM, dtype: int64),
 Series([], Name: Max_BPM, dtype: int64),
 Series([], Name: Avg_BPM, dtype: int64))

### Z分数法检测卡路里燃烧量异常值

In [None]:
def Z_score_outliers(data, threshold=2.5):
    mean = np.mean(data)
    std_dev = np.std(data)
    Z = (data - mean) / std_dev
    return data[(Z > threshold) | (Z < -threshold)]


Z_score_outliers(data['Calories_Burned'])

66     1587.0
90     1688.0
99     1625.0
124    1701.0
475    1622.0
511    1725.0
572    1646.0
646    1675.0
712    1766.0
728    1634.0
736    1598.0
910    1783.0
Name: Calories_Burned, dtype: float64

### 使用偏正态模型检测BMI，体脂率异常值

In [None]:
def skewnorm_outliers(data, threshold=0.05):
    _, p_value = skewtest(data)
    a, loc, scale = skewnorm.fit(data)
    if not p_value < 0.05:
        a, loc, scale = 0, np.mean(data), np.std(data)
    probabilities = skewnorm.cdf(data, a, loc, scale)
    return data[probabilities < threshold]


print(skewnorm_outliers(data['Fat_Percentage']))
print(skewnorm_outliers(data['BMI']))

0      12.6
34     10.2
51     12.8
62     12.9
90     10.9
       ... 
920    11.8
942    11.4
954    10.7
966    11.8
968    10.0
Name: Fat_Percentage, Length: 66, dtype: float64
4      14.39
15     13.88
17     13.98
45     15.31
72     15.24
130    14.93
131    15.08
147    13.23
150    14.54
156    12.47
169    13.03
185    14.61
194    14.69
238    15.02
249    15.24
258    12.73
282    13.71
290    15.02
334    13.51
339    14.88
342    15.07
343    14.36
358    15.61
385    15.46
387    15.59
389    14.78
401    14.10
411    15.50
425    12.67
431    14.85
441    14.59
449    15.50
474    15.50
493    15.43
504    14.87
537    13.78
581    14.85
586    13.81
598    15.49
636    15.23
637    15.42
705    12.91
726    14.87
747    12.85
757    14.60
772    15.14
783    12.32
786    13.95
789    13.82
829    15.51
867    13.36
873    14.74
874    13.38
908    12.97
909    14.57
925    14.07
963    14.30
Name: BMI, dtype: float64


In [None]:
def knn_outlier_detection(data, k=1, threshold=1.5):
    data_array = np.array(data).reshape(-1, 1)
    dist_matrix = distance.cdist(data_array, data_array, 'euclidean')
    avg_distances = np.zeros(len(data))
    for i in range(len(data)):
        distances = dist_matrix[i][np.argsort(dist_matrix[i])][1:k+1]
        avg_distances[i] = np.mean(distances)
    median_distance = np.median(avg_distances)
    return data[avg_distances > median_distance * threshold]


print(knn_outlier_detection(data['Height (m)']))
print(knn_outlier_detection(data['Session_Duration (hours)']))

Series([], Name: Height (m), dtype: float64)
23     0.78
145    1.53
409    0.50
529    1.68
728    2.00
Name: Session_Duration (hours), dtype: float64


### 使用LOF检测饮水量异常值

In [None]:
def lof_outlier_detection(data, n_neighbors=20, threshold=1.5):
    data_array = np.array(data).reshape(-1, 1)
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination='auto')
    lof.fit(data_array)
    scores_pred = lof.negative_outlier_factor_
    threshold = np.percentile(scores_pred, 90)
    return data[scores_pred < threshold]


lof_outlier_detection(data['Water_Intake (liters)'])

46     3.6
52     3.6
65     1.5
69     3.6
72     1.5
82     1.5
108    1.5
210    1.5
227    1.5
234    3.6
308    3.6
338    3.6
343    3.6
344    3.6
347    1.5
368    1.5
378    1.5
414    3.6
437    1.5
487    3.6
493    1.5
507    3.6
533    1.5
564    3.6
571    3.6
597    1.5
599    1.5
644    3.6
662    3.6
666    1.5
706    3.6
707    1.5
749    1.5
846    3.6
853    1.5
856    1.5
959    3.6
961    1.5
Name: Water_Intake (liters), dtype: float64

### 过滤所有异常值

In [None]:
l1 = len(data)


def IQR_outliers_filter(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]


data = IQR_outliers_filter(data, 'Resting_BPM')
data = IQR_outliers_filter(data, 'Max_BPM')
data = IQR_outliers_filter(data, 'Avg_BPM')


def Z_score_outliers_filter(df, column, threshold=2.5):
    data = df[column]
    mean = np.mean(data)
    std_dev = np.std(data)
    Z = (data - mean) / std_dev
    return df[(Z <= threshold) & (Z >= -threshold)]


data = Z_score_outliers_filter(data, 'Calories_Burned')


def skewnorm_outliers_filter(df, column, threshold=0.05):
    data = df[column]
    _, p_value = skewtest(data)
    a, loc, scale = skewnorm.fit(data)
    if not p_value < 0.05:
        a, loc, scale = 0, np.mean(data), np.std(data)
    probabilities = skewnorm.cdf(data, a, loc, scale)
    return df[probabilities >= threshold]


data = skewnorm_outliers_filter(data, 'Fat_Percentage')


def knn_outlier_filter(df, column, k=1, threshold=1.5):
    data = df[column]
    data_array = np.array(data).reshape(-1, 1)
    dist_matrix = distance.cdist(data_array, data_array, 'euclidean')
    avg_distances = np.zeros(len(data))
    for i in range(len(data)):
        distances = dist_matrix[i][np.argsort(dist_matrix[i])][1:k+1]
        avg_distances[i] = np.mean(distances)
    median_distance = np.median(avg_distances)
    return df[avg_distances <= median_distance * threshold]


data = knn_outlier_filter(data, 'Height (m)')
data = knn_outlier_filter(data, 'Session_Duration (hours)')


def lof_outlier_filter(df, column, n_neighbors=20, threshold=1.5):
    data = df[column]
    data_array = np.array(data).reshape(-1, 1)
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination='auto')
    lof.fit(data_array)
    scores_pred = lof.negative_outlier_factor_
    threshold = np.percentile(scores_pred, 90)
    return df[scores_pred >= threshold]


data = lof_outlier_filter(data, 'Water_Intake (liters)')
l2 = len(data)
l1-l2

126

## 机器学习模型建立

首先分割训练数据和测试数据

In [473]:
X_train, X_test, y_train, y_test = train_test_split(
    data[[
        'Age', 'Height (m)', 'Max_BPM', 'Avg_BPM', 'Resting_BPM',
        'Calories_Burned', 'Session_Duration (hours)', 'Water_Intake (liters)',
        'Workout_Frequency (days/week)', 'Workout_Type'
    ]], data[['BMI', 'Fat_Percentage', 'Experience_Level'
              ]], test_size=0.2, random_state=42)

### 岭回归模型预测BMI和体脂率

训练模型

In [None]:
class RidgeRegression():
    def __init__(self, alpha=1, weights=None):
        self.alpha = alpha
        self.coef_ = None
        self.intercept_ = None
        self.weights = weights

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        if self.weights is not None:
            X = X * self.weights
        n, p = X.shape
        self.coef_ = np.linalg.inv(X.T @ X + self.alpha * np.eye(p)) @ X.T @ y
        self.intercept_ = np.mean(y, axis=0) - np.mean(X, axis=0) @ self.coef_

    def predict(self, X):
        if self.weights is not None:
            X = X * self.weights
        return X @ self.coef_ + self.intercept_


correlation_with_bmi = correlation_results['BMI'].abs()
weights_bmi = correlation_with_bmi / correlation_with_bmi.sum()
correlation_with_fat_percentage = correlation_results['Fat_Percentage'].abs()
weights_fat_percentage = correlation_with_fat_percentage / \
    correlation_with_fat_percentage.sum()
rr_bmi = RidgeRegression(alpha=0.1, weights=weights_bmi.values)
rr_bmi.fit(X_train, y_train[['BMI']])
y_pred_bmi = rr_bmi.predict(X_test)
rr_fat_percentage = RidgeRegression(weights=weights_fat_percentage.values)
rr_fat_percentage.fit(X_train, y_train[['Fat_Percentage']])
y_pred_fat_percentage = rr_fat_percentage.predict(X_test)

评估模型

In [None]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=y_test['BMI'], y=y_pred_bmi[0], mode='markers', name='Predicted BMI'))
fig.add_trace(go.Scatter(
    x=y_test['BMI'], y=y_test['BMI'], mode='lines', name='Fit Line'))
fig.update_layout(title='Actual vs Predicted BMI',
                  xaxis_title='Actual BMI',
                  yaxis_title='Predicted BMI')
fig.show()

In [476]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=y_test['Fat_Percentage'], y=y_pred_fat_percentage[0], mode='markers', name='Predicted Fat_Percentage'))
fig.add_trace(go.Scatter(
    x=y_test['Fat_Percentage'], y=y_test['Fat_Percentage'], mode='lines', name='Fit Line'))
fig.update_layout(title='Actual vs Predicted Fat_Percentage',
                  xaxis_title='Actual Fat_Percentage',
                  yaxis_title='Predicted Fat_Percentage')
fig.show()

### 朴素贝叶斯模型预测经验等级

In [None]:
class NaiveBayes:
    def __init__(self):
        self.class_priors_ = None
        self.feature_means_ = None
        self.feature_variances_ = None
        self.classes_ = None

    def fit(self, X, y):
        self.classes_, class_counts = np.unique(y, return_counts=True)
        self.class_priors_ = class_counts / y.shape[0]
        self.feature_means_ = np.zeros((len(self.classes_), X.shape[1]))
        self.feature_variances_ = np.zeros((len(self.classes_), X.shape[1]))

        for idx, cls in enumerate(self.classes_):
            X_cls = X[y == cls]
            self.feature_means_[idx, :] = X_cls.mean(axis=0)
            self.feature_variances_[idx, :] = X_cls.var(axis=0)

    def predict(self, X):
        log_priors = np.log(self.class_priors_)
        log_likelihoods = np.zeros((X.shape[0], len(self.classes_)))

        for idx, cls in enumerate(self.classes_):
            mean = self.feature_means_[idx]
            var = self.feature_variances_[idx]
            log_likelihoods[:, idx] = -0.5 * np.sum(np.log(2. * np.pi * var)) - 0.5 * np.sum(
                ((X - mean) ** 2) / (var + 1e-9), axis=1)

        log_posteriors = log_priors + log_likelihoods
        return self.classes_[np.argmax(log_posteriors, axis=1)]


custom_nb_model = NaiveBayes()
custom_nb_model.fit(X_train.values, y_train['Experience_Level'].values)
y_pred_custom_nb = custom_nb_model.predict(X_test.values)

评估模型

In [None]:
def accuracy_score(y_true, y_pred):
    return np.mean(y_true == y_pred)


accuracy_custom_nb = accuracy_score(
    y_test['Experience_Level'].values, y_pred_custom_nb)
print(f'Accuracy: {accuracy_custom_nb}')

Accuracy: 0.8705882352941177


## 预测数据

In [None]:
new_data = pd.DataFrame({
    'Age': [30, 45, 50],
    'Height (m)': [1.75, 1.60, 1.80],
    'Max_BPM': [180, 170, 160],
    'Avg_BPM': [150, 140, 130],
    'Resting_BPM': [60, 70, 65],
    'Calories_Burned': [500, 600, 700],
    'Session_Duration (hours)': [1.0, 1.5, 2.0],
    'Water_Intake (liters)': [2.0, 2.5, 3.0],
    'Workout_Frequency (days/week)': [3, 4, 5],
    'Workout_Type': [1, 2, 3]
})
bmi_predictions = rr_bmi.predict(new_data)
fat_percentage_predictions = rr_fat_percentage.predict(new_data)
experience_level_predictions = custom_nb_model.predict(new_data)
print("BMI Predictions:", bmi_predictions)
print("Fat Percentage Predictions:", fat_percentage_predictions)
print("Experience Level Predictions:", experience_level_predictions)

BMI Predictions:            0
0  23.223667
1  24.269240
2  22.948237
Fat Percentage Predictions:            0
0  30.495001
1  28.159273
2  24.547234
Experience Level Predictions: [1 2 3]
