<h1>基于机器学习的糖尿病预测系统实现</h1>

In [1]:
# Load the training and test tables from MySQL into DataFrames.
import os

import pandas as pd
from sqlalchemy import create_engine

# Connection URL: read from the DIABETES_DB_URL environment variable when it is
# set, so real credentials do not have to live in the notebook. The fallback is
# the original local-development URL, kept so existing setups keep working.
DB_URL = os.environ.get(
    'DIABETES_DB_URL',
    'mysql+pymysql://root:root@localhost:3306/diabetes_prediction',
)
engine = create_engine(DB_URL)

# One query per table.
sql_train = "SELECT * FROM train"
sql_test = "SELECT * FROM test"
dataConcat1 = pd.read_sql_query(sql_train, con=engine)
dataConcat2 = pd.read_sql_query(sql_test, con=engine)

# Untouched snapshot of the test table; its 编号 column is attached to the
# predictions later by row position. It is copied from dataConcat2 instead of
# re-running the query because a second SELECT without ORDER BY is not
# guaranteed to return rows in the same order, which would mis-align the ids.
dataConcat3 = dataConcat2.copy()


In [2]:
# Preview the first rows of the training table.
dataConcat1.head()

In [3]:
# Preview the first rows of the test table.
dataConcat2.head()

In [4]:
# Preview the second copy of the test table (kept aside so 编号 can be restored later).
dataConcat3.head()

In [5]:
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Show every column and every row when a DataFrame is displayed.
for option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option, None)

# Use a CJK-capable font for figure text and keep the minus sign renderable
# with that font.
plt.rcParams.update({
    'font.sans-serif': ['KaiTi'],
    'axes.unicode_minus': False,
})

# Alternative data source (disabled): read the same tables from CSV files.
# dataConcat1 = pd.read_csv('data/比赛训练集.csv', encoding='gbk')
# dataConcat2 = pd.read_csv('data/比赛测试集.csv', encoding='gbk')
# dataConcat3 = pd.read_csv('data/比赛测试集.csv', encoding='gbk')

In [6]:
# Convert birth year to age.
# A fixed reference year keeps the derived feature (and therefore the age
# buckets and the trained model) identical no matter when the notebook is
# re-run; datetime.now().year would shift every age by one each calendar year.
# NOTE(review): 2024 matches the constant used by the age/BMI plot further
# down; confirm it is the intended reference year for this dataset.
REFERENCE_YEAR = 2024

dataConcat1['年龄'] = REFERENCE_YEAR - dataConcat1['出生年份']
dataConcat2['年龄'] = REFERENCE_YEAR - dataConcat2['出生年份']

# The raw birth year is redundant once age exists.
dataConcat1.drop(columns='出生年份', inplace=True)
dataConcat2.drop(columns='出生年份', inplace=True)
dataConcat1.head()

In [7]:
# Preview the test table after 出生年份 was replaced by 年龄.
dataConcat2.head()

In [8]:
import missingno as msno

## Visual check for missing values using missingno's matrix plot.
# Training set
fig, ax = plt.subplots(figsize=(10, 6))
msno.matrix(dataConcat1, ax=ax, color=(0.25, 0.25, 0.5))
plt.show()

In [9]:
# Test set: same missing-value matrix as for the training set.
fig, ax = plt.subplots(figsize=(10, 6))
msno.matrix(dataConcat2, ax=ax, color=(0.25, 0.25, 0.5))
plt.show()

In [10]:
# Fill missing 舒张压 values with the column mean.
# The fill is scoped to 舒张压 via a {column: value} mapping: a bare scalar
# would write the 舒张压 mean into the NaNs of every column of the frame.
# Any other column that still has NaN afterwards will show up in info() below
# and needs its own imputation rule.
diastolic_mean = dataConcat1["舒张压"].mean()
dataConcat1 = dataConcat1.fillna(value={"舒张压": diastolic_mean})
dataConcat1.info()

In [11]:
# Training set again, re-plotted after the 舒张压 fill to confirm the gaps are gone.
fig, ax = plt.subplots(figsize=(10, 6))
msno.matrix(dataConcat1, ax=ax, color=(0.25, 0.25, 0.5))
plt.show()

In [12]:
# Fill missing 舒张压 values in the test set with that column's mean.
# The fill is scoped to 舒张压 via a {column: value} mapping: a bare scalar
# would write the 舒张压 mean into the NaNs of every column of the frame.
diastolic_mean_test = dataConcat2["舒张压"].mean()
dataConcat2 = dataConcat2.fillna(value={"舒张压": diastolic_mean_test})
dataConcat2.info()

In [13]:
# Test set, re-plotted after the 舒张压 fill.
fig, ax = plt.subplots(figsize=(10, 6))
msno.matrix(dataConcat2, ax=ax, color=(0.25, 0.25, 0.5))
plt.show()

数据分桶（离散化）：将连续特征按取值区间划分为若干个有序类别。<br/>
优势：提高模型性能、简化模型、增强模型的可解释性、优化内存使用、处理不平衡数据<br/>
劣势：同一区间内的取值不再被区分，会造成信息损失<br/>


In [14]:
# Bucket age into four ordinal bands: [0, 30), [30, 45), [45, 60), [60, +inf).
# The last band is open-ended so that an age of 100 or more lands in band 3;
# with a closed upper edge of 100 such rows would become NaN.
AGE_BIN_EDGES = [0, 30, 45, 60, float("inf")]
AGE_BIN_LABELS = [0, 1, 2, 3]


def bucket_age(frame):
    """Return `frame` with 年龄 replaced by an ordinal 年龄_cut column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a numeric 年龄 column.

    Returns
    -------
    pandas.DataFrame
        A new frame in which 年龄_cut (float codes 0.0-3.0, appended as the
        last column) replaces 年龄. Ages below 0 fall outside every bin and
        yield NaN.
    """
    age_band = pd.cut(frame['年龄'], AGE_BIN_EDGES, labels=AGE_BIN_LABELS, right=False)
    # Cast the categorical codes to float so the column is plain numeric.
    return frame.assign(**{'年龄_cut': age_band.astype("float")}).drop(columns='年龄')


# Apply the identical bucketing to both sets.
dataConcat1 = bucket_age(dataConcat1)
dataConcat2 = bucket_age(dataConcat2)

dataConcat1.head()

In [15]:
# Preview the test table after age bucketing.
dataConcat2.head()

In [16]:
# Training set: encode the family-history text as an integer code.
# Any value not listed in the mapping (including missing values) is coded 0,
# the same code as '无记录'.
family_history_codes = {
    '无记录': 0,
    '叔叔或者姑姑有一方患有糖尿病': 1,
    '父母有一方患有糖尿病': 2,
}
dataConcat1['糖尿病家族史_cut'] = (
    dataConcat1['糖尿病家族史']
    .map(family_history_codes)
    .fillna(0)
    .astype('int64')
)
# The text column is no longer needed once the code exists.
dataConcat1.drop(columns='糖尿病家族史', inplace=True)

In [17]:
# Test set: encode the family-history text as an integer code.
# Any value not listed in the mapping (including missing values) is coded 0,
# the same code as '无记录'.
family_history_codes = {
    '无记录': 0,
    '叔叔或者姑姑有一方患有糖尿病': 1,
    '父母有一方患有糖尿病': 2,
}
dataConcat2['糖尿病家族史_cut'] = (
    dataConcat2['糖尿病家族史']
    .map(family_history_codes)
    .fillna(0)
    .astype('int64')
)
# The text column is no longer needed once the code exists.
dataConcat2.drop(columns='糖尿病家族史', inplace=True)

In [18]:
# Preview the training table after family-history encoding.
dataConcat1.head()

In [19]:
# Preview the test table after family-history encoding.
dataConcat2.head()

In [20]:
## Correlation heatmap of the training features.
# Pairwise Pearson correlation between all columns of the training frame.
dataConcat1corr = dataConcat1.corr(method="pearson")

# Draw the matrix as an annotated heatmap (two decimals per cell).
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(
    dataConcat1corr,
    annot=True,
    fmt=".2f",
    annot_kws={"fontsize": 14},
    cmap="YlGnBu",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("数据特征相关系数热力图")
plt.show()

In [21]:
# According to the heatmap, family history, sex and the record id correlate
# only weakly with the diabetes label, so these three columns are removed
# from both the training and the test set.
weak_columns = ['糖尿病家族史_cut', '编号', '性别']
for frame in (dataConcat1, dataConcat2):
    frame.drop(columns=weak_columns, inplace=True)

In [22]:
# Columns remaining in the training set after feature removal.
dataConcat1.columns

In [23]:
# Columns remaining in the test set after feature removal.
dataConcat2.columns

In [24]:
# Feature matrix (six columns, in this fixed order) and target label.
feature_columns = [
    '口服耐糖量测试',
    '胰岛素释放实验',
    '肱三头肌皮褶厚度',
    '年龄_cut',
    '舒张压',
    '体重指数',
]
X = dataConcat1[feature_columns]
y = dataConcat1['患有糖尿病标识']

模型训练

In [25]:
# Logistic regression baseline.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Hold out 20% of the labelled data for validation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# multi_class is not passed: 'auto' is already the default, and the argument
# is deprecated in recent scikit-learn releases, so omitting it keeps the same
# model while staying compatible with versions that remove the parameter.
model = LogisticRegression(solver='liblinear', max_iter=500)
model.fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Logistic回归 Model accuracy: {accuracy}')

In [26]:
# Random forest.
from sklearn.ensemble import RandomForestClassifier

# Shallow, class-balanced forest with a fixed seed; fit() returns the
# estimator itself, so construction and training are chained.
rfc1 = RandomForestClassifier(
    n_estimators=200,
    max_depth=3,
    criterion="gini",
    class_weight='balanced',
    oob_score=True,
    random_state=12,
).fit(X_train, y_train)

# Predictions on the split the model was trained on and on the held-out split.
rfc1_lab = rfc1.predict(X_train)
rfc1_pre = rfc1.predict(X_test)

# Report accuracy on both splits.
print("随机森林 训练集 Model accuracy:", accuracy_score(y_train, rfc1_lab))
print("随机森林 测试集 Model accuracy:", accuracy_score(y_test, rfc1_pre))

In [27]:
# Support vector machine with a Gaussian (RBF) kernel.
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Same 80/20 split parameters and seed as the logistic-regression cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# NOTE(review): the features are passed to the RBF kernel without any scaling.
svm = SVC(kernel='rbf').fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Model accuracy: {accuracy}")

通过三种模型发现，随机森林模型的准确率高于逻辑回归和支持向量机，因此，我们用随机森林来对测试集进行进一步的预测。

In [28]:
## Track how training and held-out accuracy change with the number of trees.
train_acc = []
test_acc = []
numbers = np.arange(10, 301, 20)

# Hyper-parameters held constant across the sweep; only n_estimators varies.
fixed_params = dict(
    max_depth=3,
    class_weight='balanced',
    oob_score=True,
    random_state=12,
    criterion="gini",
)

for n in numbers:
    rfc1.set_params(n_estimators=n, **fixed_params)
    rfc1.fit(X_train, y_train)

    # Accuracy on the training split and on the held-out split.
    rfc1_lab = rfc1.predict(X_train)
    rfc1_pre = rfc1.predict(X_test)
    acc_train = accuracy_score(y_train, rfc1_lab)
    acc_test = accuracy_score(y_test, rfc1_pre)
    train_acc.append(acc_train)
    test_acc.append(acc_test)

    # The last field is the absolute gap between the two accuracies.
    print(f"树==> {n} 训练集精度==> {acc_train:.4f} 测试集精度==> {acc_test:.4f} 训练集和测试集的差距==> {abs(acc_train - acc_test):.9f}")

In [29]:
## Plot both accuracy curves against the number of trees.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(numbers, train_acc, "b-o", label="训练集精度")
ax.plot(numbers, test_acc, "r-s", label="测试集精度")
ax.set_xlabel("树的数量")
ax.set_ylabel("精度")
ax.set_title("随机森林分类器")
ax.legend()
plt.show()


由上图可见，决策树的数量达到 110 左右后，训练集与测试集的精度趋于稳定，因此后续预测取 n_estimators=110。

In [30]:
# Keep only the model's feature columns, in the same order as the training
# matrix. The explicit copy makes dataConcat2 an independent frame, so adding
# columns to it afterwards cannot trigger pandas' SettingWithCopyWarning.
dataConcat2 = dataConcat2[X_train.columns].copy()

In [31]:
# Refit the forest with 110 trees on the training split and predict the
# label for every row of the test set.
final_params = {
    'n_estimators': 110,
    'max_depth': 3,
    'class_weight': 'balanced',
    'oob_score': True,
    'random_state': 12,
    'criterion': "gini",
}
rfc1.set_params(**final_params)
dataConcat2_pre = rfc1.fit(X_train, y_train).predict(dataConcat2)

In [32]:
# Attach the predicted label, then restore the record id from the untouched
# test snapshot (the id column was dropped from dataConcat2 before modelling).
# The id assignment aligns on the row index of the two frames.
dataConcat2['患有糖尿病标识'] = dataConcat2_pre
dataConcat2['编号'] = dataConcat3['编号']

# Write the result to the `success` table, replacing it if it already exists.
# The engine created in the first cell is reused: building a second one here
# only duplicated the connection settings (credentials included) and opened
# another connection pool to the same database.
dataConcat2.to_sql('success', con=engine, if_exists='replace', index=False)

In [33]:
# Export the same prediction table as a GBK-encoded CSV without the row index.
dataConcat2.to_csv(path_or_buf='data/success.csv', encoding='gbk', index=False)


In [34]:
import pandas as pd
import matplotlib.pyplot as plt

# Reload the raw CSV files for exploratory plots (independent of the frames
# prepared for modelling).
train_file_path = 'data/比赛训练集.csv'
test_file_path = 'data/比赛测试集.csv'
train_data, test_data = (
    pd.read_csv(csv_path, encoding='gbk')
    for csv_path in (train_file_path, test_file_path)
)

# BMI bands; each interval is closed on the left and open on the right.
bins = [0, 18.5, 24, 28, 32, 36, 40, 100]
labels = ['偏瘦', '正常体重', '超重', '轻度肥胖', '中度肥胖', '重度肥胖', '极重度肥胖']
train_data['BMI区间'] = pd.cut(
    train_data['体重指数'], bins=bins, labels=labels, right=False
)

# Head count per BMI band; the code plots 性别 == 1 as 男性 and 性别 == 0 as 女性.
is_male = train_data['性别'] == 1
is_female = train_data['性别'] == 0
male_bmi_distribution = train_data.loc[is_male, 'BMI区间'].value_counts().sort_index()
female_bmi_distribution = train_data.loc[is_female, 'BMI区间'].value_counts().sort_index()

bmi_distribution = pd.DataFrame({
    '男性': male_bmi_distribution,
    '女性': female_bmi_distribution
})

# Horizontal grouped bars: one pair of bars per BMI band.
ax = bmi_distribution.plot.barh(figsize=(12, 8), color=['blue', 'pink'])
ax.set_title('男性和女性在不同体重指数区间的分布', fontsize=16)
ax.set_xlabel('人数', fontsize=14)
ax.set_ylabel('体重指数区间', fontsize=14)
ax.grid(True)
plt.show()

In [35]:
# Age relative to 2024, then the mean BMI for each distinct age.
train_data['年龄'] = 2024 - train_data['出生年份']
age_bmi = train_data.groupby('年龄')['体重指数'].mean()

# Line chart of mean BMI against age.
fig, ax = plt.subplots(figsize=(10, 6))
age_bmi.plot(kind='line', marker='o', color='b', ax=ax)
ax.set_title('不同年龄段的体重指数平均值', fontsize=16)
ax.set_xlabel('年龄', fontsize=14)
ax.set_ylabel('体重指数平均值', fontsize=14)
ax.grid(True)
plt.show()

In [36]:
# Share of each family-history category in the training data.
family_history_counts = train_data['糖尿病家族史'].value_counts()

# One distinct colour per wedge. Matplotlib cycles through `colors` when there
# are more wedges than colours, so a two-colour list makes the third category
# repeat the first colour. The first two colours are the original ones.
wedge_palette = ['lightcoral', 'lightskyblue', 'lightgreen', 'gold', 'plum', 'lightgray']
wedge_colors = wedge_palette[:len(family_history_counts)]

plt.figure(figsize=(8, 8))
family_history_counts.plot(kind='pie', autopct='%1.1f%%', startangle=140, colors=wedge_colors)
plt.title('糖尿病家族史的分布占比', fontsize=16)
# Hide the default y-axis label that pandas adds to pie charts.
plt.ylabel('')
plt.show()

In [37]:
# Scatter plot: BMI on the x-axis against the insulin release test on the y-axis.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(train_data['体重指数'], train_data['胰岛素释放实验'], color='orange', alpha=0.5)
ax.set_title('体重指数与胰岛素释放实验的关系', fontsize=16)
ax.set_xlabel('体重指数', fontsize=14)
ax.set_ylabel('胰岛素释放实验', fontsize=14)
ax.grid(True)
plt.show()

In [38]:
# Scatter plot: BMI on the x-axis against triceps skinfold thickness on the y-axis.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(train_data['体重指数'], train_data['肱三头肌皮褶厚度'], color='orange', alpha=0.5)
ax.set_title('体重指数与肱三头肌皮褶厚度的关系', fontsize=16)
ax.set_xlabel('体重指数', fontsize=14)
ax.set_ylabel('肱三头肌皮褶厚度', fontsize=14)
ax.grid(True)
plt.show()