In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [3]:
df = pd.read_csv('/data/kidney_disease.csv')

In [4]:
df.drop(columns=['id'], inplace=True)  

df.replace('?', np.nan, inplace=True)  


for col in df.columns:
    df[col] = df[col].astype(str).str.strip()

df['classification'] = df['classification'].map({'ckd': 1, 'notckd': 0})

In [5]:
X = df.drop(columns=['classification'])
y = df['classification']


imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)

In [6]:
X_encoded = pd.DataFrame(X_imputed, columns=X.columns)

for col in X_encoded.columns:
    if X_encoded[col].dtype == 'object':
        le = LabelEncoder()
        X_encoded[col] = le.fit_transform(X_encoded[col])

In [7]:
 sscaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [8]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(),
    "Naive Bayes": GaussianNB(),
    "Multilayer Perceptron": MLPClassifier(max_iter=1000, random_state=42)
}

In [9]:
results = []
for name, model in models.items():
    print(f"训练模型：{name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"{name} -- 准确率: {acc:.4f}, 精确率: {prec:.4f}, 召回率: {rec:.4f}, F1分数: {f1:.4f}\n")

    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1 Score": f1
    })

训练模型：Logistic Regression
Logistic Regression -- 准确率: 0.9833, 精确率: 0.9868, 召回率: 0.9868, F1分数: 0.9868

训练模型：Random Forest
Random Forest -- 准确率: 1.0000, 精确率: 1.0000, 召回率: 1.0000, F1分数: 1.0000

训练模型：Support Vector Machine
Support Vector Machine -- 准确率: 0.9750, 精确率: 0.9620, 召回率: 1.0000, F1分数: 0.9806

训练模型：Naive Bayes
Naive Bayes -- 准确率: 0.9000, 精确率: 0.9444, 召回率: 0.8947, F1分数: 0.9189

训练模型：Multilayer Perceptron
Multilayer Perceptron -- 准确率: 0.9917, 精确率: 1.0000, 召回率: 0.9868, F1分数: 0.9934



In [10]:
results_df = pd.DataFrame(results)
print("所有模型性能对比：")
print(results_df)

所有模型性能对比：
                    Model  Accuracy  Precision    Recall  F1 Score
0     Logistic Regression  0.983333   0.986842  0.986842  0.986842
1           Random Forest  1.000000   1.000000  1.000000  1.000000
2  Support Vector Machine  0.975000   0.962025  1.000000  0.980645
3             Naive Bayes  0.900000   0.944444  0.894737  0.918919
4   Multilayer Perceptron  0.991667   1.000000  0.986842  0.993377


In [11]:
from sklearn.ensemble import VotingClassifier, BaggingClassifier

lr = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(random_state=42)
svm = SVC(probability=True)

In [13]:
voting_clf = VotingClassifier(
    estimators=[('lr', lr), ('rf', rf), ('svm', svm)],
    voting='soft'
)

In [14]:
bagging_clf = BaggingClassifier(estimator=lr, n_estimators=10, random_state=42)


ensemble_models_discriminative = {
    "Voting (LR+RF+SVM)": voting_clf,
    "Bagging (Logistic Regression)": bagging_clf
}

In [15]:
for name, model in ensemble_models_discriminative.items():
    print(f"训练集成模型：{name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"{name} -- 准确率: {acc:.4f}, 精确率: {prec:.4f}, 召回率: {rec:.4f}, F1分数: {f1:.4f}\n")

训练集成模型：Voting (LR+RF+SVM)
Voting (LR+RF+SVM) -- 准确率: 0.9917, 精确率: 0.9870, 召回率: 1.0000, F1分数: 0.9935

训练集成模型：Bagging (Logistic Regression)
Bagging (Logistic Regression) -- 准确率: 0.9750, 精确率: 0.9740, 召回率: 0.9868, F1分数: 0.9804



In [16]:
from sklearn.ensemble import VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [17]:
voting_clf = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(max_iter=1000)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('svm', SVC(probability=True))  # 需要设置probability=True
    ],
    voting='soft'
)
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)
print("Voting (Discriminative) F1:", f1_score(y_test, y_pred))


Voting (Discriminative) F1: 0.9934640522875817


In [18]:
bagging_lr = BaggingClassifier(
    estimator=LogisticRegression(max_iter=1000),
    n_estimators=10,
    random_state=42
)
bagging_lr.fit(X_train, y_train)
y_pred = bagging_lr.predict(X_test)
print("Bagging (Logistic) F1:", f1_score(y_test, y_pred))


Bagging (Logistic) F1: 0.9803921568627451


In [19]:
voting_generative = VotingClassifier(
    estimators=[
        ('nb', GaussianNB()),
        ('mlp', MLPClassifier(max_iter=1000, random_state=42))
    ],
    voting='soft'
)
voting_generative.fit(X_train, y_train)
y_pred = voting_generative.predict(X_test)
print("Voting (Generative) F1:", f1_score(y_test, y_pred))


Voting (Generative) F1: 0.9261744966442953


In [20]:
stacking_generative = StackingClassifier(
    estimators=[
        ('nb', GaussianNB()),
        ('mlp', MLPClassifier(max_iter=1000, random_state=42))
    ],
    final_estimator=RandomForestClassifier(random_state=42)
)
stacking_generative.fit(X_train, y_train)
y_pred = stacking_generative.predict(X_test)
print("Stacking (Generative) Recall:", recall_score(y_test, y_pred))


Stacking (Generative) Recall: 1.0


Question II:
Example of Reinforcement Learning: Robotaxi

Think about a self-driving taxi, called a Robotaxi, learning to drive in a city. The agent is the Robotaxi, with sensors like cameras and LIDAR to see the road, other cars, pedestrians, and traffic lights.

It can do actions like accelerate, brake, turn left/right, or stop.
The reward system could look like this:

+10 points for safely dropping off a passenger at their destination
-20 points for causing or being involved in an accident
+5 points for following traffic rules correctly (like stopping at red lights)
-10 points for running a red light or speeding
+2 points for smooth driving that saves battery/fuel
+3 points for picking up a new passenger quickly

The Robotaxi learns from these rewards and tries to maximize its total points. If it speeds or breaks rules, it loses points, so it learns to drive safely and efficiently to get the best score.