In [44]:
import pandas as pd
import csv
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from nonconformist.cp import IcpClassifier
from nonconformist.nc import NcFactory
import torch
from skorch import NeuralNetClassifier
from torch import nn

In [45]:
# 读取数据
rootdir = os.getcwd()
data = pd.read_csv(os.path.join(rootdir, '银行客户数据.csv'))

# 特征列表
features = ['Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

# 处理异常值（例如，非常高或非常低的CreditScore）
q_low = data['CreditScore'].quantile(0.01)
q_hi = data['CreditScore'].quantile(0.99)
data_filtered = data[(data['CreditScore'] > q_low) & (data['CreditScore'] < q_hi)]

# 数据预处理
numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
numeric_transformer = StandardScaler()

categorical_features = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
categorical_transformer = OneHotEncoder(drop='first')

In [98]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Load Data
rootdir = os.getcwd()
data = pd.read_csv(os.path.join(rootdir, '银行客户数据.csv'))

# Process Data
q_low = data['CreditScore'].quantile(0.01)
q_hi = data['CreditScore'].quantile(0.99)
data_filtered = data[(data['CreditScore'] > q_low) & (data['CreditScore'] < q_hi)]

numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
categorical_features = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X = preprocessor.fit_transform(data_filtered.drop('Exited', axis=1))
y = data_filtered['Exited'].values

# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

# DataLoader for batch processing
train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

# Neural Network Model
class BankCustomerNet(nn.Module):
    def __init__(self):
        super(BankCustomerNet, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 2)  # 修改输出单元数为2
        self.relu = nn.ReLU()
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        x = self.log_softmax(x)
        return x

# Instantiate model, loss function, and optimizer
model = BankCustomerNet()
criterion = nn.NLLLoss()  # Negative Log Likelihood Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    for inputs, labels in train_loader:
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    outputs = model(X_test)
    predicted = torch.argmax(outputs, dim=1)
    accuracy = (predicted.eq(y_test).sum() / y_test.size(0)).item()
    print(f'Accuracy: {accuracy:.4f}')

Epoch [1/10], Loss: 0.4368
Epoch [2/10], Loss: 0.3745
Epoch [3/10], Loss: 0.3160
Epoch [4/10], Loss: 0.1785
Epoch [5/10], Loss: 0.3489
Epoch [6/10], Loss: 0.4547
Epoch [7/10], Loss: 0.3133
Epoch [8/10], Loss: 0.3275
Epoch [9/10], Loss: 0.2448
Epoch [10/10], Loss: 0.3282
Accuracy: 0.8579


In [102]:
import torch
from torchcp.classification.scores import THR, APS, SAPS, RAPS
from torchcp.classification.predictors import SplitPredictor, ClusterPredictor, ClassWisePredictor

# Set up for Conformal Prediction
weight_for_saps = 1.0
penalty = 0.5
score_functions = [THR(), APS(), SAPS(weight=weight_for_saps), RAPS(penalty)]
predictors = [SplitPredictor, ClusterPredictor, ClassWisePredictor] # Assume ClusterPredictor is implemented

performance_results = []

# Calibrate Predictors and Evaluate Performance
for score_function in score_functions:
    for Predictor in predictors:
        predictor = Predictor(score_function=score_function, model=model)
        cal_dataloader = DataLoader(TensorDataset(X_train, y_train), batch_size=64, shuffle=True)

        predictor.calibrate(cal_dataloader, alpha=0.1)

        test_dataloader = DataLoader(TensorDataset(X_test, y_test), batch_size=64, shuffle=False)
        result_dict = predictor.evaluate(test_dataloader)
        coverage_rate, average_size = result_dict["Coverage_rate"], result_dict["Average_size"]
        
        performance_results.append((score_function.__class__.__name__, Predictor.__name__, coverage_rate, average_size))

# Print Performance Results
for result in performance_results:
    print(f"Score Function: {result[0]}, Predictor: {result[1]}, Coverage Rate: {result[2]:.4f}, Average Set Size: {result[3]:.4f}")

Score Function: THR, Predictor: SplitPredictor, Coverage Rate: 0.9003, Average Set Size: 1.1021
Score Function: THR, Predictor: ClusterPredictor, Coverage Rate: 0.8998, Average Set Size: 1.0988
Score Function: THR, Predictor: ClassWisePredictor, Coverage Rate: 0.8951, Average Set Size: 1.2860
Score Function: APS, Predictor: SplitPredictor, Coverage Rate: 0.9008, Average Set Size: 1.2140
Score Function: APS, Predictor: ClusterPredictor, Coverage Rate: 0.8871, Average Set Size: 1.2041
Score Function: APS, Predictor: ClassWisePredictor, Coverage Rate: 0.9059, Average Set Size: 1.4567
Score Function: SAPS, Predictor: SplitPredictor, Coverage Rate: 0.8946, Average Set Size: 1.1613
Score Function: SAPS, Predictor: ClusterPredictor, Coverage Rate: 0.9026, Average Set Size: 1.1839
Score Function: SAPS, Predictor: ClassWisePredictor, Coverage Rate: 0.8890, Average Set Size: 1.5320
Score Function: RAPS, Predictor: SplitPredictor, Coverage Rate: 0.8989, Average Set Size: 1.1176
Score Function: RA

对于纯数据的二分类预测问题，需要仔细调试使得logit和label的输出匹配，另外在神经网络的输出需要用softmax函数处理