In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from collections import Counter

In [3]:
class KNN:
    """Brute-force k-nearest-neighbours classifier implemented with NumPy.

    Parameters
    ----------
    k : int
        Number of neighbours that vote for each query point.
    distance_metric : {'euclidean', 'manhattan'}
        Metric used by ``compute_distance``.
    normalize : bool
        When True, ``fit`` learns a ``StandardScaler`` on the training data and
        the same scaler is applied to every query in ``predict`` /
        ``predict_proba``.
    """

    def __init__(self, k=3, distance_metric='euclidean', normalize=False):
        self.k = k
        self.distance_metric = distance_metric
        self.normalize = normalize
        self.X_train = None          # training features, set by fit()
        self.y_train = None          # training labels, set by fit()
        self.scaler = None           # StandardScaler, only when normalize=True
        self.classes = None          # sorted unique labels seen in fit()
        self.class_to_index = None   # label -> column index in predict_proba output

    def fit(self, X, y):
        """Store the training set (optionally standardised) and the label set.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or ``X`` and ``y`` differ in length.
        """
        if self.k < 1:
            raise ValueError(f"k must be at least 1, got {self.k}")

        # Coerce to ndarrays so lists / DataFrames / Series all support the
        # fancy indexing used later (X[:, np.newaxis], y[neighbor_ids]).
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )

        if self.normalize:
            self.scaler = StandardScaler()
            X = self.scaler.fit_transform(X)
            print("训练数据已标准化。")
        self.X_train = X
        self.y_train = y
        self.classes = np.unique(y)
        self.class_to_index = {cls: idx for idx, cls in enumerate(self.classes)}
        print("KNN模型已拟合训练数据。")
        print(f"类别标签: {self.classes}")

    def compute_distance(self, X1, X2):
        """Return the pairwise distance matrix of shape (len(X1), len(X2)).

        The computation broadcasts to an intermediate array of shape
        (len(X1), len(X2), n_features), so callers should keep ``X1`` small
        (``predict`` and ``predict_proba`` do this by batching).

        Raises
        ------
        ValueError
            If ``distance_metric`` is neither 'euclidean' nor 'manhattan'.
        """
        X1 = np.asarray(X1)
        X2 = np.asarray(X2)
        if self.distance_metric == 'euclidean':
            distances = np.sqrt(np.sum((X1[:, np.newaxis] - X2) ** 2, axis=2))
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(X1[:, np.newaxis] - X2), axis=2)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")
        return distances

    def _check_ready(self):
        """Raise ValueError unless fit() has been called (and the scaler exists if needed)."""
        if self.X_train is None or self.y_train is None:
            raise ValueError("KNN模型尚未拟合任何数据，请先调用fit方法。")
        if self.normalize and self.scaler is None:
            raise ValueError("缩放器未初始化，请确保在拟合时设置了normalize=True。")

    @staticmethod
    def _check_batch_size(batch_size):
        """Reject batch sizes that would make the batching loop misbehave."""
        if batch_size < 1:
            raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    def _neighbor_labels(self, X_batch):
        """Labels of the k nearest training points for each row of ``X_batch``.

        Columns are ordered from nearest to farthest. A stable sort is used so
        that training points at identical distance keep their training order,
        which makes tie-breaking deterministic.
        """
        distances = self.compute_distance(X_batch, self.X_train)
        nearest_ids = np.argsort(distances, axis=1, kind='stable')[:, :self.k]
        return self.y_train[nearest_ids]

    def predict(self, X, batch_size=100):
        """Predict a label for each row of ``X`` by majority vote.

        Queries are processed ``batch_size`` rows at a time to bound the size
        of the distance matrix. Vote ties go to the label encountered first
        when walking the neighbours from nearest to farthest.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
        """
        self._check_ready()
        self._check_batch_size(batch_size)

        X = np.asarray(X)
        n_samples = X.shape[0]
        if n_samples == 0:
            # np.concatenate([]) would raise; an empty query has an empty answer.
            return np.empty(0, dtype=self.y_train.dtype)

        predictions = []
        for start in range(0, n_samples, batch_size):
            X_batch = X[start:start + batch_size]
            if self.normalize:
                X_batch = self.scaler.transform(X_batch)

            neighbor_labels = self._neighbor_labels(X_batch)
            predictions.append(
                np.array([Counter(row).most_common(1)[0][0] for row in neighbor_labels])
            )
        return np.concatenate(predictions)

    def predict_proba(self, X, batch_size=100):
        """Estimate class probabilities as the fraction of neighbours per class.

        Columns follow the order of ``self.classes``. Queries are processed
        ``batch_size`` rows at a time, like ``predict``, so memory use does not
        grow with the number of query rows.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_classes)
        """
        self._check_ready()
        self._check_batch_size(batch_size)

        X = np.asarray(X)
        n_samples = X.shape[0]
        n_classes = len(self.classes)
        proba = np.zeros((n_samples, n_classes))
        if n_samples == 0:
            return proba

        if self.normalize:
            X = self.scaler.transform(X)
            print("测试数据已标准化。")

        # Fewer than k neighbours exist when k exceeds the training set size;
        # divide by the number actually used so each row still sums to 1.
        n_neighbors = min(self.k, self.X_train.shape[0])

        for start in range(0, n_samples, batch_size):
            neighbor_labels = self._neighbor_labels(X[start:start + batch_size])
            for offset, neighbors in enumerate(neighbor_labels):
                for cls, cnt in Counter(neighbors).items():
                    if cls in self.class_to_index:
                        proba[start + offset, self.class_to_index[cls]] = cnt / n_neighbors
        return proba

In [9]:
# Define data preprocessing function
def preprocess_data(train_path, test_path, random_state=41, ratio=0.2):
    """Load the train/test CSVs and return standardised train/val/test arrays.

    Steps: drop the text columns ``Surname``, ``Gender`` and ``Geography``,
    fill missing values, take the last training column as the label, make a
    stratified train/validation split, and standardise all feature matrices
    with a scaler fitted on the training split only.

    Parameters
    ----------
    train_path, test_path : str or path-like
        CSV files; the training file must carry the label in its last column.
    random_state : int
        Seed for the stratified split.
    ratio : float
        Fraction of the training rows held out for validation.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, test_data, scaler)`` where
        ``test_data`` is the cleaned (unscaled) test DataFrame and ``scaler``
        is the fitted ``StandardScaler``.

    Raises
    ------
    ValueError
        If the test file does not have the same number of feature columns as
        the training file.
    """
    from sklearn.model_selection import train_test_split

    # Other columns (e.g. 'id', 'CustomerId') are kept as features.
    dropped_columns = ['Surname', 'Gender', 'Geography']
    train_data = _fill_missing(pd.read_csv(train_path).drop(dropped_columns, axis=1))
    test_data = _fill_missing(pd.read_csv(test_path).drop(dropped_columns, axis=1))

    X = train_data.iloc[:, :-1].to_numpy()
    y = train_data.iloc[:, -1].to_numpy()

    # Encode object / string labels as integers (dtype kinds O, U, S).
    if y.dtype.kind in ('O', 'U', 'S'):
        le = LabelEncoder()
        y = le.fit_transform(y)
        print("标签已编码为整数。")

    X_test = test_data.to_numpy()
    if X_test.shape[1] != X.shape[1]:
        raise ValueError(
            f"test data has {X_test.shape[1]} feature columns but training data has {X.shape[1]}"
        )

    X_train, X_val, y_train, y_val = train_test_split(
        X,
        y,
        test_size=ratio,
        random_state=random_state,
        shuffle=True,
        stratify=y,
    )
    print(f"train set: {X_train.shape[0]} ")
    print(f"val set: {X_val.shape[0]} ")
    print(f"test set: {X_test.shape[0]} ")

    # Fit the scaler on the training split only to avoid leaking
    # validation/test statistics into the features.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)
    print("normalize complete")

    return X_train, X_val, X_test, y_train, y_val, test_data, scaler


def _fill_missing(frame):
    """Forward-fill missing values, then back-fill any that remain at the top.

    ``DataFrame.ffill()`` replaces the deprecated ``fillna(method='ffill')``.
    The trailing ``bfill()`` covers a NaN in the first row, which a forward
    fill alone cannot reach.
    """
    return frame.ffill().bfill()

In [6]:
def cross_validate(X, y, knn, n_splits=5,random_state=41):
    """Score a KNN configuration with stratified, shuffled k-fold cross-validation.

    ``knn`` is used only as a template: a new ``KNN`` with the same ``k``,
    ``distance_metric`` and ``normalize`` settings is fitted in every fold.

    Parameters
    ----------
    X, y : array-like supporting integer-array indexing
        Features and labels.
    knn : KNN
        Template model whose hyperparameters are copied.
    n_splits : int
        Number of folds.
    random_state : int
        Seed for the fold shuffling.

    Returns
    -------
    (dict, dict)
        ``mean_metrics`` maps each metric name to its mean over the folds;
        ``metrics_scores`` maps each metric name to the list of per-fold values.
        Metric names: 'roc_auc', 'accuracy', 'precision', 'recall', 'f1_score'
        (precision / recall / F1 are macro-averaged).
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    metric_names = ('roc_auc', 'accuracy', 'precision', 'recall', 'f1_score')
    metrics_scores = {name: [] for name in metric_names}
    macro_options = {'average': 'macro', 'zero_division': 0}

    for train_index, val_index in splitter.split(X, y):
        X_train_fold, X_val_fold = X[train_index], X[val_index]
        y_train_fold, y_val_fold = y[train_index], y[val_index]

        fold_model = KNN(k=knn.k, distance_metric=knn.distance_metric, normalize=knn.normalize)
        fold_model.fit(X_train_fold, y_train_fold)

        fold_proba = fold_model.predict_proba(X_val_fold)
        fold_pred = fold_model.predict(X_val_fold)

        if len(np.unique(y)) == 2:
            # Binary task: score with the probability of the second (positive) class.
            positive_column = fold_model.class_to_index[fold_model.classes[1]]
            fold_auc = roc_auc_score(y_val_fold, fold_proba[:, positive_column])
        else:
            fold_auc = roc_auc_score(y_val_fold, fold_proba, multi_class='ovr')

        fold_results = {
            'roc_auc': fold_auc,
            'accuracy': accuracy_score(y_val_fold, fold_pred),
            'precision': precision_score(y_val_fold, fold_pred, **macro_options),
            'recall': recall_score(y_val_fold, fold_pred, **macro_options),
            'f1_score': f1_score(y_val_fold, fold_pred, **macro_options),
        }
        for name in metric_names:
            metrics_scores[name].append(fold_results[name])

    mean_metrics = {name: np.mean(values) for name, values in metrics_scores.items()}

    return mean_metrics, metrics_scores


In [28]:
# Load and preprocess data
X_train, X_val, X_test, y_train, y_val, test_data, scaler = preprocess_data(
    './train.csv',
    './test.csv',
    random_state=50,
    ratio=0.4,
)

# Sweep the neighbour count k over 3..49 and report hold-out metrics for each value.
# Note: the printed label reads "number of clusters", but i is KNN's neighbour count.
# cross_validate(X_train, y_train, knn, n_splits=5) can be used instead of the
# single hold-out split evaluated here.
for i in range(3, 50):
    print("\n聚类数目为： ",i)

    # Features were already standardised inside preprocess_data, hence normalize=False.
    knn = KNN(k=i, distance_metric='euclidean', normalize=False)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_val)

    # Accuracy plus macro-averaged precision / recall / F1 on the validation split.
    accuracy = accuracy_score(y_val, y_pred)
    precision, recall, f1 = (
        scorer(y_val, y_pred, average='macro', zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )

    print(f"验证集准确率: {accuracy * 100:.2f}%")
    print(f"验证集精确率: {precision:.4f}")
    print(f"验证集召回率: {recall:.4f}")
    print(f"验证集F1分数: {f1:.4f}")

    # Validation ROC AUC: positive-class probability for binary labels,
    # one-vs-rest over the full probability matrix otherwise.
    if len(np.unique(y_train)) != 2:
        y_val_proba = knn.predict_proba(X_val)
        auc = roc_auc_score(y_val, y_val_proba, multi_class='ovr')
    else:
        positive_class = knn.classes[1]
        positive_column = knn.class_to_index[positive_class]
        y_val_proba = knn.predict_proba(X_val)[:, positive_column]
        auc = roc_auc_score(y_val, y_val_proba)
    print(f"验证集ROC AUC分数: {auc:.4f}")
    

# 
# # TODO: hyperparameter tuning
# 
# 
# # TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
# knn = ...
# knn.fit(X, y)
# test_predictions = knn.predict(X_test)
# 
# # Save test predictions
# pd.DataFrame({'id': pd.read_csv('/path/of/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

训练集大小: 9000 条记录。
验证集大小: 6000 条记录。
测试集大小: 10000 条记录。
归一化处理已完成。

聚类数目为：  3
KNN模型已拟合训练数据。
类别标签: [0. 1.]
验证集准确率: 85.75%
验证集精确率: 0.7866
验证集召回率: 0.7448
验证集F1分数: 0.7623
验证集ROC AUC分数: 0.8311

聚类数目为：  4
KNN模型已拟合训练数据。
类别标签: [0. 1.]
验证集准确率: 85.82%
验证集精确率: 0.7893
验证集召回率: 0.7419
验证集F1分数: 0.7612
验证集ROC AUC分数: 0.8504

聚类数目为：  5
KNN模型已拟合训练数据。
类别标签: [0. 1.]
验证集准确率: 86.62%
验证集精确率: 0.8076
验证集召回率: 0.7472
验证集F1分数: 0.7709
验证集ROC AUC分数: 0.8620

聚类数目为：  6
KNN模型已拟合训练数据。
类别标签: [0. 1.]
验证集准确率: 86.62%
验证集精确率: 0.8070
验证集召回率: 0.7484
验证集F1分数: 0.7716
验证集ROC AUC分数: 0.8708

聚类数目为：  7
KNN模型已拟合训练数据。
类别标签: [0. 1.]
验证集准确率: 87.13%
验证集精确率: 0.8204
验证集召回率: 0.7507
验证集F1分数: 0.7774
验证集ROC AUC分数: 0.8761

聚类数目为：  8
KNN模型已拟合训练数据。
类别标签: [0. 1.]
验证集准确率: 87.07%
验证集精确率: 0.8195
验证集召回率: 0.7491
验证集F1分数: 0.7759
验证集ROC AUC分数: 0.8782

聚类数目为：  9
KNN模型已拟合训练数据。
类别标签: [0. 1.]
验证集准确率: 87.25%
验证集精确率: 0.8247
验证集召回率: 0.7496
验证集F1分数: 0.7778
验证集ROC AUC分数: 0.8830

聚类数目为：  10
KNN模型已拟合训练数据。
类别标签: [0. 1.]
验证集准确率: 87.30%
验证集精确率: 0.8270
验证集召回率: 0.7487
验证集F1分

In [14]:
# ratio=0.001 keeps only a token validation split, so the model is fitted on
# nearly all labelled rows before predicting the test set.
X_train, X_val, X_test, y_train, y_val, test_data, scaler = preprocess_data(
    './train.csv',
    './test.csv',
    random_state=50,
    ratio=0.001,
)
knn = KNN(k=23, distance_metric='euclidean', normalize=False)
knn.fit(X_train, y_train)
test_predictions = knn.predict(X_test)

# Save test predictions, keyed by the ids read from the raw test file.
test_ids = pd.read_csv('./test.csv')['id']
submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)

KNN模型已拟合训练数据。
类别标签: [0. 1.]
