In [3]:
import numpy as np
from FastKNN import KNNInnerProduct
import time

# 1. Problem size and reproducibility settings
N_TRAIN_SAMPLES = 50000
N_TEST_SAMPLES = 10000
N_FEATURES = 900
K_NEIGHBORS = 100
SEED = 42  # fixed so that Restart & Run All regenerates the exact same dataset

# A local Generator keeps every draw reproducible without mutating NumPy's
# global random state.
rng = np.random.default_rng(SEED)

print("--- 大规模 KNN (内积) 测试 ---")
print(f"训练样本数: {N_TRAIN_SAMPLES}")
print(f"测试样本数: {N_TEST_SAMPLES}")
print(f"特征维度: {N_FEATURES}")
print(f"K 值: {K_NEIGHBORS}")
print("-" * 30)

# 2. Generate the training data (two classes)
print("正在生成训练数据...")
# Each class is Gaussian noise (std 0.5) around its own centre vector; the
# centres are themselves random vectors scattered around +1.0 and -1.0.
center_0 = rng.standard_normal((1, N_FEATURES)) * 0.5 + 1.0  # centre of class 0
center_1 = rng.standard_normal((1, N_FEATURES)) * 0.5 - 1.0  # centre of class 1

# Split as evenly as possible; class 1 takes the extra sample when the total is odd.
n_class_0 = N_TRAIN_SAMPLES // 2
n_class_1 = N_TRAIN_SAMPLES - n_class_0

X_train_0 = rng.standard_normal((n_class_0, N_FEATURES)) * 0.5 + center_0
y_train_0 = np.zeros(n_class_0, dtype=int)

X_train_1 = rng.standard_normal((n_class_1, N_FEATURES)) * 0.5 + center_1
y_train_1 = np.ones(n_class_1, dtype=int)

X_train = np.vstack((X_train_0, X_train_1))
y_train = np.hstack((y_train_0, y_train_1))

# Shuffle samples and labels with the same permutation so they stay aligned.
indices = rng.permutation(N_TRAIN_SAMPLES)
X_train = X_train[indices]
y_train = y_train[indices]

# 3. Generate the test data (same class centres as the training data)
print("正在生成测试数据...")
n_test_0 = N_TEST_SAMPLES // 2
n_test_1 = N_TEST_SAMPLES - n_test_0

X_test_0 = rng.standard_normal((n_test_0, N_FEATURES)) * 0.5 + center_0
y_test_0 = np.zeros(n_test_0, dtype=int)

X_test_1 = rng.standard_normal((n_test_1, N_FEATURES)) * 0.5 + center_1
y_test_1 = np.ones(n_test_1, dtype=int)

# The test set is left unshuffled: all class-0 rows come first, then class 1.
X_test = np.vstack((X_test_0, X_test_1))
y_test_true = np.hstack((y_test_0, y_test_1))  # ground-truth labels kept for evaluation

print(f"\n训练数据 X 形状: {X_train.shape}")
print(f"训练数据 y 形状: {y_train.shape}")
print(f"测试数据 X 形状: {X_test.shape}")
print(f"测试数据 y 形状: {y_test_true.shape}")
print("-" * 30)

# 4. Initialise and "train" the model
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() can jump when the system clock is adjusted.
# The timed region covers both construction and fit().
fit_start = time.perf_counter()

knn = KNNInnerProduct(k=K_NEIGHBORS)
knn.fit(X_train, y_train)

fit_seconds = time.perf_counter() - fit_start
print("-" * 30)
print(f"总训练时间: {fit_seconds:.4f} 秒")

# 5. Predict and time the prediction
print("\n开始预测...")
predict_start = time.perf_counter()

predictions = knn.predict(X_test)

predict_seconds = time.perf_counter() - predict_start
print("-" * 30)
print(f"总预测时间: {predict_seconds:.4f} 秒")

# 6. Evaluate the result
# Guard against silent broadcasting: if the shapes differed, `==` could
# broadcast and np.mean would report a meaningless accuracy.
predictions = np.asarray(predictions)
assert predictions.shape == y_test_true.shape, (
    f"predictions shape {predictions.shape} does not match "
    f"labels shape {y_test_true.shape}"
)
accuracy = np.mean(predictions == y_test_true)
print(f"预测准确率: {accuracy * 100:.2f}%")

print("\n前 20 个测试样本的预测结果 (真实 vs 预测):")
print(f"真实: {y_test_true[:20]}")
print(f"预测: {predictions[:20]}")

--- 大规模 KNN (内积) 测试 ---
训练样本数: 50000
测试样本数: 10000
特征维度: 900
K 值: 100
------------------------------
正在生成训练数据...
正在生成测试数据...

训练数据 X 形状: (50000, 900)
训练数据 y 形状: (50000,)
测试数据 X 形状: (10000, 900)
测试数据 y 形状: (10000,)
------------------------------
------------------------------
总训练时间: 0.0015 秒

开始预测...
------------------------------
总预测时间: 34.1468 秒
预测准确率: 100.00%

前 20 个测试样本的预测结果 (真实 vs 预测):
真实: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
预测: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [4]:
from FastKNN import FaissKNNInnerProduct


# 1. Problem size (same values as the previous benchmark cell) and
#    reproducibility settings
N_TRAIN_SAMPLES = 50000
N_TEST_SAMPLES = 10000
N_FEATURES = 900
K_NEIGHBORS = 100
SEED = 42  # fixed so that every run of this cell regenerates the same dataset

# A local Generator keeps every draw reproducible without mutating NumPy's
# global random state.
# NOTE: this cell regenerates its own dataset (same sizes and distribution)
# rather than reusing arrays created by an earlier cell.
rng = np.random.default_rng(SEED)

print("--- 大规模 KNN (Faiss IndexFlatIP) 测试 ---")
print(f"训练样本数: {N_TRAIN_SAMPLES}")
print(f"测试样本数: {N_TEST_SAMPLES}")
print(f"特征维度: {N_FEATURES}")
print(f"K 值: {K_NEIGHBORS}")
print("-" * 30)

# 2. Generate the training data, cast to float32
#    (presumably because the Faiss-backed index expects float32 input — confirm
#    against FaissKNNInnerProduct)
print("正在生成训练数据 (float32)...")
# Each class is Gaussian noise (std 0.5) around its own centre vector; the
# centres are themselves random vectors scattered around +1.0 and -1.0.
center_0 = (rng.standard_normal((1, N_FEATURES)) * 0.5 + 1.0).astype(np.float32)
center_1 = (rng.standard_normal((1, N_FEATURES)) * 0.5 - 1.0).astype(np.float32)

# Split as evenly as possible; class 1 takes the extra sample when the total is odd.
n_class_0 = N_TRAIN_SAMPLES // 2
n_class_1 = N_TRAIN_SAMPLES - n_class_0

X_train_0 = (rng.standard_normal((n_class_0, N_FEATURES)) * 0.5 + center_0).astype(np.float32)
y_train_0 = np.zeros(n_class_0, dtype=int)

X_train_1 = (rng.standard_normal((n_class_1, N_FEATURES)) * 0.5 + center_1).astype(np.float32)
y_train_1 = np.ones(n_class_1, dtype=int)

X_train = np.vstack((X_train_0, X_train_1))
y_train = np.hstack((y_train_0, y_train_1))

# Shuffle samples and labels with the same permutation so they stay aligned.
indices = rng.permutation(N_TRAIN_SAMPLES)
X_train = X_train[indices]
y_train = y_train[indices]

# 3. Generate the test data, cast to float32 (same class centres as training)
print("正在生成测试数据 (float32)...")
n_test_0 = N_TEST_SAMPLES // 2
n_test_1 = N_TEST_SAMPLES - n_test_0

X_test_0 = (rng.standard_normal((n_test_0, N_FEATURES)) * 0.5 + center_0).astype(np.float32)
y_test_0 = np.zeros(n_test_0, dtype=int)

X_test_1 = (rng.standard_normal((n_test_1, N_FEATURES)) * 0.5 + center_1).astype(np.float32)
y_test_1 = np.ones(n_test_1, dtype=int)

# The test set is left unshuffled: all class-0 rows come first, then class 1.
X_test = np.vstack((X_test_0, X_test_1))
y_test_true = np.hstack((y_test_0, y_test_1))  # ground-truth labels kept for evaluation

print(f"\n训练数据 X 形状: {X_train.shape} (Dtype: {X_train.dtype})")
print(f"测试数据 X 形状: {X_test.shape} (Dtype: {X_test.dtype})")
print("-" * 30)

# 4. Initialise the Faiss-backed KNN
knn_faiss = FaissKNNInnerProduct(k=K_NEIGHBORS)

# 5. "Train" (build the index) and time it
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() can jump when the system clock is adjusted.
print("开始 Fit (构建索引)...")
start_fit = time.perf_counter()
knn_faiss.fit(X_train, y_train)
end_fit = time.perf_counter()
print(f"Faiss Fit (Indexing) 总耗时: {end_fit - start_fit:.4f} 秒")

# 6. Predict and time the prediction
print("\n开始 Predict (搜索)...")
predict_start = time.perf_counter()

predictions = knn_faiss.predict(X_test)

predict_seconds = time.perf_counter() - predict_start
print("-" * 30)
print(f"Faiss 总预测时间: {predict_seconds:.4f} 秒")

# 7. Evaluate the result
# Guard against silent broadcasting: if the shapes differed, `==` could
# broadcast and np.mean would report a meaningless accuracy.
predictions = np.asarray(predictions)
assert predictions.shape == y_test_true.shape, (
    f"predictions shape {predictions.shape} does not match "
    f"labels shape {y_test_true.shape}"
)
accuracy = np.mean(predictions == y_test_true)
print(f"预测准确率: {accuracy * 100:.2f}%")

print("\n前 20 个测试样本的预测结果 (真实 vs 预测):")
print(f"真实: {y_test_true[:20]}")
print(f"预测: {predictions[:20]}")

--- 大规模 KNN (Faiss IndexFlatIP) 测试 ---
训练样本数: 50000
测试样本数: 10000
特征维度: 900
K 值: 100
------------------------------
正在生成训练数据 (float32)...
正在生成测试数据 (float32)...

训练数据 X 形状: (50000, 900) (Dtype: float32)
测试数据 X 形状: (10000, 900) (Dtype: float32)
------------------------------
开始 Fit (构建索引)...
Faiss: 正在向索引添加数据...
Faiss: 添加了 50000 个向量
Faiss Fit (Indexing) 总耗时: 0.0345 秒

开始 Predict (搜索)...
Faiss: 正在搜索...
Faiss: 搜索完毕，耗时: 0.7468 秒
------------------------------
Faiss 总预测时间: 0.7649 秒
预测准确率: 100.00%

前 20 个测试样本的预测结果 (真实 vs 预测):
真实: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
预测: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
