In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

# 数据预处理函数
def preprocess_database(input_path, output_path):
    """Standardise the raw patient spreadsheet and write the result as CSV.

    The '性别' column is converted to a numeric code (0 = male, 1 = female)
    and every column in ``numeric_cols`` is then standardised with a freshly
    fitted ``StandardScaler``. Columns not listed there are written unchanged.

    Args:
        input_path: Path of the Excel workbook, read with ``pd.read_excel``.
            It must contain '性别' and every column named in ``numeric_cols``.
        output_path: Path of the CSV to write (the index is not written).

    Returns:
        tuple: ``(scaler, numeric_cols)`` - the fitted scaler and the ordered
        list of column names it was fitted on. The scaler is only returned,
        it is not persisted to disk.

    Raises:
        ValueError: If '性别' holds a non-missing value that is not one of the
            recognised gender spellings.
    """
    # Load the raw data
    df = pd.read_excel(input_path)

    # Encode the categorical gender column. The spellings match what
    # PatientSearch._preprocess_input accepts for a query ('男' / 'M'), plus
    # their female counterparts and already-encoded 0/1 values.
    gender_codes = {'M': 0, 'F': 1, '男': 0, '女': 1, 0: 0, 1: 1}

    def _encode_gender(value):
        key = value.strip().upper() if isinstance(value, str) else value
        return gender_codes.get(key, np.nan)

    raw_gender = df['性别']
    encoded_gender = raw_gender.map(_encode_gender)

    # A plain dict-based map would turn unknown spellings into NaN without any
    # warning; fail loudly instead. Genuinely missing cells stay missing.
    unknown = raw_gender[encoded_gender.isna() & raw_gender.notna()]
    if not unknown.empty:
        raise ValueError(
            "Unrecognised gender value(s) in column '性别': "
            f"{sorted(set(map(str, unknown)))}"
        )
    df['性别'] = encoded_gender

    # Columns to standardise (adjust to the actual column names)
    numeric_cols = ['性别', '年龄', '眼轴OD', '眼轴OS', '角膜曲率ODK1', '角膜曲率ODK2' , '角膜曲率OSK1', '角膜曲率OSK2', '视力OD', '视力OS']

    # Standardise
    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    # Persist the processed table; the fitted scaler is handed back to the caller
    df.to_csv(output_path, index=False)
    return scaler, numeric_cols

# 使用示例（需替换实际路径）
# Run the preprocessing once; the fitted scaler and the column list are reused
# by the search code below.
scaler, numeric_cols = preprocess_database(
    '/Users/txh/Desktop/research/lwk/ok/Data_processed/patient_search.xlsx',
    '/Users/txh/Desktop/research/lwk/ok/Data_processed/processed_data.csv',
)

class PatientSearch:
    """Nearest-neighbour lookup over a table of previously recorded patients.

    The CSV at ``database_path`` is compared directly against queries that
    have been passed through ``scaler.transform``, so it is assumed to hold the
    feature columns already standardised by that same scaler.
    """

    def __init__(self, database_path, scaler, numeric_cols):
        """Load the database and remember how queries must be scaled.

        Args:
            database_path: CSV file read with ``pd.read_csv``.
            scaler: Object exposing ``transform``; applied to query features.
            numeric_cols: Ordered feature column names; used both for scaling
                and for the distance computation.
        """
        self.df = pd.read_csv(database_path)
        self.feature_cols = numeric_cols
        self.target_col = '镜片参数'
        self.scaler = scaler
        self.numeric_cols = numeric_cols

    @staticmethod
    def _encode_gender(value):
        """Map a gender value to its numeric code (0 = male, 1 = female).

        Values that are already the numeric code 0 or 1 are passed through, so
        encoding is idempotent. '男' and 'M' map to 0; anything else maps to 1.
        """
        if not isinstance(value, str) and value in (0, 1):
            return int(value)
        return 0 if value in ['男', 'M'] else 1

    def _preprocess_input(self, new_patient):
        """Encode and standardise one query patient.

        Args:
            new_patient: Mapping with a '性别' entry and every feature column.
                It is NOT modified.

        Returns:
            pandas.DataFrame: Single-row frame whose feature columns have been
            transformed by ``self.scaler``.

        Raises:
            KeyError: If '性别' or a feature column is missing.
        """
        # Work on a copy: overwriting the caller's dict would make a second
        # search with the same dict see the stored code instead of 'M'/'男'.
        patient = dict(new_patient)
        patient['性别'] = self._encode_gender(patient['性别'])

        input_df = pd.DataFrame([patient])

        # Standardise with the scaler fitted on the database
        input_df[self.numeric_cols] = self.scaler.transform(input_df[self.numeric_cols])
        return input_df

    def find_similar(self, new_patient, n=3):
        """Return the ``n`` database rows closest to ``new_patient``.

        Closeness is the Euclidean distance over ``self.feature_cols`` after
        standardisation.

        Args:
            new_patient: Mapping of raw (unscaled) feature values.
            n: Number of neighbours to return.

        Returns:
            pandas.DataFrame: Feature columns, the lens-parameter column and a
            'distance' column, sorted by ascending distance. ``self.df`` is
            left untouched.
        """
        input_df = self._preprocess_input(new_patient)

        # One vectorised distance computation instead of a per-row loop
        query = input_df[self.feature_cols].to_numpy(dtype=float)[0]
        features = self.df[self.feature_cols].to_numpy(dtype=float)
        distances = np.linalg.norm(features - query, axis=1)

        # assign() builds a new frame, so repeated searches do not accumulate
        # state on the shared database
        similar_patients = self.df.assign(distance=distances).sort_values('distance').head(n)

        return similar_patients[[*self.feature_cols, self.target_col, 'distance']]

# 使用示例
if __name__ == "__main__":
    # Build the searcher on the standardised database written above
    searcher = PatientSearch(
        '/Users/txh/Desktop/research/lwk/ok/Data_processed/processed_data.csv',
        scaler,
        numeric_cols,
    )

    # Raw (unscaled) measurements of an example patient
    new_patient = {
        '年龄': 8, '性别': 'M',
        '角膜曲率ODK1': 40.95, '角膜曲率ODK2': 41.95,
        '角膜曲率OSK1': 41.12, '角膜曲率OSK2': 42.6,
        '眼轴OD': 24.8, '眼轴OS': 24.7,
        '视力OD': 0.8, '视力OS': 0.85,
    }

    # Look up the nearest neighbours (default n=3)
    results = searcher.find_similar(new_patient)

    # Show the measurements, lens parameters and distance of the matches
    print("最相似的3个患者镜片参数：")
    print(results[[
        '眼轴OD',
        '眼轴OS',
        '角膜曲率ODK1',
        '角膜曲率ODK2',
        '角膜曲率OSK1',
        '角膜曲率OSK2',
        '视力OD',
        '视力OS',
        '镜片参数',
        'distance',
    ]])

最相似的3个患者镜片参数：
        眼轴OD      眼轴OS  角膜曲率ODK1  角膜曲率ODK2  角膜曲率OSK1  角膜曲率OSK2      视力OD  \
0   0.490888  0.342411 -1.438627 -1.379425 -1.292703 -0.930676  1.678682   
15  0.851873  0.672430 -0.975526 -1.048676 -1.341937 -1.314732  1.678682   
1   1.138172  0.884586 -1.203514 -1.035955 -1.011367 -0.747792  0.295543   

        视力OS                  镜片参数  distance  
0   1.759549      42.40/-3.00/10.8  0.229938  
15  0.871233  41.00/-3.25/10.8/5.8  1.428331  
1   0.871233      41.75/-6.00/10.7  2.062074  


## 含准确率版本

In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from collections import Counter

# 数据预处理函数
def preprocess_database(input_path, output_path):
    """Standardise the raw patient spreadsheet and write the result as CSV.

    The '性别' column is converted to a numeric code (0 = male, 1 = female)
    and every column in ``numeric_cols`` is then standardised with a freshly
    fitted ``StandardScaler``. Columns not listed there (such as '镜片参数')
    are written unchanged.

    Args:
        input_path: Path of the Excel workbook, read with ``pd.read_excel``.
            It must contain '性别' and every column named in ``numeric_cols``.
        output_path: Path of the CSV to write (the index is not written).

    Returns:
        tuple: ``(scaler, numeric_cols)`` - the fitted scaler and the ordered
        list of column names it was fitted on. The scaler is only returned,
        it is not persisted to disk.

    Raises:
        ValueError: If '性别' holds a non-missing value that is not one of the
            recognised gender spellings.
    """
    # Load the raw data
    df = pd.read_excel(input_path)

    # Encode the categorical gender column. The spellings match what
    # PatientSearch._preprocess_input accepts for a query ('男' / 'M'), plus
    # their female counterparts and already-encoded 0/1 values.
    gender_codes = {'M': 0, 'F': 1, '男': 0, '女': 1, 0: 0, 1: 1}

    def _encode_gender(value):
        key = value.strip().upper() if isinstance(value, str) else value
        return gender_codes.get(key, np.nan)

    raw_gender = df['性别']
    encoded_gender = raw_gender.map(_encode_gender)

    # A plain dict-based map would turn unknown spellings into NaN without any
    # warning; fail loudly instead. Genuinely missing cells stay missing.
    unknown = raw_gender[encoded_gender.isna() & raw_gender.notna()]
    if not unknown.empty:
        raise ValueError(
            "Unrecognised gender value(s) in column '性别': "
            f"{sorted(set(map(str, unknown)))}"
        )
    df['性别'] = encoded_gender

    # Columns to standardise (the string-valued '镜片参数' is excluded)
    numeric_cols = ['性别', '年龄', '眼轴OD', '眼轴OS', '角膜曲率ODK1', '角膜曲率ODK2' , '角膜曲率OSK1', '角膜曲率OSK2', '视力OD', '视力OS']

    # Standardise
    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    # Persist the processed table; the fitted scaler is handed back to the caller
    df.to_csv(output_path, index=False)
    return scaler, numeric_cols

# 预处理数据库
# Standardise the raw workbook and keep the fitted scaler for later queries
scaler, numeric_cols = preprocess_database(
    input_path='/Users/txh/Desktop/research/lwk/ok/Data_processed/patient_search.xlsx',
    output_path='/Users/txh/Desktop/research/lwk/ok/Data_processed/processed_data.csv',
)

import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score, f1_score

class PatientSearch:
    """k-nearest-neighbour lens-parameter lookup with a leave-one-out check.

    The CSV at ``database_path`` is compared directly against queries that
    have been passed through ``scaler.transform``, so it is assumed to hold the
    feature columns already standardised by that same scaler.
    """

    # Lower bound applied to distances before inverting them, so an exact
    # match (distance 0) gets a very large finite weight instead of inf.
    _MIN_DISTANCE = 1e-12

    def __init__(self, database_path, scaler, numeric_cols):
        """Load the database and remember how queries must be scaled.

        Args:
            database_path: CSV file read with ``pd.read_csv``.
            scaler: Object exposing ``transform``; applied to query features.
            numeric_cols: Ordered numeric feature column names; used both for
                scaling and for the distance computation.
        """
        self.df = pd.read_csv(database_path)
        self.feature_cols = numeric_cols  # numeric features only
        self.target_col = '镜片参数'      # lens parameters, stored as strings
        self.scaler = scaler
        self.numeric_cols = numeric_cols

    @staticmethod
    def _encode_gender(value):
        """Map a gender value to its numeric code (0 = male, 1 = female).

        Values that are already the numeric code 0 or 1 are passed through, so
        encoding is idempotent. '男' and 'M' map to 0; anything else maps to 1.
        """
        if not isinstance(value, str) and value in (0, 1):
            return int(value)
        return 0 if value in ['男', 'M'] else 1

    def _preprocess_input(self, new_patient):
        """Encode and standardise one raw query patient without modifying it.

        Returns:
            pandas.DataFrame: Single-row frame whose feature columns have been
            transformed by ``self.scaler``.

        Raises:
            KeyError: If '性别' or a feature column is missing.
        """
        # Copy first: overwriting the caller's dict would make a second search
        # with the same dict see the stored code instead of 'M'/'男'.
        patient = dict(new_patient)
        patient['性别'] = self._encode_gender(patient['性别'])
        input_df = pd.DataFrame([patient])
        input_df[self.numeric_cols] = self.scaler.transform(input_df[self.numeric_cols])
        return input_df

    def _rank_neighbours(self, query, candidates, n):
        """Return the ``n`` rows of ``candidates`` nearest to the scaled ``query``.

        The result is a new frame with an added 'distance' column (Euclidean
        distance over ``self.feature_cols``), sorted ascending.
        """
        features = candidates[self.feature_cols].to_numpy(dtype=float)
        distances = np.linalg.norm(features - np.asarray(query, dtype=float), axis=1)
        return candidates.assign(distance=distances).sort_values('distance').head(n)

    def _vote(self, neighbours):
        """Return ``(majority_label, distance_weighted_label)`` for ``neighbours``.

        Both are ``None`` when ``neighbours`` is empty.
        """
        labels = neighbours[self.target_col]
        if labels.empty:
            return None, None

        # Method 1: plain majority vote
        majority = Counter(labels).most_common(1)[0][0]

        # Method 2: vote weighted by inverse distance
        safe_distances = np.maximum(neighbours['distance'].to_numpy(dtype=float), self._MIN_DISTANCE)
        weighted_counts = Counter()
        for lens, weight in zip(labels, 1.0 / safe_distances):
            weighted_counts[lens] += weight
        return majority, weighted_counts.most_common(1)[0][0]

    def find_similar(self, new_patient, n=3):
        """Find the ``n`` most similar patients and predict lens parameters.

        Args:
            new_patient: Mapping of raw (unscaled) feature values. It may also
                carry the true lens parameters under '镜片参数'.
            n: Number of neighbours to use.

        Returns:
            tuple: ``(similar_patients, predicted_lens, predicted_weighted_lens,
            top_k_accuracy)``. ``similar_patients`` holds the feature columns,
            the lens parameters and 'distance'. ``top_k_accuracy`` is 1 when
            the true lens parameters supplied in ``new_patient`` occur among
            the neighbours and 0 otherwise; when no true value is supplied it
            is checked against ``predicted_lens`` as before, which is 1
            whenever at least one neighbour exists. ``self.df`` is not
            modified.
        """
        input_df = self._preprocess_input(new_patient)
        query = input_df[self.feature_cols].to_numpy(dtype=float)[0]

        similar_patients = self._rank_neighbours(query, self.df, n)
        predicted_lens, predicted_weighted_lens = self._vote(similar_patients)

        # Method 3: top-k hit. Checking the majority label against the very
        # neighbours it was drawn from says nothing, so prefer the true label
        # whenever the caller provides one.
        if self.target_col in new_patient:
            reference = new_patient[self.target_col]
        else:
            reference = predicted_lens
        top_k_accuracy = 1 if reference in similar_patients[self.target_col].values else 0

        return similar_patients[[*self.feature_cols, self.target_col, 'distance']], predicted_lens, predicted_weighted_lens, top_k_accuracy

    def _leave_one_out_predictions(self, n):
        """Predict every stored patient from all the other stored patients.

        Rows of ``self.df`` are used as they are: they are compared in the
        same (standardised) space they are stored in and are not sent through
        ``_preprocess_input`` again.

        Returns:
            tuple: ``(true_labels, majority_predictions, weighted_predictions)``
            as three lists in row order.
        """
        total = len(self.df)
        features = self.df[self.feature_cols].to_numpy(dtype=float)
        targets = self.df[self.target_col].tolist()
        positions = np.arange(total)

        majority_predictions = []
        weighted_predictions = []
        for pos in range(total):
            # Hold out exactly one row by position (robust to any index)
            candidates = self.df[positions != pos]
            neighbours = self._rank_neighbours(features[pos], candidates, n)
            majority, weighted = self._vote(neighbours)
            majority_predictions.append(majority)
            weighted_predictions.append(weighted)
        return targets, majority_predictions, weighted_predictions

    def evaluate_accuracy(self, n=3):
        """Leave-one-out evaluation of the lens-parameter prediction.

        A prediction counts as correct only if the predicted lens-parameter
        string equals the stored one exactly. Note that the scaler was fitted
        on the whole database, held-out row included.

        Args:
            n: Number of neighbours used for each prediction.

        Returns:
            tuple: ``(overall_accuracy, weighted_accuracy, f1)`` - accuracy of
            the majority vote, accuracy of the distance-weighted vote, and the
            weighted F1 score of the majority vote.

        Raises:
            ValueError: If the database has fewer than two rows.
        """
        total_cases = len(self.df)
        if total_cases < 2:
            raise ValueError("leave-one-out evaluation needs at least two patients")

        true_labels, predicted_labels, weighted_labels = self._leave_one_out_predictions(n)

        correct_predictions = sum(
            1 for truth, guess in zip(true_labels, predicted_labels) if guess == truth
        )
        correct_weighted_predictions = sum(
            1 for truth, guess in zip(true_labels, weighted_labels) if guess == truth
        )

        overall_accuracy = correct_predictions / total_cases
        weighted_accuracy = correct_weighted_predictions / total_cases
        f1 = f1_score(true_labels, predicted_labels, average='weighted')

        return overall_accuracy, weighted_accuracy, f1

# 使用示例
if __name__ == "__main__":
    # Standardised database produced by preprocess_database above
    database_path = '/Users/txh/Desktop/research/lwk/ok/Data_processed/processed_data.csv'
    searcher = PatientSearch(database_path, scaler, numeric_cols)

    # Raw (unscaled) measurements of an example patient
    new_patient = {
        '年龄': 8, '性别': 'M',
        '角膜曲率ODK1': 40.95, '角膜曲率ODK2': 41.95,
        '角膜曲率OSK1': 41.12, '角膜曲率OSK2': 42.6,
        '眼轴OD': 24.8, '眼轴OS': 24.7,
        '视力OD': 0.8, '视力OS': 0.85,
    }

    # Nearest neighbours plus the two voting-based predictions
    (
        results,
        predicted_lens,
        predicted_weighted_lens,
        top_k_acc,
    ) = searcher.find_similar(new_patient)

    print("最相似的3个患者：", results, sep="\n")
    print(
        f"\n预测镜片参数（普通众数）：{predicted_lens}",
        f"预测镜片参数（加权投票）：{predicted_weighted_lens}",
        f"Top-K 命中率：{top_k_acc:.2f}",
        sep="\n",
    )

    # Leave-one-out quality figures over the whole database
    overall_acc, weighted_acc, f1 = searcher.evaluate_accuracy()
    print(
        f"普通众数准确率：{overall_acc:.2f}",
        f"加权投票准确率：{weighted_acc:.2f}",
        f"F1 分数：{f1:.2f}",
        sep="\n",
    )



最相似的3个患者：
          性别        年龄      眼轴OD      眼轴OS  角膜曲率ODK1  角膜曲率ODK2  角膜曲率OSK1  \
0  -1.137593 -0.848705  0.490888  0.342411 -1.438627 -1.379425 -1.292703   
15 -1.137593 -0.898740  0.851873  0.672430 -0.975526 -1.048676 -1.341937   
1  -1.137593 -0.859427  1.138172  0.884586 -1.203514 -1.035955 -1.011367   

    角膜曲率OSK2      视力OD      视力OS                  镜片参数  distance  
0  -0.930676  1.678682  1.759549      42.40/-3.00/10.8  0.229938  
15 -1.314732  1.678682  0.871233  41.00/-3.25/10.8/5.8  1.428331  
1  -0.747792  0.295543  0.871233      41.75/-6.00/10.7  2.062074  

预测镜片参数（普通众数）：42.40/-3.00/10.8
预测镜片参数（加权投票）：42.40/-3.00/10.8
Top-K 命中率：1.00
普通众数准确率：0.00
加权投票准确率：0.00
F1 分数：0.00
