**Q2 SVM&LDA**

*采用支持向量机对男女生样本数据中的（身高、体重、50m 成绩、肺活量）共 4 个特征进行分类；  
实现 LDA 算法对前述4个特征进行分类，计算模型预测性能（包含 SE、SP、ACC 和 AUC），   
试分析 LDA 算法如果作为降维技术对于各性能指标的影响。*

`1.导入模块`

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn import svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import os

`2.处理数据`

In [None]:
def pre_data(file_path):
    """Load the sample workbook and drop rows with no '肺活量' value.

    Args:
        file_path: Location of the Excel file; handed unchanged to
            ``pd.read_excel``.

    Returns:
        The loaded DataFrame minus every row whose '肺活量' cell is missing.
        The table is also printed as a side effect.
    """
    cleaned = pd.read_excel(file_path).dropna(axis=0, how='any', subset=['肺活量'])
    print(cleaned)
    return cleaned

# Load the cleaned sample table from data/作业数据_2021合成.xls; the path is
# relative, so it is resolved against the current working directory.
data = pre_data(os.path.join('data', '作业数据_2021合成.xls'))

`3.初始化`

In [4]:
def __init__(self):
    """Attach one empty result list per (metric, model) pair to ``self``.

    Creates ``se_*``, ``sp_*``, ``acc_*`` and ``auc_*`` for the ``svm`` model
    first and then for the ``lda`` model; every attribute gets its own list.
    """
    for model in ("svm", "lda"):
        for metric in ("se", "sp", "acc", "auc"):
            setattr(self, f"{metric}_{model}", [])

`4.评估函数`

In [8]:
def eval(y_pred, y_true, label=1, y_score=None):
    """Compute sensitivity, specificity, accuracy and AUC for one class.

    NOTE: this name shadows the builtin ``eval``; it is kept because the
    other cells call it under this name.

    Args:
        y_pred: Predicted class labels.
        y_true: Ground-truth class labels.
        label: Positional index into the confusion matrix's class axis
            (scikit-learn orders the classes by sorted label value), i.e. the
            class treated as "positive" in a one-vs-rest sense.
        y_score: Optional continuous scores (e.g. ``decision_function`` or
            positive-class probabilities). When given, AUC is computed from
            these scores; when omitted, AUC falls back to ``y_pred`` as
            before, which evaluates the ROC curve at a single threshold only.

    Returns:
        A tuple ``(SE, SP, ACC, AUC)`` for the selected class.
    """
    cm = metrics.confusion_matrix(y_true, y_pred).astype(float)
    total = cm.sum()

    # One-vs-rest counts for every class at once.
    tp = np.diag(cm)
    fp = cm.sum(axis=0) - tp   # predicted as the class, but actually another
    fn = cm.sum(axis=1) - tp   # actually the class, but predicted as another
    tn = total - (fp + fn + tp)

    se = tp / (tp + fn)        # sensitivity / recall / true positive rate
    sp = tn / (tn + fp)        # specificity / true negative rate
    acc = (tp + tn) / total

    auc_input = y_pred if y_score is None else y_score
    return se[label], sp[label], acc[label], roc_auc_score(y_true, auc_input)

`5.SVM分类`

In [7]:
def mySVM(self,data):
    """Evaluate a linear-kernel SVM with shuffled 5-fold cross-validation.

    For every fold the classifier is fitted on the four feature columns
    (身高, 体重, 50米成绩, 肺活量) against the 性别男1女0 column, and the fold's
    SE / SP / ACC / AUC (as returned by ``eval``) are appended to the
    ``*_svm`` lists on ``self``.

    Args:
        self: Object carrying the ``se_svm``, ``sp_svm``, ``acc_svm`` and
            ``auc_svm`` lists.
        data: DataFrame containing the feature and label columns.

    Returns:
        The four lists ``(se_svm, sp_svm, acc_svm, auc_svm)``; they keep
        growing across repeated calls because results are appended.
    """
    feature_columns = ["身高", "体重", "50米成绩", "肺活量"]
    target_column = "性别男1女0"

    kf = KFold(n_splits = 5,shuffle = True,random_state = None)
    for train_id, test_id in kf.split(data):
        # Stack the columns as rows, then transpose to (samples, features).
        x_train = np.array([data[col].iloc[train_id] for col in feature_columns],
                           dtype='float32').T
        x_test = np.array([data[col].iloc[test_id] for col in feature_columns],
                          dtype='float32').T
        y_train = np.array(data[target_column].iloc[train_id])
        y_test = np.array(data[target_column].iloc[test_id], dtype='float32')
        print(len(x_train), len(x_test), len(y_train), len(y_test))

        # The estimator must not be bound to the name `svm`: assigning to
        # that name inside the function makes it local and hides the
        # imported sklearn module, raising UnboundLocalError.
        # Probability estimates are never read here, so the extra
        # calibration step is not requested.
        clf = svm.SVC(kernel='linear', random_state=None)
        clf.fit(x_train, y_train)
        svm_pred = clf.predict(x_test)

        se,sp,acc,auc = eval(svm_pred,y_test)
        self.se_svm.append(se)
        self.sp_svm.append(sp)
        self.acc_svm.append(acc)
        self.auc_svm.append(auc)
    return self.se_svm, self.sp_svm, self.acc_svm, self.auc_svm

`6.LDA分类` 

In [6]:
def myLDA(self,data,n):
    """Evaluate an LDA classifier with shuffled 5-fold cross-validation.

    Each fold fits ``LDA(n_components=n)`` on the four feature columns
    (身高, 体重, 50米成绩, 肺活量) against the 性别男1女0 column, scores the
    held-out part with ``eval`` and appends the results to the ``*_lda``
    lists on ``self``.

    Args:
        self: Object carrying the ``se_lda``, ``sp_lda``, ``acc_lda`` and
            ``auc_lda`` lists.
        data: DataFrame containing the feature and label columns.
        n: Forwarded to the estimator as ``n_components``.

    Returns:
        The four lists ``(se_lda, sp_lda, acc_lda, auc_lda)``.
    """
    feature_names = ("身高", "体重", "50米成绩", "肺活量")
    splitter = KFold(n_splits = 5,shuffle = True,random_state = None)

    for train_id, test_id in splitter.split(data):
        # Columns are stacked as rows first, hence the transpose afterwards.
        train_rows = [data[name].iloc[train_id] for name in feature_names]
        test_rows = [data[name].iloc[test_id] for name in feature_names]
        y_train = np.array(data["性别男1女0"].iloc[train_id])
        y_test = np.array(data["性别男1女0"].iloc[test_id])
        x_train = np.array(train_rows).T
        x_test = np.array(test_rows).T
        print(len(x_train), len(x_test), len(y_train), len(y_test))

        model = LDA(n_components=n)
        model.fit(x_train, y_train)
        fold_pred = model.predict(x_test)

        se, sp, acc, auc = eval(fold_pred, y_test)
        self.se_lda.append(se)
        self.sp_lda.append(sp)
        self.acc_lda.append(acc)
        self.auc_lda.append(auc)

    return self.se_lda, self.sp_lda, self.acc_lda, self.auc_lda


`7.可视化`

In [None]:
def vitualize(clf):
    """Placeholder for the visualization step; it currently does nothing.

    Args:
        clf: Accepted but not used.

    Returns:
        None.
    """
    return None

# NOTE(review): `clf` is not assigned in any cell visible in this file, so this
# call presumably raises NameError — confirm where `clf` is meant to come from.
vitualize(clf)