In [134]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sweetviz as sv
import pandas.util.testing as tm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


%matplotlib inline

In [93]:
import warnings

# Suppress every warning for the rest of the kernel session; this is a
# process-wide filter, so it also hides warnings raised by later cells.
warnings.filterwarnings(action="ignore")

In [37]:
# Data preprocessing
def preprocess_1(sr_data, reference=None):
    """Standardize numeric columns and one-hot encode categorical columns.

    Columns are classified by their *position* in ``sr_data``:

    * positions 0, 3, 4, 7, 9   -> z-score standardized (pandas sample std),
    * positions 2, 6, 10, 11, 12 -> one-hot encoded with ``pd.get_dummies``
      (new columns are named ``<column>_<value>``),
    * every other position (including a trailing target column, if present)
      is passed through unchanged.

    Parameters
    ----------
    sr_data : pandas.DataFrame
        Frame to transform; it is not modified.
    reference : pandas.DataFrame, optional
        Frame that supplies the preprocessing statistics, looked up by column
        name. When given, the mean/std used for standardization and the set
        of one-hot categories are taken from ``reference`` instead of
        ``sr_data``. Passing the training frame here while transforming the
        test frame yields the same feature columns, on the same scale, as
        ``preprocess_1(train)``. Values absent from ``reference`` are encoded
        as all-zero dummies. With the default ``None`` the statistics come
        from ``sr_data`` itself.

    Returns
    -------
    pandas.DataFrame
        Transformed frame, columns in the original order with each one-hot
        column expanded in place.
    """
    normal = {0, 3, 4, 7, 9}          # positions to standardize
    one_hot = {2, 6, 10, 11, 12}      # positions to one-hot encode

    stats_source = sr_data if reference is None else reference

    pieces = []
    for position, column in enumerate(sr_data.keys()):
        values = sr_data[column]
        if position in normal:
            base = stats_source[column]
            pieces.append(((values - base.mean()) / base.std()).rename(column))
        elif position in one_hot:
            if reference is not None:
                # Fix the category set so unobserved levels still get a column.
                levels = sorted(reference[column].dropna().unique())
                values = values.astype(pd.CategoricalDtype(levels))
            pieces.append(pd.get_dummies(values, prefix=column))
        else:
            pieces.append(values)

    if not pieces:
        return pd.DataFrame(index=sr_data.index)
    # One concat at the end instead of re-copying the frame for every column.
    return pd.concat(pieces, axis=1)

def preprocess_2(sr_data, mode='train', scaler=None):
    """Standardize every feature column of ``sr_data``.

    Parameters
    ----------
    sr_data : pandas.DataFrame
        In ``'train'`` mode the last column is treated as the target and all
        preceding columns as features. In any other mode the whole frame is
        treated as features.
    mode : str, default 'train'
        ``'train'`` splits off the target; every other value is handled as
        test data.
    scaler : object with ``fit_transform`` / ``transform``, optional
        Scaler to reuse across calls. In ``'train'`` mode it is fitted on the
        features; in test mode its already-fitted statistics are applied with
        ``transform``, so train and test end up on the same scale. When
        omitted, a fresh ``StandardScaler`` is fitted on whatever data is
        passed in (the original behaviour, including for test data).

    Returns
    -------
    tuple
        ``(rescaled_X, Y)`` where ``rescaled_X`` is the scaler output and
        ``Y`` is the target as a NumPy array, or ``None`` outside train mode.
    """
    if mode == 'train':
        X = sr_data.iloc[:, :-1]
        Y = sr_data.iloc[:, -1].to_numpy()
        fitted = StandardScaler() if scaler is None else scaler
        return fitted.fit_transform(X), Y

    X = sr_data
    if scaler is None:
        # Legacy path: statistics are estimated from the test data itself.
        return StandardScaler().fit_transform(X), None
    return scaler.transform(X), None


In [76]:
# Baseline: scikit-learn logistic regression trained on ./data/train.csv and
# applied to ./data/test.csv. Set SK_LR to True to rerun it and rewrite
# ./data/result_1.csv.
SK_LR = False

if SK_LR:
    train_data = pd.read_csv('./data/train.csv', index_col='id')
    test_data = pd.read_csv('./data/test.csv', index_col='id')
    # Optional EDA reports: sv.analyze(train_data).show_html() and
    # sv.analyze(test_data).show_html().

    # True  -> preprocess_1 (standardize some columns, one-hot encode others)
    # False -> preprocess_2 (standardize every feature column)
    one_hot = False
    if one_hot:
        train_encoded = preprocess_1(train_data)
        # Explicit float conversion: on pandas >= 2 get_dummies returns bool
        # columns, and `.values` on a mixed float/bool frame would produce an
        # object array that scikit-learn cannot fit on.
        Data_1 = train_encoded.to_numpy(dtype=float)
        X_train_1 = Data_1[:, :-1]
        Y_train_1 = Data_1[:, -1].reshape(-1, 1)

        # NOTE(review): tol=10 is a very loose stopping tolerance for lbfgs -
        # confirm it is intentional.
        sk_lr = LogisticRegression(penalty='l2', tol=10, solver='lbfgs', max_iter=9000)
        sk_lr.fit(X_train_1, Y_train_1.T[0])
        print('系数为：\n', sk_lr.coef_, '\n', sk_lr.intercept_)

        # The test set is small, so some categories never occur and their
        # dummy columns are missing. Align to the training feature columns by
        # name (absent columns are filled with 0) instead of inserting them at
        # hard-coded positions.
        feature_columns = train_encoded.columns[:-1]
        tmp_df = preprocess_1(test_data).reindex(columns=feature_columns, fill_value=0)
        X_test = tmp_df.to_numpy(dtype=float)

    else:
        X_train_2, Y_train_2 = preprocess_2(train_data, 'train')

        sk_lr = LogisticRegression(penalty='l2', tol=10, solver='lbfgs', max_iter=9000)
        sk_lr.fit(X_train_2, Y_train_2)
        print('系数为：\n', sk_lr.coef_, '\n', sk_lr.intercept_)

        # NOTE(review): in 'test' mode preprocess_2 fits a new scaler on the
        # test data, so train and test are standardized with different
        # statistics.
        X_test, _ = preprocess_2(test_data, 'test')

    pre = sk_lr.predict(X_test)

    # The row labels written to the CSV are the positional 0..n-1 index of
    # this frame, not the `id` values of the test file.
    pre = pd.DataFrame(pre)
    pre = pre.astype('int')
    pre.to_csv('./data/result_1.csv', header=False)

# Experiment notes for this dataset: one-hot encoding vs. standardizing every
# column gave validation scores of 88.5246 and 86.8852 respectively.

In [71]:
# Feature-selection experiment; set Feature_Extraction to True to rerun it
# and rewrite ./data/result_2.csv.
Feature_Extraction = False

if Feature_Extraction:
    train_data = pd.read_csv('./data/train.csv', index_col='id')
    X, Y = train_data.iloc[:, :-1], train_data.iloc[:, -1]

    # Keep the K best-scoring features according to the chi-squared test.
    K = 10
    select_top_4 = SelectKBest(score_func=chi2, k=K)
    fit = select_top_4.fit(X, Y)      # learn per-feature scores against the target
    features = fit.transform(X)       # reduced feature matrix
    # NOTE(review): `features` is not used below; the columns to drop are
    # listed by hand (per the original note they were read off `features`).

    # Columns removed from both train and test: fbs / restecg / thal.
    dropped = ['fbs', 'restecg', 'thal']
    train_data = train_data.drop(columns=dropped)
    X_train_2, Y_train_2 = preprocess_2(train_data, 'train')

    sk_lr = LogisticRegression(penalty='l2', tol=10, solver='lbfgs', max_iter=9000)
    sk_lr.fit(X_train_2, Y_train_2)
    print('系数为：\n', sk_lr.coef_, '\n', sk_lr.intercept_)

    test_data = pd.read_csv('./data/test.csv', index_col='id')
    test_data = test_data.drop(columns=dropped)
    X_test, _ = preprocess_2(test_data, 'test')

    # Predict, cast the labels to int and write them out.
    # header=0 is falsy, so no header row is written.
    pre = pd.DataFrame(sk_lr.predict(X_test)).astype('int')
    pre.to_csv('./data/result_2.csv', header=0)

# Keeping only 10 features, with all data standardized, gave an accuracy of 80.3279.

In [102]:
# Compare several classifiers with 10-fold cross-validation.
models = [
    ("LR", LogisticRegression()),        # logistic regression
    ("NB", GaussianNB()),                # Gaussian naive Bayes
    ("KNN", KNeighborsClassifier()),     # k-nearest neighbours
    ("DT", DecisionTreeClassifier()),    # decision tree
    ("SVM", SVC()),                      # support vector machine
]

train_data = pd.read_csv('./data/train.csv', index_col='id')
train_data = preprocess_1(train_data)
X = train_data.iloc[:, :-1]
Y = train_data.iloc[:, -1]

# One splitter shared by every model so they are scored on identical folds.
# `random_state` is deliberately not passed: without shuffle=True it has no
# effect, and scikit-learn >= 0.24 raises ValueError for that combination.
# The folds are the same contiguous, unshuffled splits as before.
kfold = KFold(n_splits=10)

names = []
results = []
for name, model in models:
    cv_result = cross_val_score(model, X, Y, cv=kfold, scoring="accuracy")
    names.append(name)
    results.append(cv_result)

# Mean accuracy per model.
for name, cv_result in zip(names, results):
    print(name, cv_result.mean())

In [109]:
# Majority vote across the five classifiers.
# Relies on `X` / `Y` (preprocessed training features and target) defined in
# the model-comparison cell above.
test_data = pd.read_csv('./data/test.csv', index_col='id')

# Some categories never occur in the small test set, so their dummy columns
# are missing after preprocess_1. Align to the training feature columns by
# name (absent columns are filled with 0) rather than inserting them at
# hard-coded positions.
tmp_df = preprocess_1(test_data).reindex(columns=X.columns, fill_value=0)
X_test = tmp_df

# Fit each classifier with default hyper-parameters on the full training set.
lr = LogisticRegression().fit(X, Y)
nb = GaussianNB().fit(X, Y)
knn = KNeighborsClassifier().fit(X, Y)
dt = DecisionTreeClassifier().fit(X, Y)
svm = SVC().fit(X, Y)

pre_lr = lr.predict(X_test)
pre_nb = nb.predict(X_test)
pre_knn = knn.predict(X_test)
pre_dt = dt.predict(X_test)
pre_svm = svm.predict(X_test)

# One row per test sample, one column per model. All five models vote;
# previously the SVM prediction was computed but left out, which allowed
# 2-2 ties that were written out as the invalid label 3.
predictions = [pre_lr, pre_nb, pre_knn, pre_dt, pre_svm]
results = np.column_stack(predictions).astype(int)

# With an odd number of voters and two classes there is always a strict majority.
answer = [int(np.argmax(np.bincount(row, minlength=2))) for row in results]

answer = pd.DataFrame(answer)
answer.to_csv('./data/answer.csv', header=False)

In [183]:
# Display the current `train_data` frame. The rendered output below contains
# one-hot columns (cp_0, ca_0, thal_1, ...), i.e. the frame after preprocess_1;
# judging by the execution counts this cell was run after the In [182] cell
# that follows it.
train_data

Unnamed: 0_level_0,age,sex,cp_0,cp_1,cp_2,cp_3,trestbps,chol,fbs,restecg_0,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_1,thal_2,thal_3,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-1.315229,0,0,0,1,0,-0.570353,-0.633078,0,0,...,0,1,0,0,0,0,0,1,0,1
1,1.255553,0,0,0,1,0,0.842187,0.583739,0,1,...,0,0,1,0,0,0,0,1,0,1
2,0.361368,1,0,0,1,0,0.489052,-0.670519,1,1,...,1,1,0,0,0,0,0,1,0,1
3,0.920234,0,1,0,0,0,-0.452641,-0.932603,0,0,...,0,1,0,0,0,0,0,1,0,0
4,0.249595,1,0,1,0,0,1.313034,-0.277393,0,1,...,1,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,-1.315229,1,1,0,0,0,0.018205,0.003411,1,1,...,0,0,0,0,0,1,0,0,1,0
238,-0.085724,1,1,0,0,0,-0.688065,-1.101085,0,0,...,0,0,1,0,0,0,0,0,1,0
239,0.137822,1,0,0,0,1,-0.688065,-1.007484,0,1,...,0,1,0,0,0,0,0,0,1,1
240,-0.868136,1,0,0,1,0,0.371341,0.190614,0,1,...,1,1,0,0,0,0,0,1,0,1


In [182]:
# Retrain logistic regression after removing suspicious rows from the
# training data.
# NOTE(review): no rows are filtered in this cell - presumably the rows were
# removed from ./data/train.csv itself; confirm.
train_data = pd.read_csv('./data/train.csv', index_col='id')
train_data = preprocess_1(train_data)
X_train = train_data.iloc[:, :-1]
Y_train = train_data.iloc[:, -1]

test_data = pd.read_csv('./data/test.csv', index_col='id')
# Align the test features to the training columns by name so the model sees
# the same feature layout even when a category is present in only one of the
# two files; a no-op when the columns already match.
X_test = preprocess_1(test_data).reindex(columns=X_train.columns, fill_value=0)

# NOTE(review): tol=10 is a very loose stopping tolerance for lbfgs - confirm.
sk_lr = LogisticRegression(penalty='l2', tol=10, solver='lbfgs', max_iter=9000)
sk_lr.fit(X_train, Y_train)

pre = sk_lr.predict(X_test)

pre = pd.DataFrame(pre).astype('int')
pre.to_csv('./data/answer_1.csv', header=False)

# Outcome: no effect on the result.