In [1]:
import os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

current_dir = '/Users/4paradigm/workspace/python/MLOpsInAction/mlflow/disease'
file_path=os.path.join(current_dir,'data/heart_disease_uci.csv')

heart_disease = pd.read_csv(file_path)
heart_disease.head(5)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [2]:
# 数据预处理-清除无用
heart_disease.drop(['id', 'dataset'], axis=1, inplace=True)
heart_disease.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [3]:
# 数据预处理-缺失值
heart_disease["trestbps"].fillna(heart_disease["trestbps"].mean(), inplace=True)
heart_disease["chol"].fillna(heart_disease["chol"].mean(), inplace=True)
heart_disease["fbs"].fillna(heart_disease["fbs"].mode()[0], inplace=True)
heart_disease["restecg"].fillna(heart_disease["restecg"].mode()[0], inplace=True)
heart_disease["thalch"].fillna(heart_disease["thalch"].mean(), inplace=True)
heart_disease["exang"].fillna(heart_disease["exang"].mode()[0], inplace=True)
heart_disease["oldpeak"].fillna(heart_disease["oldpeak"].mean(), inplace=True)
heart_disease["slope"].fillna(heart_disease["slope"].mode()[0], inplace=True)
heart_disease["ca"].fillna(heart_disease["ca"].mean(), inplace=True)
heart_disease["thal"].fillna(heart_disease["thal"].mode()[0], inplace=True)

heart_disease.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [5]:
# 数据特征-onehot编码非数值化列
import numpy as np
cat_features = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]

enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(heart_disease[cat_features].values)

matrix = enc.transform(heart_disease[cat_features].values).toarray()
feature_labels = np.array(enc.categories_).ravel()

col_names = []
for col in cat_features:
  for val in heart_disease[col].unique():
    col_names.append("{}_{}".format(col, val))

onehot_pdf=pd.DataFrame(data = matrix, columns=col_names, dtype=int)  
onehot_pdf.head(5)

  feature_labels = np.array(enc.categories_).ravel()


Unnamed: 0,sex_Male,sex_Female,cp_typical angina,cp_asymptomatic,cp_non-anginal,cp_atypical angina,fbs_True,fbs_False,restecg_lv hypertrophy,restecg_normal,restecg_st-t abnormality,exang_False,exang_True,slope_downsloping,slope_flat,slope_upsloping,thal_fixed defect,thal_normal,thal_reversable defect
0,0,1,0,0,0,1,0,1,1,0,0,1,0,1,0,0,1,0,0
1,0,1,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,1,0
2,0,1,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
3,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0
4,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0


In [7]:
# 数据存储
remian_features=['age','trestbps','chol','thalch','oldpeak','ca','num']
remain_pdf=heart_disease[remian_features]
final_pdf=pd.concat([onehot_pdf, remain_pdf], axis=1)

final_pdf.head()
csv_file_path=os.path.join(current_dir,'data/data.csv')
final_pdf.to_csv(csv_file_path)

In [9]:
data=final_pdf.drop('num', axis=1)
label=final_pdf['num']

x=data.values
y=label.values
x.shape,y.shape

((920, 25), (920,))

In [10]:
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

# 数据拆分
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# 模型训练
lin_model = RidgeClassifier()
lin_model.fit(x_train, y_train)

# 模型评估
y_preds = lin_model.predict(x_test)

report=classification_report(y_test, y_preds)



print(report)

              precision    recall  f1-score   support

           0       0.65      0.85      0.74        78
           1       0.40      0.40      0.40        63
           2       0.06      0.05      0.06        20
           3       0.00      0.00      0.00        19
           4       0.00      0.00      0.00         4

    accuracy                           0.50       184
   macro avg       0.22      0.26      0.24       184
weighted avg       0.42      0.50      0.46       184



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
