Предсказание риска возникновения сердечных заболеваний

В данной работе представлена модель предсказания риска возникновения сердечных заболеваний.
Предсказание основывается на следующих показателях:

'age' - возраст
'gender' - пол
'height' - рост
'weight' - вес
'ap_hi' - верхнее давление
'ap_lo' - нижнее давление
'cholesterol' - холестерин
'gluc' - глюкоза
'smoke' - курение
'alco' - алкоголь
'active' - занятия спортом

In [32]:
import pandas as pd
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from pickle import dump, load

In [7]:
# Load the training set. A failed read is reported and then re-raised: every
# later cell needs df_train, so swallowing the error would only postpone the
# failure (or silently reuse a stale df_train left over from an earlier run).
try:
    df_train = pd.read_csv('train.csv')
except (OSError, ValueError):
    # OSError covers a missing or unreadable file; ValueError covers the
    # pandas parser errors (ParserError, EmptyDataError) and decoding errors.
    print('При чтение файла возникал ошибка')
    raise

In [8]:
#pd.set_option('display.max_rows', None)

Применим ProfileReport для изучения датасета

In [9]:
#profile = ProfileReport(df_train)
#profile

Удалим столбец 'id'

In [10]:
# The row identifier is dropped before any further processing.
df_train = df_train.drop(columns='id')

Функция для удаления выбросов

In [11]:
def remove_outliers(df, column_name, lb, ub, imp):
    """Keep only the rows whose ``column_name`` value lies inside a quantile fence.

    The fence is built from two quantiles of the column: the spread between
    them is scaled by ``imp`` and added on both sides.

    Args:
        df: DataFrame to filter; it is not modified.
        column_name: Column whose values are checked.
        lb: Lower quantile level passed to ``Series.quantile``.
        ub: Upper quantile level passed to ``Series.quantile``.
        imp: Multiplier applied to the inter-quantile spread.

    Returns:
        A tuple ``(filtered_df, lower_bound, upper_bound)``. Rows with a value
        equal to either bound are kept.
    """
    values = df[column_name]
    low_q, high_q = values.quantile(lb), values.quantile(ub)
    margin = imp * (high_q - low_q)

    lower_bound, upper_bound = low_q - margin, high_q + margin

    # between() is inclusive on both ends by default.
    inside_fence = values.between(lower_bound, upper_bound)
    return df[inside_fence], lower_bound, upper_bound

Параметры функции подбирались индивидуально для каждого столбца так, чтобы показатель roc_auc на валидационной выборке был максимальным

In [12]:
# Filter height, then weight, keeping each fence so the same bounds can be
# reused on the test set.
df_train, lb_height, ub_height = remove_outliers(
    df_train, 'height', lb=0.25, ub=0.75, imp=3
)
df_train, lb_weight, ub_weight = remove_outliers(
    df_train, 'weight', lb=0.25, ub=0.75, imp=2
)

Обработаем значения ap_hi и ap_lo

In [13]:
# Clean the blood-pressure columns with a fixed sequence of rules. Later rules
# see the values written by earlier ones, so the order of the lines matters.

# Negative readings are replaced by their absolute value.
df_train.loc[df_train['ap_hi'] < 0, 'ap_hi']  = -df_train['ap_hi']
df_train.loc[df_train['ap_lo'] < 0, 'ap_lo']  = -df_train['ap_lo']

# Helper column: ap_hi minus ap_lo. It is computed once here and not refreshed
# until just before the final swap, so every `dif < 0` test below uses the
# difference as it was at this point (ap_lo greater than ap_hi).
df_train['dif'] = df_train['ap_hi'] - df_train['ap_lo']

# ap_lo rescaling, only for rows with dif < 0:
# [1000, 2000) -> /10, [5000, 10000) -> /100, [500, 1000) -> /10.
df_train.loc[(df_train['ap_lo'] >= 1000) & (df_train['ap_lo'] <2000) & (df_train['dif'] < 0), 'ap_lo']  = (df_train['ap_lo'] / 10).round(0)
df_train.loc[(df_train['ap_lo'] >= 5000) & (df_train['ap_lo'] <10000) & (df_train['dif'] < 0), 'ap_lo']  = (df_train['ap_lo'] / 100).round(0)
df_train.loc[(df_train['ap_lo'] >= 500) & (df_train['ap_lo'] <1000) & (df_train['dif'] < 0), 'ap_lo']  = (df_train['ap_lo'] / 10).round(0)
# ap_hi rescaling, only for rows with dif < 0: [10, 20) -> *10, exactly 20 -> *6.
df_train.loc[(df_train['ap_hi'] >= 10) & (df_train['ap_hi'] <20) & (df_train['dif'] < 0), 'ap_hi']  = (df_train['ap_hi'] * 10).round(0)
# NOTE(review): the `== 20` test is written twice; the duplicate has no effect.
df_train.loc[(df_train['ap_hi'] == 20) & (df_train['ap_hi'] == 20) & (df_train['dif'] < 0), 'ap_hi']  = (df_train['ap_hi'] * 6).round(0)
# Unconditional ap_hi rescaling, applied in this order:
# >= 10000 -> /100, >= 1000 -> /10, >= 700 -> /10, < 60 -> *10.
df_train.loc[(df_train['ap_hi'] >= 10000), 'ap_hi']  = (df_train['ap_hi'] / 100).round(0)
df_train.loc[(df_train['ap_hi'] >= 1000), 'ap_hi']  = (df_train['ap_hi'] / 10).round(0)
df_train.loc[(df_train['ap_hi'] >= 700), 'ap_hi']  = (df_train['ap_hi'] / 10).round(0)
df_train.loc[(df_train['ap_hi'] < 60), 'ap_hi']  = (df_train['ap_hi'] * 10).round(0)
# Unconditional ap_lo rescaling, applied in this order:
# == 1 -> *100, <= 10 -> *10, >= 10000 -> /100.
df_train.loc[(df_train['ap_lo'] == 1), 'ap_lo']  = (df_train['ap_lo'] * 100).round(0)
df_train.loc[(df_train['ap_lo'] <= 10), 'ap_lo']  = (df_train['ap_lo'] * 10).round(0)
df_train.loc[(df_train['ap_lo'] >= 10000), 'ap_lo']  = (df_train['ap_lo'] / 100).round(0)
# ap_hi values that are exactly 10 after the steps above are multiplied by 10.
df_train.loc[(df_train['ap_hi'] == 10), 'ap_hi']  = (df_train['ap_hi'] * 10).round(0)
# Refresh the difference with the corrected values, then swap ap_hi and ap_lo
# on the rows where ap_hi is still the smaller one. Both right-hand sides are
# evaluated before either assignment happens.
df_train['dif'] = df_train['ap_hi'] - df_train['ap_lo']
df_train.loc[(df_train['dif'] <0), 'ap_hi'], df_train.loc[(df_train['dif'] <0), 'ap_lo'] = df_train.loc[(df_train['dif'] <0), 'ap_lo'], df_train.loc[(df_train['dif'] <0), 'ap_hi']

# The helper column is not a model feature.
df_train = df_train.drop('dif', axis = 1)

Удалим выбросы

In [14]:
# Filter the corrected pressure columns and keep the fences for the test set.
# Note that ap_hi uses the 0.2 quantile as its lower level, not 0.25.
df_train, lb_aphi, ub_aphi = remove_outliers(
    df_train, 'ap_hi', lb=0.2, ub=0.75, imp=3
)
df_train, lb_aplo, ub_aplo = remove_outliers(
    df_train, 'ap_lo', lb=0.25, ub=0.75, imp=2
)

Запомним средние значения, чтобы заменить ими выбросы в тестовой выборке

In [15]:
# Per-column means of the cleaned training data, computed one column at a time.
mean_weight, mean_height, mean_aphi, mean_aplo = (
    df_train[column].mean()
    for column in ('weight', 'height', 'ap_hi', 'ap_lo')
)

Была предпринята попытка добавить новые признаки, но на валидационной выборке не удалось достичь хорошего результата

In [16]:
#df_train['imt'] = df_train['weight'] / (df_train['height'] * df_train['height'] / 10000)
#df_train['aphi_calc'] = 109 + 0.5 * df_train['age']/365 + 0.1 * df_train['weight'] - df_train['ap_hi'] 
#df_train['aplo_calc'] = 63 + 0.1 * df_train['age']/365 + 0.15 * df_train['weight'] - df_train['ap_lo'] 
#df_train = df_train.drop('ap_hi', axis = 1)
#df_train = df_train.drop('ap_lo', axis = 1)
#df_train = df_train.drop('age', axis = 1)
#df_train = df_train.drop('weight', axis = 1)
#df_train = df_train.drop('height', axis = 1)

In [17]:
# 'cardio' is the label; every remaining column is a feature.
target_train = df_train['cardio']
features_train = df_train.drop(columns=['cardio'])

In [18]:
# Hold out 25% of the rows for validation, stratified by the label, with a
# fixed seed so the split is reproducible.
split_parts = train_test_split(
    features_train,
    target_train,
    test_size=0.25,
    stratify=target_train,
    random_state=12345,
)
features_train, features_valid, target_train, target_valid = split_parts

Была предпринята попытка применить скалирование и метод главных компонент, но на валидационной выборке это не дало хорошего результата

In [19]:
#scaler = StandardScaler()
#cols_to_scale = ['age', 'imt', 'ap_lo', 'ap_hi']
#features_train_pca = features_train
#features_valid_pca = features_valid
#features_train_pca[cols_to_scale] = scaler.fit_transform(features_train_pca[cols_to_scale])
#features_valid_pca[cols_to_scale] = scaler.transform(features_valid_pca[cols_to_scale])

#pca = PCA(n_components=8, random_state=12345)
#pca.fit(features_train_pca)
#features_train_pca = pca.transform(features_train_pca)
#features_valid_pca = pca.transform(features_valid_pca)

С помощью RandomizedSearchCV и GridSearchCV были найдены наилучшие параметры

In [20]:
#param_dist = {
#    "n_estimators": range(1,1001,50),
#    "max_depth": range(1,101,10),
#    "min_samples_leaf": range(1,101,10),
#    "random_state": [12345],
#    "min_samples_split": [2],
#    "max_features": ['sqrt'],
#    "class_weight": ['balanced'],
#    "bootstrap": [True],
#    "criterion": ['entropy']
#}

#rfc = RandomForestClassifier()
#random_search = RandomizedSearchCV(rfc, param_distributions=param_dist, n_iter=25, cv=3, n_jobs=-1, scoring='roc_auc', random_state=12345, refit='roc_auc')

#random_search.fit(features_train, target_train)
#y_pred = random_search.predict_proba(features_valid)[:, 1]

#roc_auc = roc_auc_score(target_valid, y_pred)
#print("ROC AUC на валидации:",roc_auc)
#print("Лучшие гиперпараметры:", random_search.best_params_)

In [21]:
#rfc = RandomForestClassifier()

# Задаем диапазон значений параметров для поиска
#param_grid = {
#    'criterion': ['entropy'],
#    'n_estimators': [851],
#    'max_depth': range(35,47,2),
#    'min_samples_split': [2],
#    'min_samples_leaf':  [45],
#    'max_features': ['sqrt'],
#    'bootstrap': [True],
#    'max_leaf_nodes': [2,10,20],
#    'class_weight': ['balanced', None],
#    'random_state': [12345]
#}

# Выполняем поиск наилучших параметров
#grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='roc_auc')
       
#grid_search.fit(features_train, target_train)

#print("Best parameters: ", grid_search.best_params_)
#print("Test score: ", grid_search.score(features_valid, target_valid))

Вручную доопределим наилучшие параметры и подтвердим их на валидационной выборке

In [22]:
# Final random-forest configuration, collected in one place.
rfc_params = {
    'criterion': 'entropy',
    'n_estimators': 401,
    'max_depth': 21,
    'min_samples_leaf': 41,
    'min_samples_split': 2,
    'max_features': 'sqrt',
    'class_weight': 'balanced',
    'bootstrap': True,
    'random_state': 12345,
}
rfc_model = RandomForestClassifier(**rfc_params)
rfc_model.fit(features_train, target_train)

# Alternative check, left disabled: mean 5-fold cross-validated ROC AUC.
#result = cross_val_score(rfc_model, features_train, target_train, cv=5, scoring='roc_auc').mean() 
#result

# ROC AUC on the hold-out part, scored on the probability of class 1.
valid_scores = rfc_model.predict_proba(features_valid)[:, 1]
roc_auc_score(target_valid, valid_scores)


0.8090073983535903

Сохраним модель, чтобы затем использовать её в Streamlit

In [33]:
#with open("model.plc", "wb") as fid:
#    dump(rfc_model, fid)

Обработаем тестовые данные

In [23]:
# Load the test set. A failed read is reported and then re-raised: the cells
# below need df_test, so swallowing the error would only postpone the failure
# (or silently reuse a stale df_test left over from an earlier run).
try:
    df_test = pd.read_csv('test.csv')
except (OSError, ValueError):
    # OSError covers a missing or unreadable file; ValueError covers the
    # pandas parser errors (ParserError, EmptyDataError) and decoding errors.
    print('При чтение файла возникал ошибка')
    raise

Функция замены выбросов в тестовой выборке на средние значения, полученные на обучающей выборке

In [24]:
def replace_outliers_with_mean(df, column_name, lower_bound, upper_bound, mean):
    """Return a copy of ``df`` with out-of-range values in one column replaced.

    Args:
        df: Source DataFrame; it is not modified.
        column_name: Column to check.
        lower_bound: Values strictly below this are replaced.
        upper_bound: Values strictly above this are replaced.
        mean: Replacement value written into the out-of-range cells.

    Returns:
        A new DataFrame. Values equal to a bound, and missing values, are left
        as they are. An integer column that receives a replacement is returned
        as float64.
    """
    df_cleaned = df.copy()
    column = df_cleaned[column_name]

    outliers = (column < lower_bound) | (column > upper_bound)

    if outliers.any():
        # Writing a float into an integer column through .loc relies on silent
        # upcasting, which pandas has deprecated; convert the column first so
        # the assignment below never has to change dtype.
        if column.dtype.kind in 'iu':
            df_cleaned[column_name] = column.astype('float64')
        df_cleaned.loc[outliers, column_name] = mean
    return df_cleaned

In [25]:
# Replace out-of-range height and weight values in the test set, using the
# bounds and mean learned from the training set for that same column. The
# weight column must be checked against the weight fence and filled with the
# weight mean, not the height ones.
df_test = replace_outliers_with_mean(df_test, 'height', lb_height, ub_height, mean_height)
df_test = replace_outliers_with_mean(df_test, 'weight', lb_weight, ub_weight, mean_weight)

In [26]:
# Clean the blood-pressure columns of the test set with a fixed sequence of
# rules. Later rules see the values written by earlier ones, so the order of
# the lines matters.

# Negative readings are replaced by their absolute value.
df_test.loc[df_test['ap_hi'] < 0, 'ap_hi']  = -df_test['ap_hi']
df_test.loc[df_test['ap_lo'] < 0, 'ap_lo']  = -df_test['ap_lo']
# Helper column: ap_hi minus ap_lo. It is computed once here and not refreshed
# until just before the final swap, so every `dif < 0` test below uses the
# difference as it was at this point (ap_lo greater than ap_hi).
df_test['dif'] = df_test['ap_hi'] - df_test['ap_lo']
# ap_lo rescaling, only for rows with dif < 0:
# [1000, 2000) -> /10, [5000, 10000) -> /100, [500, 1000) -> /10.
df_test.loc[(df_test['ap_lo'] >= 1000) & (df_test['ap_lo'] <2000) & (df_test['dif'] < 0), 'ap_lo']  = (df_test['ap_lo'] / 10).round(0)
df_test.loc[(df_test['ap_lo'] >= 5000) & (df_test['ap_lo'] <10000) & (df_test['dif'] < 0), 'ap_lo']  = (df_test['ap_lo'] / 100).round(0)
df_test.loc[(df_test['ap_lo'] >= 500) & (df_test['ap_lo'] <1000) & (df_test['dif'] < 0), 'ap_lo']  = (df_test['ap_lo'] / 10).round(0)
# ap_hi rescaling, only for rows with dif < 0: [10, 20) -> *10, exactly 20 -> *6.
df_test.loc[(df_test['ap_hi'] >= 10) & (df_test['ap_hi'] <20) & (df_test['dif'] < 0), 'ap_hi']  = (df_test['ap_hi'] * 10).round(0)
# NOTE(review): the `== 20` test is written twice; the duplicate has no effect.
df_test.loc[(df_test['ap_hi'] == 20) & (df_test['ap_hi'] == 20) & (df_test['dif'] < 0), 'ap_hi']  = (df_test['ap_hi'] * 6).round(0)
# Unconditional ap_hi rescaling, applied in this order:
# >= 10000 -> /100, >= 1000 -> /10, >= 700 -> /10, < 60 -> *10.
df_test.loc[(df_test['ap_hi'] >= 10000), 'ap_hi']  = (df_test['ap_hi'] / 100).round(0)
df_test.loc[(df_test['ap_hi'] >= 1000), 'ap_hi']  = (df_test['ap_hi'] / 10).round(0)
df_test.loc[(df_test['ap_hi'] >= 700), 'ap_hi']  = (df_test['ap_hi'] / 10).round(0)
df_test.loc[(df_test['ap_hi'] < 60), 'ap_hi']  = (df_test['ap_hi'] * 10).round(0)
# Unconditional ap_lo rescaling, applied in this order:
# == 1 -> *100, <= 10 -> *10, >= 10000 -> /100.
df_test.loc[(df_test['ap_lo'] == 1), 'ap_lo']  = (df_test['ap_lo'] * 100).round(0)
df_test.loc[(df_test['ap_lo'] <= 10), 'ap_lo']  = (df_test['ap_lo'] * 10).round(0)
df_test.loc[(df_test['ap_lo'] >= 10000), 'ap_lo']  = (df_test['ap_lo'] / 100).round(0)
# ap_hi values that are exactly 10 after the steps above are multiplied by 10.
df_test.loc[(df_test['ap_hi'] == 10), 'ap_hi']  = (df_test['ap_hi'] * 10).round(0)
# Refresh the difference with the corrected values, then swap ap_hi and ap_lo
# on the rows where ap_hi is still the smaller one. Both right-hand sides are
# evaluated before either assignment happens.
df_test['dif'] = df_test['ap_hi'] - df_test['ap_lo']
df_test.loc[(df_test['dif'] <0), 'ap_hi'], df_test.loc[(df_test['dif'] <0), 'ap_lo'] = df_test.loc[(df_test['dif'] <0), 'ap_lo'], df_test.loc[(df_test['dif'] <0), 'ap_hi']
# The helper column is not a model feature.
df_test = df_test.drop('dif', axis = 1)

In [27]:
# Replace out-of-range pressure values in the test set, using the fences and
# means taken from the training data.
df_test = replace_outliers_with_mean(
    df_test, 'ap_hi', lower_bound=lb_aphi, upper_bound=ub_aphi, mean=mean_aphi
)
df_test = replace_outliers_with_mean(
    df_test, 'ap_lo', lower_bound=lb_aplo, upper_bound=ub_aplo, mean=mean_aplo
)

In [28]:
# Model input for the test set: everything except the row identifier, which
# stays in df_test for the submission file.
features_test = df_test.drop(columns='id')

In [29]:
# Class probabilities for the test set; column 1 is the probability of class 1.
# (The "_valid" suffix is historical: these are test-set values.)
probabilities_valid = rfc_model.predict_proba(features_test)
probabilities_one_valid = probabilities_valid[:, 1]

# Hard class labels, kept alongside the probabilities.
predictions = rfc_model.predict(features_test)

In [30]:
# Submission table: the test-set ids paired with the predicted probability.
result = df_test[['id']].assign(cardio=probabilities_one_valid)

In [31]:
# Write the submission without the DataFrame index column.
result.to_csv(path_or_buf='sample_submission.csv', index=False)