## Чтение данных

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('heart_2022_no_nans.csv')
df.head()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.6,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.7,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No


Обработка данных такая же, как в ноутбуке с отбором фичей, не будем заострять внимание на этом

In [2]:
df = df.drop_duplicates()
df.duplicated().any()

False

In [3]:
df = df.drop(
    columns=[
        'State',
        'PhysicalHealthDays', 
        'MentalHealthDays'
    ]
)

In [4]:
df['GeneralHealth'] = df['GeneralHealth'].map({
    "Poor" : 0,
    "Good" : 1,
    "Very good" : 2,
    "Fair" : 3,
    "Excellent" : 4
})

In [5]:
df['LastCheckupTime'] = df['LastCheckupTime'].map({
    'Within past year (anytime less than 12 months ago)' : 0,
    'Within past 2 years (1 year but less than 2 years ago)' : 1,
    'Within past 5 years (2 years but less than 5 years ago)' : 2,
    '5 or more years ago' : 3
})

In [6]:
df['RemovedTeeth'] = df['RemovedTeeth'].map({
    'None of them' : 0,
    '1 to 5' : 1,
    '6 or more, but not all' : 2,
    'All' : 3
})

In [7]:
df['AgeCategory'] = df['AgeCategory'].map({
    cat : i for i, cat in enumerate(sorted(df['AgeCategory']))
})

In [8]:
numeric_cols = df.select_dtypes(include=['int', 'float']).columns
for col in numeric_cols:
    df[col] = (df[col] - df[col].mean()) / df[col].std()
df.head()

Unnamed: 0,Sex,GeneralHealth,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,HadAngina,HadStroke,HadAsthma,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Female,-0.063069,-0.437113,Yes,1.373423,-0.781148,No,No,No,No,...,-0.985901,-0.560212,-0.104123,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Male,-0.063069,-0.437113,Yes,-0.708901,-0.781148,No,No,No,No,...,0.701798,0.545625,0.2244,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Male,-0.063069,-0.437113,No,0.679315,1.507534,No,No,No,No,...,1.358125,1.183895,0.459278,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Female,0.826852,-0.437113,Yes,1.373423,-0.781148,No,No,No,No,...,-0.04829,0.33318,0.407083,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Female,-0.95299,-0.437113,Yes,-1.403009,0.363193,No,No,No,No,...,-1.454706,-0.198634,0.675735,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [9]:
df = pd.get_dummies(
    df, 
    columns=[
        'ECigaretteUsage',
        'SmokerStatus',
        'RaceEthnicityCategory',
        'TetanusLast10Tdap'
    ], 
    drop_first=True,
    dtype=float
)

In [10]:
for column in df.columns:
    if sorted(df[column].unique()) == ["No", "Yes"]:
        df[column] = df[column].map({
            "Yes" : 1,
            "No" : 0
        })
df['Sex'] = df['Sex'].map({
            "Male" : 1,
            "Female" : 0
        }) 
df['HadDiabetes'] = df['HadDiabetes'].map({
    'No' : 0, 
    'Yes' : 1,
    'Yes, but only during pregnancy (female)' : 1,
    'No, pre-diabetes or borderline diabetes' : 0
})
df.head()

Unnamed: 0,Sex,GeneralHealth,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,HadAngina,HadStroke,HadAsthma,...,SmokerStatus_Current smoker - now smokes some days,SmokerStatus_Former smoker,SmokerStatus_Never smoked,RaceEthnicityCategory_Hispanic,"RaceEthnicityCategory_Multiracial, Non-Hispanic","RaceEthnicityCategory_Other race only, Non-Hispanic","RaceEthnicityCategory_White only, Non-Hispanic","TetanusLast10Tdap_Yes, received Tdap","TetanusLast10Tdap_Yes, received tetanus shot but not sure what type","TetanusLast10Tdap_Yes, received tetanus shot, but not Tdap"
0,0,-0.063069,-0.437113,1,1.373423,-0.781148,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,1,-0.063069,-0.437113,1,-0.708901,-0.781148,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,1,-0.063069,-0.437113,0,0.679315,1.507534,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0,0.826852,-0.437113,1,1.373423,-0.781148,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0,-0.95299,-0.437113,1,-1.403009,0.363193,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [11]:
df['CovidPos'] = df['CovidPos'].map({
    "Yes" : 1,
    "No" : 0,
    "Tested positive using home test without a health professional" : 1
})

Оставим отобранные фичи

In [12]:
import numpy as np
selected_features_catboost = np.array(['Sex', 'GeneralHealth', 'SleepHours', 'RemovedTeeth', 'HadAngina',
       'HadStroke', 'DifficultyWalking', 'ChestScan', 'AgeCategory',
       'HeightInMeters', 'WeightInKilograms', 'BMI'], dtype=object)
selected_features_rf = np.array(['GeneralHealth', 'SleepHours', 'RemovedTeeth', 'HadAngina',
       'HadStroke', 'HadDiabetes', 'DifficultyWalking', 'ChestScan',
       'AgeCategory', 'HeightInMeters', 'WeightInKilograms', 'BMI'],
      dtype=object)
selected_features_lasso = np.array(['Sex', 'HadAngina', 'HadStroke', 'HadDiabetes',
       'DifficultyWalking', 'ChestScan', 'AgeCategory',
       'SmokerStatus_Former smoker', 'SmokerStatus_Never smoked',
       'RaceEthnicityCategory_Hispanic',
       'RaceEthnicityCategory_Multiracial, Non-Hispanic',
       'RaceEthnicityCategory_Other race only, Non-Hispanic'],
      dtype=object)

In [13]:
from copy import deepcopy
df_copy = deepcopy(df)

Обрежем датасет, чтобы избавиться от дисбаланса классов в данных для выбора лучшей модели

In [14]:
condition0 = (df['HadHeartAttack'] == 0)
condition1 = (df['HadHeartAttack'] == 1)
rows_to_drop = len(df[condition0]) - len(df[condition1])
indices_to_drop = df[condition0].sample(n=rows_to_drop).index
df = df.drop(indices_to_drop)

In [15]:
X = df.drop(columns=['HadHeartAttack'])
y = df['HadHeartAttack']

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
logreg = LogisticRegression(max_iter=10000)
logreg_fits = cross_val_score(
    logreg, 
    X[selected_features_lasso], 
    y, 
    scoring='f1', 
    cv=100
)

In [17]:
from catboost import CatBoostClassifier
cb = CatBoostClassifier(verbose=0)
cb_fits = cross_val_score(
    cb, 
    X[selected_features_catboost], 
    y, 
    scoring='f1', 
    cv=100
)

In [18]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_jobs=-1)
clf_fits = cross_val_score(
    clf, 
    X[selected_features_rf],
    y, 
    scoring='f1', 
    cv=100
)

In [19]:
from scipy.stats import ttest_ind
models_results = [logreg_fits, cb_fits, clf_fits]
models = ['LogReg', "CatBoost", "Random forest"]
for i in range(3):
    for j in range(i + 1, 3): 
        pvalue = ttest_ind(models_results[i], models_results[j], alternative='greater', equal_var=False).pvalue
        if pvalue < 0.05:
            print(models[i], ">", models[j], "pvalue:", pvalue)
        else:
            print(models[i], "<", models[j], "pvalue:", pvalue)

LogReg < CatBoost pvalue: 0.9795034036977411
LogReg < Random forest pvalue: 0.24416313288902153
CatBoost > Random forest pvalue: 0.003030180625097528


Catboost лучше всех. Будем подбирать параметры для негo

In [22]:
from sklearn.model_selection import GridSearchCV
params = {
    'iterations': [1000, 2000, 5000],
    'loss_function': ['Logloss', 'CrossEntropy'],
    'learning_rate': [1e-2, 1e-3, 1e-4]
}
grid = GridSearchCV(CatBoostClassifier(verbose=0), params, n_jobs=-1, scoring='f1', cv=3)

In [23]:
grid.fit(X[selected_features_catboost], y)

Вот и лучшая модель. Наконец, искусственно увеличим датасет и сделаем предсказание лучшей моделью на всем множестве 

In [24]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(
    sampling_strategy='auto',  
    random_state=None,        
    k_neighbors=30,    
    n_jobs=-1
)

In [30]:
X, y = smote.fit_resample(
    df_copy[selected_features_catboost], 
    df_copy['HadHeartAttack']
)

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    stratify=y
)

In [32]:
grid.best_params_

{'iterations': 1000, 'learning_rate': 0.01, 'loss_function': 'Logloss'}

In [33]:
clf = grid.best_estimator_
clf.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x74b241393d50>

In [34]:
from sklearn.metrics import classification_report
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.94      0.97      0.96     11629
           1       0.97      0.94      0.96     11629

    accuracy                           0.96     23258
   macro avg       0.96      0.96      0.96     23258
weighted avg       0.96      0.96      0.96     23258

