In [836]:
import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [837]:
df = pd.read_csv("NHANES_age_prediction.csv")
df.sample(5)

Unnamed: 0,SEQN,age_group,RIDAGEYR,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN
1917,82116.0,Adult,18.0,2.0,2.0,46.7,97.0,2.0,155.0,60.07
1089,78514.0,Adult,45.0,1.0,2.0,23.3,90.0,2.0,65.0,4.71
1096,78551.0,Adult,59.0,2.0,1.0,23.1,91.0,2.0,71.0,2.55
1063,78404.0,Adult,44.0,1.0,2.0,34.5,98.0,2.0,84.0,13.42
1969,82345.0,Adult,32.0,1.0,2.0,31.0,100.0,2.0,112.0,19.48


In [838]:
df.rename(columns={"SEQN": "resp_id", "RIDAGEYR": "age",
                   "RIAGENDR": "gender", "PAQ605": "sport_weekly",
                   "BMXBMI": "BMI", "LBXGLU": "glucose",
                   "DIQ010": "diabetic", "LBXGLT": "glucose_tolerance",
                   "LBXIN": "insulin"}, inplace=True)

In [839]:
df.sample(5)

Unnamed: 0,resp_id,age_group,age,gender,sport_weekly,BMI,glucose,diabetic,glucose_tolerance,insulin
498,75866.0,Adult,12.0,1.0,2.0,20.1,103.0,2.0,76.0,7.8
1795,81579.0,Senior,73.0,2.0,2.0,52.2,110.0,2.0,190.0,22.51
1431,80119.0,Senior,80.0,2.0,2.0,33.5,118.0,2.0,130.0,6.83
1477,80278.0,Adult,47.0,1.0,2.0,33.5,101.0,2.0,64.0,8.74
1803,81606.0,Adult,42.0,2.0,2.0,20.3,94.0,2.0,132.0,8.14


* resp_id	- порядковый номер респондента
* age_group	- таргет - возрастная группа: взрослый(<65) / пожилой(>=65)
* age	- возраст
* gender - гендер
* sport_weekly - занимается ли респондент спортом, фитнесом или пр. мероприятиями обычно в течение недели
* BMI	- индекс массы тела
* glucose	- уровень глюкозы в крови натощак
* diabetic - есть ли у респондента диабет
* glucose_tolerance	- результаты теста на толерантность к глюкозе (OGTT)
* insulin - уровень инсулина в крови

Задача: определить возрастную категорию age_group респондента  (взрослый/пожилой) на основании всех признаков, кроме age.

In [840]:
df['age_group'] = df['age_group'].apply(lambda x: 1 if 'Senior' in x else 0)
df['sport_weekly'] = df['sport_weekly'].apply(lambda x: 2 if x>2 else x)
df['diabetic'] = df['diabetic'].apply(lambda x: np.random.choice([1, 2]) if x==3.0 else x)

In [841]:
df['gender'] = df['gender'].astype(int)
df['sport_weekly'] = df['sport_weekly'].astype(int)
df['glucose'] = df['glucose'].astype(int)
df['diabetic'] = df['diabetic'].astype(int)
df['glucose_tolerance'] = df['glucose_tolerance'].astype(int)

In [842]:
df.sample(5)

Unnamed: 0,resp_id,age_group,age,gender,sport_weekly,BMI,glucose,diabetic,glucose_tolerance,insulin
1617,80816.0,0,31.0,2,2,26.0,99,2,77,5.65
2051,82675.0,1,71.0,1,1,23.2,102,2,165,20.98
990,78015.0,0,47.0,1,1,30.4,100,2,117,13.62
1202,79044.0,0,31.0,1,2,27.1,109,2,122,14.91
2217,83440.0,0,13.0,2,2,19.7,92,2,153,11.99


In [843]:
df.shape

(2278, 10)

In [844]:
X = df.drop(['age_group', 'resp_id', 'age'], axis=1)
y = df['age_group']

In [845]:
y.value_counts()

age_group
0    1914
1     364
Name: count, dtype: int64

In [846]:
smote = SMOTE(sampling_strategy='auto', random_state=33)

X, y = smote.fit_resample(X, y)

print(X_train.shape)
print(y_train.shape)

(2871, 7)
(2871,)


In [847]:
y.value_counts()

age_group
0    1914
1    1914
Name: count, dtype: int64

In [848]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33, stratify=y)
#save test
X_test.to_csv("X_test.csv", index=None)
y_test.to_csv("y_test.csv", index=None)
#save train
X_train.to_csv("X_train.csv", index=None)
y_train.to_csv("y_train.csv", index=None)

In [849]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((2871, 7), (2871,), (957, 7), (957,))

In [850]:
X_train.sample(5)

Unnamed: 0,gender,sport_weekly,BMI,glucose,diabetic,glucose_tolerance,insulin
3763,2,2,43.517015,97,2,90,7.608249
287,2,2,28.9,99,2,92,11.85
3741,1,2,24.409663,103,2,151,4.250483
1437,2,2,35.1,105,2,224,15.12
1614,1,2,27.8,101,2,84,9.49


In [851]:
class BinaryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = X.copy()
        X_transformed[self.key] = X_transformed[self.key].replace({1: 1, 2: 0})
        
        return X_transformed

In [852]:
continuos_cols = ['BMI', 'glucose', 'glucose_tolerance', 'insulin']
cat_cols = ['gender', 'sport_weekly', 'diabetic']

In [853]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('binenc', BinaryEncoder(key=cat_col))  
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuos_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)

In [854]:
%%time

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=33))
])

pipeline.fit(X_train, y_train)

CPU times: total: 500 ms
Wall time: 1.71 s


In [855]:
pipeline.steps

[('preprocessor',
  ColumnTransformer(transformers=[('num',
                                   Pipeline(steps=[('scaler', StandardScaler())]),
                                   ['BMI', 'glucose', 'glucose_tolerance',
                                    'insulin']),
                                  ('cat',
                                   Pipeline(steps=[('binenc',
                                                    BinaryEncoder(key='diabetic'))]),
                                   ['gender', 'sport_weekly', 'diabetic'])])),
 ('classifier', RandomForestClassifier(random_state=33))]

In [856]:
y_pred = pipeline.predict(X_test)

In [857]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Точность: {precision:.2f}")
print(f"Полнота: {recall:.2f}")
print(f"F1-мера: {f1:.2f}")

Точность: 0.80
Полнота: 0.89
F1-мера: 0.84


In [858]:
preds = pipeline.predict_proba(X_test)

In [859]:
precision, recall, thresholds = precision_recall_curve(y_test, preds[:, 1])

In [860]:
thresholds

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99, 1.  ])

In [861]:
precision, recall

(array([0.49947753, 0.50105042, 0.50583245, 0.51016043, 0.51626898,
        0.52422907, 0.53243848, 0.53907135, 0.54275941, 0.5483871 ,
        0.55477855, 0.56198347, 0.56666667, 0.5769697 , 0.58983891,
        0.59649123, 0.6043257 , 0.614489  , 0.6225426 , 0.625     ,
        0.62997347, 0.63624161, 0.64489796, 0.6510989 , 0.6565097 ,
        0.65877437, 0.66619718, 0.671875  , 0.67862267, 0.6875    ,
        0.69061584, 0.70044709, 0.70526316, 0.71036585, 0.71604938,
        0.721875  , 0.72555205, 0.73248408, 0.73717949, 0.74267101,
        0.7442623 , 0.75      , 0.75250836, 0.75762712, 0.7619863 ,
        0.76909722, 0.77601411, 0.78456014, 0.78832117, 0.79373849,
        0.80260708, 0.80492424, 0.81467181, 0.81871345, 0.8313253 ,
        0.83469388, 0.84090909, 0.84486373, 0.84796574, 0.85185185,
        0.85682819, 0.8590604 , 0.8652968 , 0.86666667, 0.87209302,
        0.87264151, 0.8804878 , 0.88118812, 0.88916877, 0.89405685,
        0.89417989, 0.899729  , 0.90730337, 0.90

In [862]:
precision = precision
recall = recall

table = pd.DataFrame({'thresholds': thresholds, 'precision': precision[:-1], 'recall': recall[:-1]})

In [867]:
thresh=0.843
table[(table['precision'] > thresh) & (table['recall'] > thresh)]

Unnamed: 0,thresholds,precision,recall
57,0.57,0.844864,0.843096


In [872]:
threshold = 0.56

In [873]:
preds = pipeline.predict_proba(X_test)[:, 1]

In [874]:
y_pred = [1 if x > threshold else 0 for x in preds]

In [875]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Точность: {precision:.2f}")
print(f"Полнота: {recall:.2f}")
print(f"F1-мера: {f1:.2f}")

Точность: 0.84
Полнота: 0.84
F1-мера: 0.84


In [876]:
with open("pipeline.dill", "wb") as f:
    dill.dump(pipeline, f)