In [1]:
import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("NHANES_age_prediction.csv")
df.sample(5)

Unnamed: 0,SEQN,age_group,RIDAGEYR,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN
199,74459.0,Adult,61.0,2.0,2.0,43.3,110.0,2.0,121.0,18.35
385,75349.0,Adult,54.0,1.0,2.0,24.5,98.0,2.0,81.0,15.2
506,75890.0,Adult,43.0,2.0,2.0,31.3,99.0,2.0,110.0,20.14
132,74191.0,Adult,39.0,2.0,1.0,27.3,91.0,2.0,90.0,4.62
1682,81101.0,Adult,38.0,2.0,2.0,30.7,100.0,2.0,161.0,19.12


In [3]:
df.rename(columns={"SEQN": "resp_id", "RIDAGEYR": "age",
                   "RIAGENDR": "gender", "PAQ605": "sport_weekly",
                   "BMXBMI": "BMI", "LBXGLU": "glucose",
                   "DIQ010": "diabetic", "LBXGLT": "glucose_tolerance",
                   "LBXIN": "insulin"}, inplace=True)

In [4]:
df.sample(5)

Unnamed: 0,resp_id,age_group,age,gender,sport_weekly,BMI,glucose,diabetic,glucose_tolerance,insulin
1497,80356.0,Adult,12.0,1.0,2.0,19.9,98.0,2.0,97.0,11.32
1044,78305.0,Adult,43.0,2.0,2.0,32.5,92.0,2.0,83.0,12.31
1112,78634.0,Adult,55.0,2.0,2.0,48.4,108.0,2.0,183.0,29.8
1857,81883.0,Adult,19.0,1.0,2.0,29.1,92.0,2.0,92.0,11.97
1192,79000.0,Adult,36.0,1.0,1.0,26.8,122.0,2.0,120.0,15.27


* resp_id	- порядковый номер респондента
* age_group	- таргет - возрастная группа: взрослый(<65) / пожилой(>=65)
* age	- возраст
* gender - гендер
* sport_weekly - занимается ли респондент спортом, фитнесом или пр. мероприятиями обычно в течение недели
* BMI	- индекс массы тела
* glucose	- уровень глюкозы в крови натощак
* diabetic - есть ли у респондента диабет
* glucose_tolerance	- результаты теста на толерантность к глюкозе (OGTT)
* insulin - уровень инсулина в крови

Задача: определить возрастную категорию age_group респондента  (взрослый/пожилой) на основании всех признаков, кроме age.

In [5]:
df['age_group'] = df['age_group'].apply(lambda x: 1 if 'Senior' in x else 0)
df['sport_weekly'] = df['sport_weekly'].apply(lambda x: 2 if x>2 else x)
df['diabetic'] = df['diabetic'].apply(lambda x: np.random.choice([1, 2]) if x==3.0 else x)

In [6]:
df['gender'] = df['gender'].astype(int)
df['sport_weekly'] = df['sport_weekly'].astype(int)
df['glucose'] = df['glucose'].astype(int)
df['diabetic'] = df['diabetic'].astype(int)
df['glucose_tolerance'] = df['glucose_tolerance'].astype(int)

In [7]:
df.sample(5)

Unnamed: 0,resp_id,age_group,age,gender,sport_weekly,BMI,glucose,diabetic,glucose_tolerance,insulin
2253,83624.0,0,50.0,2,2,34.3,333,2,510,21.87
162,74304.0,1,80.0,1,2,31.4,125,1,175,13.59
397,75393.0,0,30.0,2,1,31.8,100,2,106,7.73
13,73633.0,0,43.0,2,2,30.5,102,2,124,12.06
941,77788.0,0,21.0,2,2,39.5,99,2,96,25.96


In [8]:
df.shape

(2278, 10)

In [9]:
X = df.drop(['age_group', 'resp_id', 'age'], axis=1)
y = df['age_group']

In [10]:
y.value_counts()

age_group
0    1914
1     364
Name: count, dtype: int64

In [11]:
smote = SMOTE(sampling_strategy='auto', random_state=33)

X, y = smote.fit_resample(X, y)

print(X.shape)
print(y.shape)

(3828, 7)
(3828,)


In [12]:
y.value_counts()

age_group
0    1914
1    1914
Name: count, dtype: int64

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33, stratify=y)
#save test
X_test.to_csv("X_test.csv", index=None)
y_test.to_csv("y_test.csv", index=None)
#save train
X_train.to_csv("X_train.csv", index=None)
y_train.to_csv("y_train.csv", index=None)

In [14]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((2871, 7), (2871,), (957, 7), (957,))

In [15]:
X_train.sample(5)

Unnamed: 0,gender,sport_weekly,BMI,glucose,diabetic,glucose_tolerance,insulin
3432,1,2,25.544553,92,2,115,15.387651
514,1,2,25.0,99,2,134,6.89
2186,2,2,21.4,85,2,101,3.41
1066,1,2,27.7,100,2,83,13.36
598,2,2,38.9,102,2,113,12.19


In [16]:
class BinaryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = X.copy()
        X_transformed[self.key] = X_transformed[self.key].replace({1: 1, 2: 0})

        return X_transformed

In [17]:
continuos_cols = ['BMI', 'glucose', 'glucose_tolerance', 'insulin']
cat_cols = ['gender', 'sport_weekly', 'diabetic']

In [18]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('binenc', BinaryEncoder(key=cat_cols))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuos_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)

In [19]:
%%time

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=33))
])

pipeline.fit(X_train, y_train)

CPU times: total: 438 ms
Wall time: 526 ms


In [20]:
pipeline.steps

[('preprocessor',
  ColumnTransformer(transformers=[('num',
                                   Pipeline(steps=[('scaler', StandardScaler())]),
                                   ['BMI', 'glucose', 'glucose_tolerance',
                                    'insulin']),
                                  ('cat',
                                   Pipeline(steps=[('binenc',
                                                    BinaryEncoder(key=['gender',
                                                                       'sport_weekly',
                                                                       'diabetic']))]),
                                   ['gender', 'sport_weekly', 'diabetic'])])),
 ('classifier', RandomForestClassifier(random_state=33))]

In [21]:
y_pred = pipeline.predict(X_test)

In [22]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Точность: {precision:.2f}")
print(f"Полнота: {recall:.2f}")
print(f"F1-мера: {f1:.2f}")

Точность: 0.80
Полнота: 0.90
F1-мера: 0.84


In [23]:
preds = pipeline.predict_proba(X_test)

In [24]:
precision, recall, thresholds = precision_recall_curve(y_test, preds[:, 1])

In [25]:
thresholds

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99, 1.  ])

In [26]:
precision, recall

(array([0.49947753, 0.5       , 0.5047619 , 0.50909091, 0.51459459,
        0.52078775, 0.52654867, 0.53724605, 0.54214123, 0.54587156,
        0.55284553, 0.55934195, 0.56599287, 0.57211538, 0.57978076,
        0.59057072, 0.59449312, 0.60355781, 0.61053985, 0.61688312,
        0.62582345, 0.63333333, 0.64016173, 0.64842681, 0.65469613,
        0.66108787, 0.66573034, 0.67283073, 0.67862267, 0.68459302,
        0.69378698, 0.7027027 , 0.70481928, 0.71016692, 0.71559633,
        0.72093023, 0.72161742, 0.72913386, 0.7392    , 0.747557  ,
        0.75164474, 0.75959933, 0.76134454, 0.76360544, 0.76632302,
        0.76856649, 0.7754386 , 0.77797513, 0.78519856, 0.79341865,
        0.7948244 , 0.79850746, 0.80265655, 0.8065764 , 0.81782178,
        0.8249497 , 0.83744856, 0.83991684, 0.84388186, 0.84913793,
        0.85434783, 0.85714286, 0.86621315, 0.86866359, 0.87645688,
        0.88207547, 0.89563107, 0.8962963 , 0.9028133 , 0.90649351,
        0.91466667, 0.91553134, 0.91412742, 0.92

In [27]:
precision = precision
recall = recall

table = pd.DataFrame({'thresholds': thresholds, 'precision': precision[:-1], 'recall': recall[:-1]})

In [28]:
thresh=0.843
table[(table['precision'] > thresh) & (table['recall'] > thresh)]

Unnamed: 0,thresholds,precision,recall


In [29]:
threshold = 0.56

In [30]:
preds = pipeline.predict_proba(X_test)[:, 1]

In [31]:
y_pred = [1 if x > threshold else 0 for x in preds]

In [32]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Точность: {precision:.2f}")
print(f"Полнота: {recall:.2f}")
print(f"F1-мера: {f1:.2f}")

Точность: 0.84
Полнота: 0.85
F1-мера: 0.84


In [33]:
with open("pipeline.dill", "wb") as f:
    dill.dump(pipeline, f)