Обучение базовой модели

In [1]:
from sqlalchemy import create_engine
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv

def create_connection():

    load_dotenv()
    host = os.environ.get('DB_DESTINATION_HOST')
    port = os.environ.get('DB_DESTINATION_PORT')
    db = os.environ.get('DB_DESTINATION_NAME')
    username = os.environ.get('DB_DESTINATION_USER')
    password = os.environ.get('DB_DESTINATION_PASSWORD')
    
    print(f'postgresql://{username}:{password}@{host}:{port}/{db}')
    conn = create_engine(f'postgresql://{username}:{password}@{host}:{port}/{db}')
    return conn

# устанавливаем соединение с базой
conn = create_connection()
data = pd.read_sql('select * from clean_users_churn', conn)
data.head() 

postgresql://mle_20240729_393dbfd5ab:2b9f7f47f6f949a0b0ec2303a63cc76e@rc1b-uh7kdmcx67eomesf.mdb.yandexcloud.net:6432/playground_mle_20240729_393dbfd5ab


Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,13,8091-TTVAX,2015-04-01,NaT,One year,No,Credit card (automatic),100.35,5681.1,Fiber optic,...,Yes,No,Yes,Yes,Male,0,Yes,No,Yes,0
1,14,0280-XJGEX,2015-09-01,2019-10-01,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Fiber optic,...,Yes,No,Yes,Yes,Male,0,No,No,Yes,1
2,15,5129-JLPIS,2018-01-01,NaT,Month-to-month,Yes,Electronic check,105.5,2686.05,Fiber optic,...,Yes,Yes,Yes,Yes,Male,0,No,No,No,0
3,17,3655-SNQYZ,2014-05-01,NaT,Two year,No,Credit card (automatic),113.25,7895.15,Fiber optic,...,Yes,Yes,Yes,Yes,Female,0,Yes,Yes,Yes,0
4,19,9959-WOFKT,2014-03-01,NaT,Two year,No,Bank transfer (automatic),106.7,7382.25,Fiber optic,...,Yes,No,Yes,Yes,Male,0,No,Yes,Yes,0


Удалаяем колонки не нужные для обучения

In [2]:
data.drop(columns=['id', 'customer_id', 'begin_date', 'end_date'], inplace=True)

Обрабатываем кагориальные признаки

In [3]:
cat_features = data.select_dtypes(include='object')
potential_binary_features = cat_features.nunique() == 2

binary_cat_features = cat_features[potential_binary_features[potential_binary_features].index]
other_cat_features = cat_features[potential_binary_features[~potential_binary_features].index]
num_features = data.select_dtypes(['float']) 

In [4]:
potential_binary_features[~potential_binary_features].index

Index(['type', 'payment_method'], dtype='object')

Кодируем бинарные признаки

In [5]:
from sklearn.preprocessing import OneHotEncoder

one_hot_drop = OneHotEncoder(drop='if_binary', sparse_output=False)

In [6]:
drop_res = one_hot_drop.fit_transform(binary_cat_features)
drop_res = pd.DataFrame(drop_res, columns=one_hot_drop.get_feature_names_out())
print(drop_res.head())
print('shape: ', drop_res.shape)

   paperless_billing_Yes  internet_service_Fiber optic  online_security_Yes  \
0                    0.0                           1.0                  0.0   
1                    1.0                           1.0                  0.0   
2                    1.0                           1.0                  1.0   
3                    0.0                           1.0                  1.0   
4                    0.0                           1.0                  1.0   

   online_backup_Yes  device_protection_Yes  tech_support_Yes  \
0                0.0                    1.0               0.0   
1                1.0                    1.0               0.0   
2                0.0                    1.0               1.0   
3                1.0                    1.0               1.0   
4                0.0                    1.0               0.0   

   streaming_tv_Yes  streaming_movies_Yes  gender_Male  partner_Yes  \
0               1.0                   1.0          1.0         

Кодируем остальные категориальные признаки

когда категорий у признаков много, пользуются индустриальным стандартом — методом Target Encoding, то есть кодированием с помощью среднего значения целевой переменной в каждой категории. 

In [7]:
from category_encoders import CatBoostEncoder

catboost_enc = CatBoostEncoder()

catboost_enc.fit_transform(other_cat_features, data['target']).head()

Unnamed: 0,type,payment_method
0,0.26537,0.26537
1,0.26537,0.26537
2,0.632685,0.26537
3,0.26537,0.132685
4,0.132685,0.632685


Нормируем числовые признаки

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler_res = scaler.fit_transform(num_features)
print(scaler_res)
print(pd.DataFrame(scaler_res, columns=scaler.get_feature_names_out())) 

[[ 1.18281085  1.50023859]
 [ 1.29415125  1.2155385 ]
 [ 1.35397595  0.17782698]
 ...
 [ 0.05777429  0.19661418]
 [ 0.37683931 -0.0838472 ]
 [ 0.52307745 -0.56140864]]
      monthly_charges  total_charges
0            1.182811       1.500239
1            1.294151       1.215539
2            1.353976       0.177827
3            1.611554       2.477813
4            1.393859       2.251351
...               ...            ...
7038        -1.339797      -0.569643
7039         0.823863       1.204964
7040         0.057774       0.196614
7041         0.376839      -0.083847
7042         0.523077      -0.561409

[7043 rows x 2 columns]


Объединяем трансформации

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from category_encoders import CatBoostEncoder

cat_features = data.select_dtypes(include='object')
potential_binary_features = cat_features.nunique() == 2

binary_cat_features = cat_features[potential_binary_features[potential_binary_features].index]
other_cat_features = cat_features[potential_binary_features[~potential_binary_features].index]
num_features = data.select_dtypes(['float']) 

binary_cols = binary_cat_features.columns.tolist()
non_binary_cat_cols = other_cat_features.columns.tolist()
num_cols = num_features.columns.tolist()

# определите список трансформаций в рамках ColumnTransformer
preprocessor = ColumnTransformer(
	[
        ('one-hot-encoder_binary', OneHotEncoder(drop='if_binary', sparse_output=False), binary_cols),
        ('CatBoostEncoder_nobinary', CatBoostEncoder(), non_binary_cat_cols),
        ('Standart_skalef_num_cols', StandardScaler(), num_cols),
    ],
    remainder='drop',
    verbose_feature_names_out=False
)

# трансформируйте исходные данные data с помощью созданного preprocessor
data_transformed = preprocessor.fit_transform(data,data['target'])
print(data_transformed)
print(pd.DataFrame(data_transformed, columns=preprocessor.get_feature_names_out()))

[[ 0.          1.          0.         ...  0.26536987  1.18281085
   1.50023859]
 [ 1.          1.          0.         ...  0.26536987  1.29415125
   1.2155385 ]
 [ 1.          1.          1.         ...  0.26536987  1.35397595
   0.17782698]
 ...
 [ 1.          0.          0.         ...  0.16662265  0.05777429
   0.19661418]
 [ 1.          1.          0.         ...  0.45315794  0.37683931
  -0.0838472 ]
 [ 1.          1.          0.         ...  0.45296633  0.52307745
  -0.56140864]]
      paperless_billing_Yes  internet_service_Fiber optic  \
0                       0.0                           1.0   
1                       1.0                           1.0   
2                       1.0                           1.0   
3                       0.0                           1.0   
4                       0.0                           1.0   
...                     ...                           ...   
7038                    1.0                           1.0   
7039                

Обучим модель

In [10]:
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(
    data,
    data['target'],
    stratify=data['target']) 

In [11]:
# pip install catboost

In [12]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(auto_class_weights='Balanced') 

In [13]:
from sklearn.metrics import f1_score, roc_auc_score

X_tr_prepared = preprocessor.fit_transform(X_tr, y_tr)
model.fit(X_tr_prepared, y_tr)

X_val_prepared = preprocessor.transform(X_val)
y_pred = model.predict(X_val_prepared)

print('f1_score:', f1_score(y_val, y_pred))
print('roc_auc_score:', roc_auc_score(y_val, y_pred)) 

Learning rate set to 0.020969
0:	learn: 0.6845004	total: 48.8ms	remaining: 48.7s
1:	learn: 0.6760530	total: 51.9ms	remaining: 25.9s
2:	learn: 0.6675178	total: 54.5ms	remaining: 18.1s
3:	learn: 0.6588226	total: 56.9ms	remaining: 14.2s
4:	learn: 0.6513258	total: 59.4ms	remaining: 11.8s
5:	learn: 0.6444349	total: 62ms	remaining: 10.3s
6:	learn: 0.6379162	total: 64.4ms	remaining: 9.14s
7:	learn: 0.6322371	total: 66.9ms	remaining: 8.3s
8:	learn: 0.6252933	total: 69.3ms	remaining: 7.63s
9:	learn: 0.6199712	total: 71.7ms	remaining: 7.09s
10:	learn: 0.6135072	total: 74.1ms	remaining: 6.66s
11:	learn: 0.6075828	total: 76.5ms	remaining: 6.3s
12:	learn: 0.6018339	total: 79ms	remaining: 6s
13:	learn: 0.5969033	total: 81.4ms	remaining: 5.74s
14:	learn: 0.5925719	total: 83.8ms	remaining: 5.5s
15:	learn: 0.5873056	total: 86.2ms	remaining: 5.3s
16:	learn: 0.5832024	total: 88.8ms	remaining: 5.13s
17:	learn: 0.5791578	total: 91.2ms	remaining: 4.98s
18:	learn: 0.5759252	total: 94.3ms	remaining: 4.87s
19:

Объединим обучение и трансформацию в единый пайплайн

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from category_encoders import CatBoostEncoder

X_tr, X_val, y_tr, y_val = train_test_split(data, data['target'], stratify=data['target'])

preprocessor = ColumnTransformer(
    [
    ('binary', OneHotEncoder(drop='if_binary'), binary_cols),
    ('cat', CatBoostEncoder(), non_binary_cat_cols),
    ('num', StandardScaler(), num_cols)
    ],
    remainder='drop',
    verbose_feature_names_out=False
)
model = CatBoostClassifier(auto_class_weights='Balanced')

# создайте пайплайн
pipeline = Pipeline(
	[
        ('data_trepea',preprocessor),
        ('catboost_lean',model)
    ]
)

pipeline.fit(X_tr,y_tr)
# обучите пайплайн
# ваш код здесь #
y_pred = pipeline.predict(X_val)
# получите предсказания для тестовой выборки
# ваш код здесь #

print('f1:', f1_score(y_val, y_pred))
print('roc_auc:', roc_auc_score(y_val, y_pred))

Learning rate set to 0.020969
0:	learn: 0.6843737	total: 3.6ms	remaining: 3.59s
1:	learn: 0.6768360	total: 6.49ms	remaining: 3.24s
2:	learn: 0.6683054	total: 9.33ms	remaining: 3.1s
3:	learn: 0.6596139	total: 11.8ms	remaining: 2.94s
4:	learn: 0.6525443	total: 14.3ms	remaining: 2.84s
5:	learn: 0.6451293	total: 16.9ms	remaining: 2.8s
6:	learn: 0.6380523	total: 19.8ms	remaining: 2.8s
7:	learn: 0.6325123	total: 22.7ms	remaining: 2.81s
8:	learn: 0.6257017	total: 25.4ms	remaining: 2.8s
9:	learn: 0.6201727	total: 27.8ms	remaining: 2.76s
10:	learn: 0.6135420	total: 30.3ms	remaining: 2.73s
11:	learn: 0.6076564	total: 32.7ms	remaining: 2.69s
12:	learn: 0.6018140	total: 35.2ms	remaining: 2.67s
13:	learn: 0.5962240	total: 37.6ms	remaining: 2.65s
14:	learn: 0.5919012	total: 40.1ms	remaining: 2.63s
15:	learn: 0.5867849	total: 42.5ms	remaining: 2.61s
16:	learn: 0.5828334	total: 45.1ms	remaining: 2.61s
17:	learn: 0.5789856	total: 47.5ms	remaining: 2.59s
18:	learn: 0.5755070	total: 50ms	remaining: 2.58s