In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
seed = 739

# Обработка данных

## Загрузка данных

In [3]:
train_df = pd.read_csv('data/train.csv', delimiter=',')
# cntrbtrs = pd.read_csv('data/cntrbtrs.csv', delimiter=';')
# trnsctns = pd.read_csv('data/trnsctns.csv', delimiter=';')

Берем данные только с 2003 года

In [4]:
train_df = train_df.loc[train_df['year'] >= 2003]

## Добавление новых столбцов

In [5]:
currency_df = pd.read_csv('external_data/currency.csv')
GDP_df = pd.read_csv('external_data/GDP.csv')
inflation_df = pd.read_csv('external_data/inflation.csv')
unemployment_df = pd.read_csv('external_data/Unemployment.csv')

In [6]:
for i, quarter in tqdm(enumerate(currency_df.loc[:, 'quarter'])):
    train_df.loc[train_df['quarter'] == quarter, 'currency'] = currency_df.iloc[i, 1]
    train_df.loc[train_df['quarter'] == quarter, 'GDP'] = GDP_df.iloc[i, 1]
    train_df.loc[train_df['quarter'] == quarter, 'inflation'] = inflation_df.iloc[i, 1]
    train_df.loc[train_df['quarter'] == quarter, 'unemployment'] = unemployment_df.iloc[i, 0]
train_df.head(5)

84it [00:51,  1.63it/s]


Unnamed: 0,slctn_nmbr,client_id,npo_account_id,npo_accnts_nmbr,pmnts_type,year,quarter,gender,age,clnt_cprtn_time_d,...,region,citizen,fact_addrss,appl_mrkr,evry_qrtr_pmnt,churn,currency,GDP,inflation,unemployment
0,1,0xA095932790098744A2325A8D152C05C7,0xD4DBBAC6561929409BA45725A220613E,1,2,2017,2017Q4,1,48,8091,...,САМАРСКАЯ ОБЛ,-1,-1,0,0,0,58.402211,25621.2,2.583333,5.07
1,0,0xF75BAA20A603A2479005DEB80464336E,0x3DCFF11974B3D8458CE3569DE9E855D5,2,2,2009,2009Q1,1,35,3303,...,ЛИПЕЦКАЯ ОБЛ,-1,1,0,1,0,34.392777,8334.6,13.726667,8.87
2,2,0x0D18081E0556AF4F95A931FBA4159D52,0x9DADF88CB3407C4E89403315F640393E,1,1,2010,2010Q1,1,47,1163,...,ОРЛОВСКАЯ ОБЛ,1,1,0,1,0,29.842293,9995.8,7.22,8.67
3,0,0x0AD72BC5475D9A4D83E12350A84F839D,0xF59C4E62F115684899EAE9B5D5ECF70F,1,1,2005,2005Q4,-1,42,47,...,СВЕРДЛОВСКАЯ ОБЛ,1,1,0,1,0,28.713011,6228.1,11.286667,7.0
4,0,0x73DF06339CC25840838408BAD52BE223,0x35BFFA05321C014B9FC29A189E89EDF0,2,2,2007,2007Q4,1,42,876,...,ЛИПЕЦКАЯ ОБЛ,1,1,0,1,0,24.648172,9797.0,11.396667,5.8


## Обработка пропущенных данных

### Train.csv

Пустые столбцы:
- frst_pmnt_date
- lst_pmnt_date_per_qrtr
- postal_code
- region

Вычислим регионы по почтовым индексам

In [7]:
buffer = train_df.loc[train_df['region'].notna() & train_df['postal_code'].notna() & train_df['postal_code'] > 0]
dict_ = {str(code)[-3:]: region for (region, code) in zip(buffer['region'], buffer['postal_code'])}

buffer = train_df.loc[train_df['region'].isna() & train_df['postal_code'].notna()]

In [8]:
dict_["0"] = np.NAN
list_ = []

for i, row in buffer.iterrows():
    code = str(row['postal_code'])[-3:]
    list_.append(dict_[code])

train_df.loc[train_df.loc[:, 'region'].isna() & train_df.loc[:, 'postal_code'].notna(), 'region'] = pd.Series(list_)

Проверяем пустые строки

In [9]:
train_df['region'].fillna('Неизвестен', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['region'].fillna('Неизвестен', inplace=True)


### Удаляем лишние столбцы
- slctn_nmbr
- client_id
- npo_account_id
- year
- frst_pmnt_date
- lst_pmnt_date_per_qrtr
- pmnts_sum_per_year
- pmnts_nmbr_per_year
- incm_per_year
- phone_number
- email
- postal_code

In [10]:
train_df.drop(columns=[
    'postal_code',
    'slctn_nmbr',
    'client_id',
    'npo_account_id',
    'year',
    'frst_pmnt_date',
    'lst_pmnt_date_per_qrtr'
], inplace=True)

## Графики

In [11]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

Берем столбцы по типу данных

In [12]:
int_cols = train_df.select_dtypes(int).columns.to_list()
float_cols = train_df.select_dtypes(float).columns.to_list()
cat_cols = train_df.select_dtypes(object).columns.to_list()
display(int_cols, float_cols, cat_cols)

['npo_accnts_nmbr',
 'pmnts_type',
 'gender',
 'age',
 'clnt_cprtn_time_d',
 'actv_prd_d',
 'lst_pmnt_rcnc_d',
 'pmnts_nmbr',
 'pmnts_nmbr_per_qrtr',
 'pmnts_nmbr_per_year',
 'phone_number',
 'email',
 'lk',
 'assignee_npo',
 'assignee_ops',
 'citizen',
 'fact_addrss',
 'appl_mrkr',
 'evry_qrtr_pmnt',
 'churn']

['balance',
 'oprtn_sum_per_qrtr',
 'oprtn_sum_per_year',
 'frst_pmnt',
 'lst_pmnt',
 'pmnts_sum',
 'pmnts_sum_per_qrtr',
 'pmnts_sum_per_year',
 'incm_sum',
 'incm_per_qrtr',
 'incm_per_year',
 'mgd_accum_period',
 'mgd_payment_period',
 'currency',
 'GDP',
 'inflation',
 'unemployment']

['quarter', 'region']

Убираем выбросы

In [13]:
def selection_data(df: pd.DataFrame, cols: list[str], n=1.5):
    for col in cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lim_min = q1 - n * iqr
        lim_max = q3 + n * iqr
        
        df = df.loc[(df[col] >= lim_min) & (df[col] <= lim_max)]
    return df


buffer = train_df.copy()
buffer = selection_data(buffer, ['balance', 'oprtn_sum_per_qrtr',
                                 'frst_pmnt', 'lst_pmnt',
                                 'pmnts_sum', 'pmnts_sum_per_qrtr',
                                 'incm_sum', 'incm_per_qrtr',
                                 'pmnts_sum_per_year',
                                 'oprtn_sum_per_year', 'incm_per_year',
                                 'age'])
len(buffer), len(train_df)

(1008745, 3377386)

## Преобразование данных

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin

class IntegerFeaturesTransformer(BaseEstimator, TransformerMixin):
    dataframe: pd.DataFrame | None = None
    
    def fit(self, df: pd.DataFrame, y: None = None) -> BaseEstimator:
        self.dataframe = df
        # self.dataframe['actv_prd_d'] = (self.dataframe['actv_prd_d'] - self.dataframe['actv_prd_d'].mean()) / self.dataframe['actv_prd_d'].std()
        # self.dataframe['lst_pmnt_rcnc_d'] = (self.dataframe['lst_pmnt_rcnc_d'] - self.dataframe['lst_pmnt_rcnc_d'].mean()) / self.dataframe['lst_pmnt_rcnc_d'].std()
        self.dataframe['gender'] = self.dataframe['gender'].replace([1, -1], [1, 0])
        self.dataframe['has_communication'] = (self.dataframe['email'] == 1) | (self.dataframe['phone_number'] == 1)
        self.dataframe.drop(columns=['email', 'phone_number', 'churn'], inplace=True)
        
        return self
    
    def transform(self, X: None = None) -> pd.DataFrame:
        return self.dataframe
    

trans = IntegerFeaturesTransformer()
trans.fit(buffer.copy())
trans.transform(buffer).head(5)

Unnamed: 0,npo_accnts_nmbr,pmnts_type,quarter,gender,age,clnt_cprtn_time_d,actv_prd_d,lst_pmnt_rcnc_d,balance,oprtn_sum_per_qrtr,...,region,citizen,fact_addrss,appl_mrkr,evry_qrtr_pmnt,currency,GDP,inflation,unemployment,has_communication
0,1,2,2017Q4,1,48,8091,0,6757,679.37,31.34,...,САМАРСКАЯ ОБЛ,-1,-1,0,0,58.402211,25621.2,2.583333,5.07,False
2,1,1,2010Q1,1,47,1163,1121,34,5569.85,463.41,...,ОРЛОВСКАЯ ОБЛ,1,1,0,1,29.842293,9995.8,7.22,8.67,False
3,1,1,2005Q4,0,42,47,0,0,81.37,81.37,...,СВЕРДЛОВСКАЯ ОБЛ,1,1,0,1,28.713011,6228.1,11.286667,7.0,False
4,2,2,2007Q4,1,42,876,853,30,6937.65,794.36,...,ЛИПЕЦКАЯ ОБЛ,1,1,0,1,24.648172,9797.0,11.396667,5.8,False
9,2,1,2007Q3,0,25,175,153,28,1017.85,343.96,...,ЛИПЕЦКАЯ ОБЛ,1,1,0,1,25.503326,8902.7,8.88,5.53,False


In [15]:
from sklearn.preprocessing import LabelEncoder


class CatFeaturesTransformer(BaseEstimator, TransformerMixin):
    dataframe: pd.DataFrame | None = None
    
    def fit(self, X: pd.DataFrame) -> BaseEstimator:
        self.dataframe = X
        self.dataframe['quarter'] = self.dataframe['quarter'].replace(r'\d{4}Q', '', regex=True)
        scaler1 = LabelEncoder().fit(self.dataframe['region'])
        self.dataframe['region'] = scaler1.transform(self.dataframe['region'])
        return self
    
    def transform(self, X: None = None) -> np.ndarray:
        return self.dataframe


trans = CatFeaturesTransformer()
trans.fit(buffer.copy())
trans.transform(buffer)

Unnamed: 0,npo_accnts_nmbr,pmnts_type,quarter,gender,age,clnt_cprtn_time_d,actv_prd_d,lst_pmnt_rcnc_d,balance,oprtn_sum_per_qrtr,...,region,citizen,fact_addrss,appl_mrkr,evry_qrtr_pmnt,churn,currency,GDP,inflation,unemployment
0,1,2,4,1,48,8091,0,6757,679.37,31.34,...,176,-1,-1,0,0,0,58.402211,25621.2,2.583333,5.07
2,1,1,1,1,47,1163,1121,34,5569.85,463.41,...,154,1,1,0,1,0,29.842293,9995.8,7.220000,8.67
3,1,1,4,-1,42,47,0,0,81.37,81.37,...,186,1,1,0,1,0,28.713011,6228.1,11.286667,7.00
4,2,2,4,1,42,876,853,30,6937.65,794.36,...,122,1,1,0,1,0,24.648172,9797.0,11.396667,5.80
9,2,1,3,-1,25,175,153,28,1017.85,343.96,...,122,1,1,0,1,0,25.503326,8902.7,8.880000,5.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3589885,1,3,4,-1,52,857,0,855,910.85,47.33,...,182,1,1,0,0,0,72.654305,40111.5,8.310000,4.30
3589889,1,3,3,1,38,2638,2614,28,30285.05,975.61,...,38,-1,1,0,1,0,65.551482,27196.8,2.986667,4.60
3589906,2,2,2,1,27,198,183,30,552.28,236.59,...,237,-1,-1,0,0,0,28.005323,14434.8,9.540000,6.43
3589910,1,2,4,1,31,1616,0,731,3080.98,118.50,...,38,1,1,0,0,0,24.648172,9797.0,11.396667,5.80


In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


offices = ["МОСКВА Г", "САНКТ-ПЕТЕРБУРГ Г", "ВОЛОГОДСКАЯ ОБЛ", "ОРЛОВСКАЯ ОБЛ", "ЛИПЕЦКАЯ ОБЛ", "РОСТОВСКАЯ ОБЛ", "НИЖЕГОРОДСКАЯ ОБЛ", "ПЕРМСКИЙ КРАЙ", "ЕКАТЕРИНБУРГ Г", "ЧЕЛЯБИНСК Г", "НОВОСИБИРСК Г", "ИРКУТСКАЯ ОБЛ"]
buffer['has_close_office'] = buffer['region'].copy()
buffer['has_close_office'] = buffer['has_close_office'].apply(lambda x: x in offices)

transformer = ColumnTransformer(
    transformers=(
        ('int_trans', IntegerFeaturesTransformer(), int_cols),
        ('cat_trans', CatFeaturesTransformer(), cat_cols),
        ('float', 'passthrough', float_cols)
    )
)

transformer.fit(buffer.copy())

X, y = transformer.transform(buffer).astype(np.float32), buffer['churn'].to_numpy(dtype=np.int16)

In [17]:
X.shape, y.shape

((1008745, 37), (1008745,))

# Оптимизация гиперпараметров

In [18]:
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [83]:
import optuna
from catboost import Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


def cat_objective(trial):
    param = {
        "objective": "Logloss",
        # "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 3, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
        "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        )
    }
    clf = CatBoostClassifier(**param, random_state=seed, task_type='GPU')
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=0.3,
                                                        shuffle=True,
                                                        random_state=seed)
    
    train_data = Pool(data=X_train, label=y_train)
    test_data = Pool(data=X_test, label=y_test)
    clf.fit(train_data, verbose=False)
    y_pred = clf.predict(test_data)
    
    score = f1_score(y_test, y_pred)
    return score


def xgb_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 150, 400, step=10),
        'max_depth':trial.suggest_int('max_depth', 3, 9),
        'min_child_weight': trial.suggest_int('min_child_weight', 7, 8),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True), 
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 10.),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 10.),
        'gamma': trial.suggest_float('gamma', 0.7, 1.0, step=0.1),
        'device': 'cuda'
    }
    clf = XGBClassifier(**params, random_state=seed)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=0.3,
                                                        shuffle=True,
                                                        random_state=seed)
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    score = f1_score(y_test, y_pred)
    return score


def lgbm_objective(trial):
    params = {
        'metric': 'f1_score',
        'n_estimators': trial.suggest_int('n_estimators', 150, 400, step=10),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_int('max_depth', 12, 20),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    clf = LGBMClassifier(**params, random_state=seed, verbosity=-1)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=0.3,
                                                        shuffle=True,
                                                        random_state=seed)
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    score = f1_score(y_test, y_pred)
    return score


objectives = [
    # ('xgb', xgb_objective),
    # ('cat', cat_objective),
    ('lgbm', lgbm_objective)
]

In [84]:
ga_sampler = optuna.samplers.NSGAIISampler(
    population_size=50,
    mutation_prob=0.14,
    crossover=optuna.samplers.nsgaii.VSBXCrossover()
)


studies: list[optuna.Study] = [None] * len(objectives)
for i, (name, objective) in enumerate(objectives):
    studies[i] = optuna.create_study(study_name=name, direction='maximize', sampler=ga_sampler)
    studies[i].optimize(objective, n_trials=150)

  crossover=optuna.samplers.nsgaii.VSBXCrossover()
[I 2024-04-13 21:08:02,129] A new study created in memory with name: lgbm
[W 2024-04-13 21:08:08,948] Trial 0 failed with parameters: {'n_estimators': 190, 'reg_alpha': 0.540103369645649, 'reg_lambda': 0.22841627018027605, 'colsample_bytree': 0.5, 'subsample': 0.8, 'learning_rate': 0.014, 'max_depth': 14, 'num_leaves': 488, 'min_child_samples': 205, 'min_data_per_groups': 31} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/danila/Documents/Python Projects/AI Hackathon 12.04.2024/AI-Hackathon-SFD/.venv/lib/python3.11/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/1n/ncv2knx92312c9fjnfycb5k80000gn/T/ipykernel_68547/1848864253.py", line 79, in lgbm_objective
    clf.fit(X_train, y_train)
  File "/Users/danila/Documents/Python Projects/AI Hackathon 12.04.2024/AI-Hackathon-SFD/.ve

KeyboardInterrupt: 

In [64]:
for study in studies:
    display(study.best_params)

ValueError: No trials are completed yet.