In [1]:
import pandas as pd

In [2]:
import numpy as np

### Основные идеи:

1) Сильными признаками оказались временные признаки: возраст вагона, время, оставшееся до планового ремонта, время, оставшееся до истечения срока службы

2) Признаки, связанные с номером станции, на которой в последний раз был вагон, также внесло сильный вклад в решение. Мы связавыем это с тем, что с определенных станций вагоны отправляются в депо раньше остальных (например, потому что ремонт там стоит дешевле)

3) Статистки за последний месяц помогают определить интенсивность нагрузки на вагон и являются сильной фичей

4) В качестве модели использовать градиентный бустинг из библиотеки LightGBM



#### Считываем данные

In [349]:
test = pd.read_csv('target/y_predict.csv')
test.month = pd.to_datetime(test.month)

In [313]:
def load_dislok():
    dislok = pd.read_parquet('dislok_wagons.parquet').convert_dtypes()
    feb = pd.read_parquet('test/dislok_wagons.parquet').convert_dtypes()
    dislok = pd.concat([dislok, feb]).reset_index()
    dislok.plan_date = pd.to_datetime(dislok.plan_date)
    return dislok

def load_targs():
    train = pd.read_csv('target/y_train.csv')
    feb = pd.read_csv('test/target/y_test.csv')
    train = pd.concat([train, feb]).reset_index()
    return train

def load_pr_rems():
    pr_rem1 = pd.read_parquet('./pr_rems.parquet').convert_dtypes()
    pr_rem2 = pd.read_parquet('./test/pr_rems.parquet').convert_dtypes()
    return pd.concat([pr_rem1, pr_rem2])

def load_tr_rems():
    tr_rem1 = pd.read_parquet('./tr_rems.parquet').convert_dtypes()
    tr_rem2 = pd.read_parquet('./test/tr_rems.parquet').convert_dtypes()
    return pd.concat([tr_rem1, tr_rem2]).reset_index()

#### Функция Считает статистики за предыдущий месяц. Достаем инфу из dislok в словарь, разворачиваем на таргет

In [314]:
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm.notebook import tqdm as tqdm

def process_ost_prob(accum, load_accum):
    
    
    accum = np.array(accum)
    load_accum = np.array(load_accum)
    
    
    if(np.isnan(accum).any()):
        return np.array([-1, -1, -1, -1, -1, -1])
    
    mask = accum != 160000
    accum = accum[mask]
    lam = load_accum[mask]
    
    if(len(accum) == 0):
        return np.array([0, 0, 0, 0, 0, 0])
    
    path = 0
    
    val_acc = []
    
    under_load = 0
    
    for i in range(len(accum) - 1):
        if(accum[i] < accum[i + 1]):
            pass
        else:
            val_acc.append(accum[i] - accum[i + 1])
            if(lam[i] == 1):
                under_load += accum[i] - accum[i + 1]
            
    return np.array([sum(val_acc), np.mean(val_acc), np.std(val_acc), sum(load_accum), under_load, under_load/(sum(val_acc)+1e-7)])

def unfold_stats_applyer(d, x):
    m = x["month"]
    cm = int(m[5:7])
    wn = x["wagnum"]
    if((wn, cm - 1) in d):
        ans = d[(wn, (cm - 1)%12 )]
    else:
        ans = np.array([np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])
    
    return pd.Series(data=ans, index=["last_month_run", "last_month_mean", "last_month_std", "under_load_days", "under_load_dist", "under_load_percent"])


class DislokStatExctractor(BaseEstimator, TransformerMixin):
    def __init__(self, dislok):
        dislok_sorted = dislok.sort_values(["wagnum", "plan_date"])
        
        plandat = dislok_sorted.plan_date.dt.month.values.copy()
        wagons = dislok_sorted.wagnum.values.copy()
        ost_prob = dislok_sorted.ost_prob.values.copy()

        is_load = dislok_sorted.isload.values.copy()
        
        del dislok_sorted
        
        accum = []
        load_accum = []

        d = {}
        for i in tqdm(range(len(plandat) - 1)):
            cm = plandat[i]
            nm = plandat[i + 1]
            wn = wagons[i]
            nwn = wagons[i + 1]

            if(ost_prob[i] is pd.NA):
                accum.append(np.nan)
            else:
                accum.append(ost_prob[i])
            load_accum.append(is_load[i])

            if(cm == nm)and(wn == nwn):
                pass
            else:
                d[(wn, cm%12)] = process_ost_prob(accum, load_accum)
                accum = []
                load_accum = []
                
        self.d_stats = d
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        return X.apply(lambda x: unfold_stats_applyer(self.d_stats, x), axis=1)

#### Время до ближайшего планового ремонта. Сохраняем все времена из таблицы по ключу вагона, затем для каждого вагона из taget ищим минимальную полжительную разницу в словаре.

In [315]:
class NearestRepair(BaseEstimator, TransformerMixin):
  def __init__(self, base_date_name="month", repair_date_name="date_pl_rem"):
    self.base_date_name = base_date_name
    self.repair_date_name = repair_date_name
    self.wagnum = "wagnum"
    self.bound_date = pd.to_datetime('2022-01-01')

  def fit(self, dislok):
    rep_dates = {}
    for idx, data in tqdm(dislok[[self.wagnum, self.repair_date_name]].groupby(self.wagnum)):
      rep_dates[idx] = np.array((pd.to_datetime(data[self.repair_date_name].unique()) - self.bound_date).total_seconds(), dtype=np.int64)
    self.rep_dates = rep_dates
    return self

  def transform(self, wagons):
    wagons["nearrepair"] = pd.Series((wagons[self.base_date_name] - self.bound_date).dt.total_seconds(), dtype=np.int64)
    wagons = wagons.reset_index()
    for i, row in tqdm(wagons.iterrows()):
        if row[self.wagnum] in self.rep_dates:
            seconds = self.rep_dates[row[self.wagnum]] - row["nearrepair"]
            seconds = seconds[seconds >= 0]
            if len(seconds):
                seconds = min(seconds)
            else:
                seconds = pd.NA
        else:
            seconds = pd.NA
        wagons.loc[i, "nearrepair"] = seconds
    return wagons

#### Далее мы обрабатываем таблицу Dislok, и добавляем фичи из нее, на какой станции остановился поезд в первое число каждого месяца, остановился ли он у станции, у которых часто происходят ремонтные работы, сколько пробега у него осталось на первое число каждого месяца и сколько у него времени осталось до планового ремонта, а также код ремонта

In [316]:
from sklearn.base import BaseEstimator, TransformerMixin
from collections import Counter 

class DislokTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, times, ids, stat_remont):
        dislok = pd.read_parquet('dislok_wagons.parquet').convert_dtypes()
        feb = pd.read_parquet('test/dislok_wagons.parquet').convert_dtypes()
        self.dislok = pd.concat([dislok, feb])
        self.dislok = self.dislok[['plan_date', 'wagnum', 'ost_prob', 'kod_vrab', 'date_pl_rem', 'isload', 'distance', 'st_id_send', 'st_id_dest']]
        self.dislok.plan_date = pd.to_datetime(self.dislok.plan_date)
        self.dislok.st_id_send = self.dislok.st_id_send.apply(hash)
        self.dislok.st_id_dest = self.dislok.st_id_dest.apply(hash)
        self.dislok = self.dislok.sort_values(by=['wagnum', 'plan_date']).reset_index()
        self.stat_remont = stat_remont
        self.times = times
        self.ids = ids
        self.wagon_probeg = {}
        self.wagon_kod_remont = {}
        self.time_remont = {}
        self.stat_remont = {}
        self.cur_st_send = {}
        self.cur_st_dest = {}
        
    def fit(self, train):
        index_train = 0
        for train_id in tqdm(self.ids):
            while self.dislok.loc[index_train, 'wagnum'] != train_id:
                index_train += 1
            self.wagon_kod_remont[train_id] = self.dislok.loc[index_train, 'kod_vrab']
            self.time_remont[train_id] = self.dislok.loc[index_train, 'date_pl_rem']
            for index, train_time in enumerate(self.times):
                while self.dislok.loc[index_train, 'plan_date'] < train_time and self.dislok.loc[index_train, 'wagnum'] == train_id:
                    index_train += 1
                if not index:
                    if self.dislok.loc[index_train, 'plan_date'] == train_time and self.dislok.loc[index_train, 'wagnum'] == train_id:
                        self.wagon_probeg[train_id] = [self.dislok.loc[index_train, 'ost_prob']]
                        self.stat_remont[train_id] = [self.dislok.loc[index_train, 'st_id_send'] in self.stat_remont or self.dislok.loc[index_train, 'st_id_dest'] in self.stat_remont]
                        self.cur_st_send[train_id] = [self.dislok.loc[index_train, 'st_id_send']]
                        self.cur_st_dest[train_id] = [self.dislok.loc[index_train, 'st_id_dest']]
                    else:
                        self.wagon_probeg[train_id] = [pd.NA]
                        self.stat_remont[train_id] = [pd.NA]
                        self.cur_st_send[train_id] = [pd.NA]
                        self.cur_st_dest[train_id] = [pd.NA]
                else:
                    if self.dislok.loc[index_train, 'plan_date'] == train_time and self.dislok.loc[index_train, 'wagnum'] == train_id:
                        self.wagon_probeg[train_id].append(self.dislok.loc[index_train, 'ost_prob'])
                        self.stat_remont[train_id].append(self.dislok.loc[index_train, 'st_id_send'] in self.stat_remont or self.dislok.loc[index_train, 'st_id_dest'] in self.stat_remont)
                        self.cur_st_send[train_id].append(self.dislok.loc[index_train, 'st_id_send'])
                        self.cur_st_dest[train_id].append(self.dislok.loc[index_train, 'st_id_dest'])
                    else:
                        self.wagon_probeg[train_id].append(pd.NA)
                        self.stat_remont[train_id].append(pd.NA)
                        self.cur_st_send[train_id].append(pd.NA)
                        self.cur_st_dest[train_id].append(pd.NA)
        return self
        
    def transform(self, test):
        test = test.copy()
        test['probeg'] = 0
        test['kod_remont'] = 0
        test['time_before_remont'] = 0
        test['stat_remont'] = 0
        test['cur_st_send'] = 0
        test['cur_st_dest'] = 0
        for i, row in tqdm(test.iterrows()):
            if row['month'].month >= 8:
                month = row['month'].month - 8
            else:
                month = row['month'].month + 4
            test.loc[i, 'probeg'] = self.wagon_probeg[row['wagnum']][month]
            test.loc[i, 'stat_remont'] = self.stat_remont[row['wagnum']][month]
            test.loc[i, 'cur_st_send'] = self.cur_st_send[row['wagnum']][month]
            test.loc[i, 'cur_st_dest'] = self.cur_st_dest[row['wagnum']][month]
            test.loc[i, 'kod_remont'] = self.wagon_kod_remont[row['wagnum']]
            test.loc[i, 'time_before_remont'] = (self.time_remont[row['wagnum']] - row['month']).days
        test.stat_remont = test.stat_remont.fillna(-1).astype(int)
        return test
    

#### Добавляем фичи возраста вагона (из фичи года постройки вагона), времени до истечения срока службы и количество ремонтов

In [None]:
class WagParamsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.wag_params = pd.read_parquet('wag_params.parquet').convert_dtypes().sort_values(by=['wagnum']).reset_index()
        self.wag_params.date_build = pd.to_datetime(self.wag_params.date_build)
        self.wag_params.srok_sl = pd.to_datetime(self.wag_params.srok_sl)
        self.srok_sl = {}
        self.age = {}
        for i, row in self.wag_params.iterrows():
            self.srok_sl[i] = row['srok_sl']
            self.age[i] = row['date_build']
        
        tr_rems = pd.read_parquet('tr_rems.parquet').convert_dtypes()
        feb = pd.read_parquet('test/tr_rems.parquet').convert_dtypes()
        self.tr_rems = pd.concat([tr_rems, feb]).sort_values(by=['wagnum']).reset_index()
        self.tr_rems = self.tr_rems.groupby('wagnum', as_index= False).kod_vrab.count()
        self.tr_rems = dict(zip(self.tr_rems.wagnum.to_list(), self.tr_rems.kod_vrab.to_list()))
        
    def fit(self, train):
        return self

    def transform(self, test):
        test = test.copy()
        test['time_before_srok_sl'] = -10000
        test['age'] = -10000
        test['kod_vrab_tr_rems'] = -10000
        for i, row in test.iterrows():
            test.loc[i, 'time_before_srok_sl'] = (self.srok_sl[row['wagnum']] - row['month']).days
            test.loc[i, 'age'] = (row['month'] - self.age[row['wagnum']]).days
            if row['wagnum'] in self.tr_rems:
                test.loc[i, 'kod_vrab_tr_rems'] = self.tr_rems[row['wagnum']]
        return test

#### Проходим по owner_ship и извлекаем manage_type и rod_id(значение одно и тоже для всех записей относящихся к конкретному вагону). rod_id дяльше не участвует т.к берется из другой таблицы.

In [317]:
class OwnerShipExtracor(BaseEstimator, TransformerMixin):
    def __init__(self, wagprob):
        tables = {}

        for col in ["manage_type", "rod_id"]:
            d = {}

            for w, day in tqdm(zip(wag_prob.wagnum, wag_prob[col])):
                if(w in d):
                    if not(day in d[w]):
                        d[w].add(day)
                else:
                    d[w] = set([day])

            dd = {"wagnum":[], col:[]}

            for k, v in d.items():
                assert len(v) == 1
                for x in v:
                    break
                dd["wagnum"].append(k)
                dd[col].append(x)
            print(col) 
            tables[col] = pd.DataFrame.from_dict(dd) #.rod_id.value_counts()
        
        tb = tables["manage_type"].copy()
        self.tables = (tb.merge(tables["rod_id"].copy(), how="left", on="wagnum")).copy()
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        return (X.merge(self.tables.copy(), how="left", on="wagnum")).copy()
        

#### Время до ближайшего ремонта, который был. Принцип генерации точно такой же как и ближайшего запланированного, тролько теперь рассматриваем противоположенные по знаку разницы

In [318]:
class LastRepair(BaseEstimator, TransformerMixin):
  def __init__(self):
    self.bound_date = pd.to_datetime('2022-01-01')
    self.prev_repairs = {}

  #pr_rems
  def fit(self, train):
    rep_dates = {}
    for idx, data in tqdm(train[["wagnum", "rem_month"]].groupby("wagnum")):
      rep_dates[idx] = pd.Series((pd.to_datetime(data["rem_month"].unique()) - self.bound_date).total_seconds(), dtype=np.int64)
    self.rep_dates = rep_dates
    return self

  #target
  def transform(self, test):
    test["lastrepair"] = pd.Series((pd.to_datetime(test["month"]) - self.bound_date).dt.total_seconds(), dtype=np.int64)
    for i, row in tqdm(test.iterrows()):
        if row["wagnum"] in self.rep_dates:
            seconds = self.rep_dates[row["wagnum"]] - row["lastrepair"]
            seconds = -seconds[seconds <= 0]
            cnt_repairs = len(seconds)
            if len(seconds):
                seconds = min(seconds)
            else:
                seconds = -1
        else:
            seconds = -1
            cnt_repairs = 0
        test.loc[i, "cnt_repairs"] = cnt_repairs
        test.loc[i, "lastrepair"] = seconds
    return test

In [353]:
dislok = load_dislok()
targs = load_targs()

In [320]:
targs_copy = targs.copy() 
#pd.to_datetime(dislok.plan_date)
targs_copy.month = pd.to_datetime(targs_copy.month)

In [322]:
test = pd.read_csv('target/y_predict.csv')
test.month = pd.to_datetime(test.month)
train_times = sorted(list(set(pd.concat([targs_copy, test])['month'].to_list())))
train_id = sorted(list(set(targs_copy['wagnum'].to_list())))
tr_rem = pd.read_parquet('tr_rems.parquet').convert_dtypes()
feb = pd.read_parquet('test/tr_rems.parquet').convert_dtypes()
tr_rem = pd.concat([tr_rem, feb]).sort_values(by=['wagnum']).reset_index()
stat_remont = Counter(tr_rem.st_id_send.to_list()).most_common(11)
stat_remont = [i[0] for i in stat_remont]
transformer = DislokTransformer(train_times, train_id, stat_remont)
transformer.fit(targs_copy)
new_train = transformer.transform(targs_copy)

  0%|          | 0/33977 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [324]:
ef_test = transformer.transform(test.copy())

0it [00:00, ?it/s]

In [326]:
ef_test = ef_test[["probeg", "kod_remont", "time_before_remont", "stat_remont", "cur_st_send", "cur_st_dest"]].copy()
ef_test.probeg = ef_test.probeg.fillna(-10000)

In [55]:
ef = new_train[["probeg", "kod_remont", "time_before_remont", "stat_remont", "cur_st_send", "cur_st_dest"]].copy()

In [56]:
ef.probeg = ef.probeg.fillna(-10000)

In [328]:
transformer = WagParamsTransformer()
eff = transformer.transform(new_train)
eff = eff[['time_before_srok_sl', 'age', 'kod_vrab_tr_rems']].copy()

In [329]:
cat_feat = np.array(['cur_st_dest', 'cur_st_send', 'stat_remont', 'kod_remont'])
for p in cat_feat:
    ef[p] = ef[p].astype('category')
eff['kod_vrab_tr_rems'] = eff['kod_vrab_tr_rems'].astype('category')

In [330]:
eff_test = transformer.transform(test.copy())

In [332]:
eff_test = eff_test[['time_before_srok_sl', 'age', 'kod_vrab_tr_rems']].copy()
for p in cat_feat:
    ef_test[p] = ef_test[p].astype('category')
eff_test['kod_vrab_tr_rems'] = eff_test['kod_vrab_tr_rems'].astype('category')

In [333]:
NR = NearestRepair()

In [334]:
NR.fit(dislok)

  0%|          | 0/33977 [00:00<?, ?it/s]

In [335]:
nerep = NR.transform(targs_copy)[["nearrepair"]]

0it [00:00, ?it/s]

In [336]:
nerep.nearrepair = nerep.nearrepair.fillna(-1)

In [337]:
nerep_test = NR.transform(test.copy())[["nearrepair"]]
nerep_test.nearrepair = nerep_test.nearrepair.fillna(-1)

0it [00:00, ?it/s]

In [350]:
dislok = pd.read_parquet('dislok_wagons.parquet').convert_dtypes()
feb = pd.read_parquet('test/dislok_wagons.parquet').convert_dtypes()
dislok = pd.concat([dislok, feb]).reset_index()
de = DislokStatExctractor(dislok)

  0%|          | 0/7228236 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


In [351]:
test = pd.read_csv('target/y_predict.csv')
#test.month = pd.to_datetime(test.month)

mff = de.fit_transform(targs)
mff_test = de.transform(test.copy())

In [354]:
test = pd.read_csv('target/y_predict.csv')
test.month = pd.to_datetime(test.month)

mff_test

Unnamed: 0,last_month_run,last_month_mean,last_month_std,under_load_days,under_load_dist,under_load_percent
0,10000.0,370.370370,380.690031,21.0,9815.0,0.981500
1,4241.0,157.074074,300.058010,11.0,4112.0,0.969583
2,4338.0,160.666667,262.711361,9.0,4019.0,0.926464
3,9939.0,368.111111,270.018426,14.0,5058.0,0.508904
4,4590.0,170.000000,227.684642,11.0,2524.0,0.549891
...,...,...,...,...,...,...
33702,4561.0,168.925926,232.270903,12.0,3049.0,0.668494
33703,3661.0,135.592593,228.671244,9.0,1345.0,0.367386
33704,3434.0,127.185185,220.613125,6.0,1960.0,0.570763
33705,5872.0,217.481481,268.502896,19.0,5626.0,0.958106


In [355]:

wag_prob = pd.read_parquet('./wagons_probeg_ownersip.parquet').convert_dtypes()

oe = OwnerShipExtracor(wag_prob)

oe.fit(targs)

own_feat = (oe.transform(targs)[["manage_type"]]).copy()

for p in ["manage_type"]:
    own_feat[p] = own_feat[p].astype('category')

0it [00:00, ?it/s]

manage_type


0it [00:00, ?it/s]

rod_id


In [356]:
own_feat_test = (oe.transform(test.copy())[["manage_type"]]).copy()
for p in ["manage_type"]:
    own_feat_test[p] = own_feat_test[p].astype('category')

In [357]:
own_feat_test

Unnamed: 0,manage_type
0,0
1,0
2,0
3,0
4,0
...,...
33702,0
33703,0
33704,0
33705,0


In [358]:

lastrep = LastRepair()

pr_rem = load_pr_rems()

lastrep.fit(pr_rem)

lastreptab = lastrep.transform(targs)[["lastrepair", "cnt_repairs"]]

  0%|          | 0/11982 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [359]:
lastreptab_test = lastrep.transform(test.copy())[["lastrepair", "cnt_repairs"]]

0it [00:00, ?it/s]

In [360]:
lastreptab_test

Unnamed: 0,lastrepair,cnt_repairs
0,8640000,1.0
1,7171200,1.0
2,7516800,1.0
3,-1,0.0
4,12614400,1.0
...,...,...
33702,-1,0.0
33703,14169600,1.0
33704,11577600,1.0
33705,15379200,1.0


#### Достаем из таблицы tr_rem количество ремонтов до нужного месяца. Подражали бэйзлайну не очень сильная фича. 

In [361]:

def unfold_trem_applyer(rem_dict, x):
    m = pd.to_datetime(x["month"])
    wn = x["wagnum"]
    
    if(wn in rem_dict):
        ans = np.array([rem_dict[wn].searchsorted(m, side="left") ])
    else:
        ans = np.array([0])
    
    return pd.Series(data=ans, index=["num_small_repairs"])


class SmallRepairsExctractor(BaseEstimator, TransformerMixin):
    def __init__(self, df):
        df = df.sort_values(["rem_month"])
        rem_dict = {}
        
        for x in tqdm(df.groupby("wagnum")):
            wg, ddf = x
            rem_dict[wg] = ddf.rem_month
        self.rem_dict = rem_dict
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        return X.apply(lambda x: unfold_trem_applyer(self.rem_dict, x), axis=1)

In [362]:
tr_rems = load_tr_rems()
sre = SmallRepairsExctractor(tr_rems)
trems_counts = sre.transform(targs)
trems_counts.num_small_repairs = trems_counts.num_small_repairs.astype('category')

  0%|          | 0/19621 [00:00<?, ?it/s]

In [363]:
trems_counts_test = sre.transform(test)
trems_counts_test.num_small_repairs = trems_counts_test.num_small_repairs.astype('category')

In [364]:
trems_counts_test

Unnamed: 0,num_small_repairs
0,3
1,2
2,2
3,2
4,0
...,...
33702,0
33703,0
33704,4
33705,0


#### Считаем статистики по грузам которые перевозил вагон до нужного месяца.

In [365]:
tqdm.pandas()

def get_counts(wag_groups, wagnum, month):
    count_names = {
        0: "zerostype",
        1: "onestype",
        2: "twostype"
    }
    wag_slice = wag_groups[wagnum]
    wag_slice = wag_slice[wag_slice.plan_date <= month]
    wag_slice_loads = wag_slice[wag_slice.isload == 1]
    counts = wag_slice_loads.fr_class.value_counts([0, 1, 2], dropna=True)
    if len(wag_slice):
        counts *= len(wag_slice_loads) / len(wag_slice)

    if len(wag_slice_loads):
        skoroport = float(wag_slice_loads.skoroport.sum(skipna=True)) / len(wag_slice_loads)
        naval = float(wag_slice_loads.naval.sum(skipna=True)) / len(wag_slice_loads)
        naliv = float(wag_slice_loads.naliv.sum(skipna=True)) / len(wag_slice_loads)
    else:
        skoroport = 0
        naval = 0
        naliv = 0
    data_list = list(counts.values) + [skoroport, naval, naliv]

    col_names = [count_names[idx] for idx in counts.index] + ["skoroport_frac", "naval_frac", "naliv_frac"]
    return pd.Series(data=data_list, index=col_names)

class CountsCargo(BaseEstimator, TransformerMixin):
  def __init__(self):
    self.wag_groups = {}

  #dislok, freq_info
  def fit(self, dislok, freq_info):
    dislok_cargo = dislok[["plan_date", "wagnum", "isload", "fr_id"]]
    freq_cargo = freq_info[["fr_id", "fr_class", "skoroport", "naval", "naliv"]]
    dislok_cargo = dislok_cargo.merge(freq_cargo, how="left", on="fr_id")
    for i, wag_slice in tqdm(dislok_cargo.groupby("wagnum")):
        self.wag_groups[wag_slice["wagnum"].iloc[0]] = wag_slice
    return self

  #target
  def transform(self, test):
    counts_df = test.progress_apply(lambda x: get_counts(self.wag_groups, x["wagnum"], x["month"]), axis=1)
    counts_df = pd.concat([test, counts_df], axis=1)
    counts_df[["naliv_frac", "naval_frac", "onestype", "skoroport_frac", "twostype", "zerostype"]] = \
      counts_df[["naliv_frac", "naval_frac", "onestype", "skoroport_frac", "twostype", "zerostype"]].fillna(0.0)
    return counts_df

In [366]:
dan_transformer = CountsCargo()
freq_info = pd.read_parquet('./freight_info.parquet').convert_dtypes()
dan_transformer.fit(dislok, freq_info)
danf = dan_transformer.transform(targs)
danf = danf[['naliv_frac', 'naval_frac', 'onestype', 'skoroport_frac', 'twostype', 'zerostype']]

  0%|          | 0/33977 [00:00<?, ?it/s]

  0%|          | 0/237561 [00:00<?, ?it/s]

In [367]:
danf_test = dan_transformer.transform(test)
danf_test = danf_test[['naliv_frac', 'naval_frac', 'onestype', 'skoroport_frac', 'twostype', 'zerostype']]

  0%|          | 0/33707 [00:00<?, ?it/s]

In [368]:
danf_test

Unnamed: 0,naliv_frac,naval_frac,onestype,skoroport_frac,twostype,zerostype
0,0.0,0.841121,0.000000,0.0,0.122066,0.380282
1,0.0,0.898876,0.000000,0.0,0.089202,0.328638
2,0.0,0.560000,0.000000,0.0,0.220657,0.131455
3,0.0,0.440000,0.000000,0.0,0.206573,0.145540
4,0.0,0.635514,0.037559,0.0,0.286385,0.178404
...,...,...,...,...,...,...
33702,0.0,0.740000,0.000000,0.0,0.126761,0.342723
33703,0.0,0.709677,0.028475,0.0,0.208818,0.199326
33704,0.0,1.000000,0.000000,0.0,0.000000,0.347418
33705,0.0,1.000000,0.000000,0.0,0.107981,0.389671


#### Добавляем фичи из таблици wag_params

In [371]:
wag_params = pd.read_parquet('wag_params.parquet').convert_dtypes()

cols_to_drop = ['srok_sl', 'date_build', 'date_iskl']

wag_params = wag_params.drop(columns=cols_to_drop)

cat_feat = wag_params.columns[1:].values

params = targs.merge(wag_params, on = "wagnum", how = "left")[cat_feat].copy()

for p in cat_feat:
    params[p] = params[p].astype('category')

In [373]:
params_test = test.merge(wag_params, on = "wagnum", how = "left")[cat_feat].copy()

for p in cat_feat:
    params_test[p] = params_test[p].astype('category')

In [394]:
mf_train = pd.concat([mff, ef, eff, nerep, params, own_feat, trems_counts, danf, lastreptab], axis = 1)
mf_test = pd.concat([mff_test, ef_test, eff_test, nerep_test, params_test, own_feat_test, trems_counts_test, danf_test, lastreptab_test], axis = 1)

In [390]:
test = pd.read_csv('target/y_predict.csv')
mf_train = pd.concat([targs[["wagnum", "month"]], mf_train], axis = 1)
mf_test = pd.concat([test[["wagnum", "month"]], mf_test], axis = 1)

#### Далее мы строим финальное решение

#### В качестве алгоритма машинного обучения мы использовали бустинги из библиотеки LGBMClassifier, в финальном решение мы используем ансамбль из 7 бустингом с разными random_state, усредняя ответы для них

#### Подбор гиперпараметров происходит в следующей секции

In [488]:
y_pred_day = []
for i in range(7):
    model_day = LGBMClassifier(is_unbalance=True, num_threads = 8, force_row_wise=True, **{'n_estimators': 10000, 
     'learning_rate': 0.013404981239009836 / 2, 
     'min_split_gain': 0.03842333239584914, 
     'max_depth': -1, 
     'max_cat_threshold': 32, 
     'cat_l2': 68.60685058667403,
     'random_state': i})

    model_day.fit(mf_train, targs.target_day)
    y_pred_day.append(model_day.predict(mf_test))

y_pred_day = np.concatenate([i.reshape(-1,1) for i in y_pred_day], axis=1)



In [489]:
new_ans = y_pred_day.sum(axis=1).copy()
new_ans[y_pred_day.sum(axis=1).copy() > 3] = 1
new_ans[y_pred_day.sum(axis=1).copy() <= 3] = 0
y_pred_day = new_ans

In [None]:
y_pred_month = []
for i in range(7):
    model_month = LGBMClassifier(is_unbalance=True, num_threads = 8, force_row_wise=True, **{'n_estimators': 10000, 
     'learning_rate': 0.024922405732354058 / 2, 
     'min_split_gain': 0.03126564532038549, 
     'max_depth': 10, 
     'max_cat_threshold': 128, 
     'cat_l2': 55.69886937210401,
     'random_state': i})

    model_month.fit(mf_train, targs.target_month)
    y_pred_month.append(model_month.predict(mf_test))

y_pred_month = np.concatenate([i.reshape(-1,1) for i in y_pred_month], axis=1)



In [None]:
new_ans = y_pred_month.sum(axis=1).copy()
new_ans[y_pred_month.sum(axis=1).copy() > 3] = 1
new_ans[y_pred_month.sum(axis=1).copy() <= 3] = 0
y_pred_month = new_ans

In [None]:
submit = pd.read_csv('target/y_predict_submit_example.csv')
submit.target_month = y_pred_month
submit.target_day = y_pred_day
submit.to_csv('submit3.csv', index=False)

#### Здесь мы перебирали гиперпараметры с помощью optuna
#### В качестве валидационной выборки мы использовали данные за февраль

In [404]:
data_mask = feat.month.apply(lambda x:int(x[5:7])) != 8

In [None]:
train_mask = (feat.month.apply(lambda x:int(x[5:7]))!= 8)&(feat.month.apply(lambda x:int(x[5:7]))!= 2)
test_mask = feat.month.apply(lambda x:int(x[5:7])) == 2

In [None]:
X_train = mf[train_mask]
y_train = targs[train_mask].target_month

In [None]:
X_test = mf[test_mask]
y_test = targs[test_mask].target_month

In [408]:
X = mf[data_mask]
y = targs[data_mask].target_month.values

In [413]:
import optuna
import numpy as np
#from optuna.integration import LightGBMPruningCallback
from sklearn.model_selection import StratifiedKFold

def objective(trial):
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "n_estimators": trial.suggest_categorical("n_estimators", [500]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        #"num_leaves": trial.suggest_int("num_leaves", 20, 300, step=20),
        #"min_child_samples": trial.suggest_int("min_child_samples", 5, 100, step=5),
        #"lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        #"lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_split_gain": trial.suggest_float("min_split_gain", 0, 0.1),
        #"subsample": trial.suggest_float(
        #    "subsample", 0.2, 0.95, step=0.1
        #),
        #"subsample_freq": trial.suggest_categorical("subsample_freq", [1]),
        #"colsample_bytree": trial.suggest_float(
        #    "colsample_bytree", 0.2, 0.95, step=0.1
        #),
        "max_depth": trial.suggest_categorical("max_depth", [-1, 3, 4, 5, 6, 8, 10]),
        "max_cat_threshold": trial.suggest_categorical("max_cat_threshold", [8, 16, 32, 64, 128]), 
        "cat_l2": trial.suggest_float("cat_l2", 5.0, 100.0)
        
    }
    
    
    lgbm = LGBMClassifier(is_unbalance=True, num_threads = 6, force_row_wise=True, **param_grid)
    cvr = []
    
    for ti, di in StratifiedKFold(n_splits=5, shuffle=True).split(X, y):
        Xt = X.iloc[ti]
        #print(ti)
        yt = y[ti]
        
        Xd = X.iloc[di]
        yd = y[di]
        
        lgbm.fit(
            Xt,
            yt
        )
        

        #lgbm.fit(Xt, yt, sample_weight = sample_weights[ti])

        ypred = lgbm.predict(Xd)
        
        
        cvr.append(f1_score(yd, ypred))

        #print(confusion_matrix(yd, [int(x) if x>=0 else 0 for x in np.round(ypred)]))

    return np.mean(cvr)


study = optuna.create_study(direction="maximize", study_name="LGBM Classifier")
study.optimize(objective, n_trials=100)

[I 2023-11-12 07:54:56,033] A new study created in memory with name: LGBM Classifier




[I 2023-11-12 07:55:15,866] Trial 0 finished with value: 0.6860678544480908 and parameters: {'n_estimators': 500, 'learning_rate': 0.2424515371972734, 'min_split_gain': 0.02862180166588827, 'max_depth': 8, 'max_cat_threshold': 32, 'cat_l2': 51.395900406164415}. Best is trial 0 with value: 0.6860678544480908.




[I 2023-11-12 07:55:34,723] Trial 1 finished with value: 0.5841128933778891 and parameters: {'n_estimators': 500, 'learning_rate': 0.021264367707282423, 'min_split_gain': 0.05720465530652167, 'max_depth': 5, 'max_cat_threshold': 64, 'cat_l2': 56.976424177648845}. Best is trial 0 with value: 0.6860678544480908.




[I 2023-11-12 07:55:47,740] Trial 2 finished with value: 0.6305560569279554 and parameters: {'n_estimators': 500, 'learning_rate': 0.24400749018113124, 'min_split_gain': 0.01932492318276946, 'max_depth': 3, 'max_cat_threshold': 32, 'cat_l2': 87.52038471015027}. Best is trial 0 with value: 0.6860678544480908.




[I 2023-11-12 07:56:11,472] Trial 3 finished with value: 0.6653335735766988 and parameters: {'n_estimators': 500, 'learning_rate': 0.07359513773373297, 'min_split_gain': 0.07227081365075748, 'max_depth': 10, 'max_cat_threshold': 128, 'cat_l2': 38.378164154485425}. Best is trial 0 with value: 0.6860678544480908.




[I 2023-11-12 07:56:25,350] Trial 4 finished with value: 0.6394792899126136 and parameters: {'n_estimators': 500, 'learning_rate': 0.2485947758123082, 'min_split_gain': 0.04177050498346374, 'max_depth': 3, 'max_cat_threshold': 64, 'cat_l2': 65.19538928404138}. Best is trial 0 with value: 0.6860678544480908.




[I 2023-11-12 07:56:41,228] Trial 5 finished with value: 0.6475331194409701 and parameters: {'n_estimators': 500, 'learning_rate': 0.17464837156594812, 'min_split_gain': 0.016409189340524957, 'max_depth': 4, 'max_cat_threshold': 32, 'cat_l2': 16.12380233904613}. Best is trial 0 with value: 0.6860678544480908.




[I 2023-11-12 07:56:57,600] Trial 6 finished with value: 0.6422242651706542 and parameters: {'n_estimators': 500, 'learning_rate': 0.1190276629921124, 'min_split_gain': 0.0016256939086238154, 'max_depth': 4, 'max_cat_threshold': 64, 'cat_l2': 13.51802012369669}. Best is trial 0 with value: 0.6860678544480908.




[I 2023-11-12 07:57:11,886] Trial 7 finished with value: 0.6368735560335496 and parameters: {'n_estimators': 500, 'learning_rate': 0.18611162489328562, 'min_split_gain': 0.044772872781817685, 'max_depth': 3, 'max_cat_threshold': 128, 'cat_l2': 62.75539869525877}. Best is trial 0 with value: 0.6860678544480908.




[I 2023-11-12 07:57:33,090] Trial 8 finished with value: 0.6796406573930751 and parameters: {'n_estimators': 500, 'learning_rate': 0.16244452548145874, 'min_split_gain': 0.06352146415676417, 'max_depth': 8, 'max_cat_threshold': 16, 'cat_l2': 88.46044162808883}. Best is trial 0 with value: 0.6860678544480908.




[I 2023-11-12 07:57:53,967] Trial 9 finished with value: 0.6856237554901575 and parameters: {'n_estimators': 500, 'learning_rate': 0.23046899300222434, 'min_split_gain': 0.030822390957075587, 'max_depth': 8, 'max_cat_threshold': 32, 'cat_l2': 42.69189033278756}. Best is trial 0 with value: 0.6860678544480908.




[I 2023-11-12 07:58:13,917] Trial 10 finished with value: 0.6764533712159588 and parameters: {'n_estimators': 500, 'learning_rate': 0.2941185203028027, 'min_split_gain': 0.0956452978781506, 'max_depth': 6, 'max_cat_threshold': 8, 'cat_l2': 32.92757509858584}. Best is trial 0 with value: 0.6860678544480908.




[I 2023-11-12 07:58:34,326] Trial 11 finished with value: 0.6888316297879802 and parameters: {'n_estimators': 500, 'learning_rate': 0.23063435869415547, 'min_split_gain': 0.03260632043082122, 'max_depth': 8, 'max_cat_threshold': 32, 'cat_l2': 41.37900059325428}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 07:58:54,680] Trial 12 finished with value: 0.6849626040034172 and parameters: {'n_estimators': 500, 'learning_rate': 0.2926359600884957, 'min_split_gain': 0.028363515546753683, 'max_depth': -1, 'max_cat_threshold': 32, 'cat_l2': 50.711521872268435}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 07:59:14,876] Trial 13 finished with value: 0.6841891128487437 and parameters: {'n_estimators': 500, 'learning_rate': 0.21823064903933306, 'min_split_gain': 0.037897258354798125, 'max_depth': 8, 'max_cat_threshold': 32, 'cat_l2': 27.97267878949862}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 07:59:35,660] Trial 14 finished with value: 0.6860093148275033 and parameters: {'n_estimators': 500, 'learning_rate': 0.20293726140097793, 'min_split_gain': 0.011563838616216182, 'max_depth': 8, 'max_cat_threshold': 16, 'cat_l2': 5.066637859048264}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 07:59:55,749] Trial 15 finished with value: 0.67406560103313 and parameters: {'n_estimators': 500, 'learning_rate': 0.26031449361602577, 'min_split_gain': 0.03395454802980872, 'max_depth': 8, 'max_cat_threshold': 8, 'cat_l2': 72.8854748025808}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 08:00:16,342] Trial 16 finished with value: 0.686593762814422 and parameters: {'n_estimators': 500, 'learning_rate': 0.20277644951341064, 'min_split_gain': 0.04951630644370222, 'max_depth': -1, 'max_cat_threshold': 32, 'cat_l2': 51.76088143323739}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 08:00:36,649] Trial 17 finished with value: 0.6859036554002367 and parameters: {'n_estimators': 500, 'learning_rate': 0.2017576994517553, 'min_split_gain': 0.04897375619206084, 'max_depth': -1, 'max_cat_threshold': 32, 'cat_l2': 42.93801489319048}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 08:00:57,807] Trial 18 finished with value: 0.6828730661077103 and parameters: {'n_estimators': 500, 'learning_rate': 0.1350182486144956, 'min_split_gain': 0.05370974180250212, 'max_depth': -1, 'max_cat_threshold': 32, 'cat_l2': 75.87187171965351}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 08:01:16,451] Trial 19 finished with value: 0.642383648664115 and parameters: {'n_estimators': 500, 'learning_rate': 0.15551354693140068, 'min_split_gain': 0.06764018645036919, 'max_depth': 5, 'max_cat_threshold': 8, 'cat_l2': 99.83321887202999}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 08:01:37,824] Trial 20 finished with value: 0.6877750484863788 and parameters: {'n_estimators': 500, 'learning_rate': 0.2030751768275744, 'min_split_gain': 0.04559873397987039, 'max_depth': 10, 'max_cat_threshold': 128, 'cat_l2': 26.14741733214893}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 08:01:59,061] Trial 21 finished with value: 0.6836759352263269 and parameters: {'n_estimators': 500, 'learning_rate': 0.20693302026373703, 'min_split_gain': 0.04452090266177759, 'max_depth': 10, 'max_cat_threshold': 128, 'cat_l2': 26.259810951090653}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 08:02:20,811] Trial 22 finished with value: 0.6847343860640074 and parameters: {'n_estimators': 500, 'learning_rate': 0.1818316623297704, 'min_split_gain': 0.0549356554180636, 'max_depth': 10, 'max_cat_threshold': 128, 'cat_l2': 47.2785787872373}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 08:02:43,063] Trial 23 finished with value: 0.6826350930085188 and parameters: {'n_estimators': 500, 'learning_rate': 0.21908610926756217, 'min_split_gain': 0.037766315670612394, 'max_depth': 6, 'max_cat_threshold': 128, 'cat_l2': 35.4200114944217}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 08:03:03,716] Trial 24 finished with value: 0.685886055876051 and parameters: {'n_estimators': 500, 'learning_rate': 0.26899697785342425, 'min_split_gain': 0.023921686564139913, 'max_depth': 10, 'max_cat_threshold': 16, 'cat_l2': 25.752726082095936}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 08:03:25,097] Trial 25 finished with value: 0.6853720170173592 and parameters: {'n_estimators': 500, 'learning_rate': 0.22290156224298688, 'min_split_gain': 0.048254211395048166, 'max_depth': -1, 'max_cat_threshold': 128, 'cat_l2': 43.945484460564685}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 08:03:45,323] Trial 26 finished with value: 0.6866398622787038 and parameters: {'n_estimators': 500, 'learning_rate': 0.19221219615912033, 'min_split_gain': 0.03816178636930292, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 56.66777910677314}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 08:04:06,236] Trial 27 finished with value: 0.6850527950692663 and parameters: {'n_estimators': 500, 'learning_rate': 0.1796900247237887, 'min_split_gain': 0.03538123957254733, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 37.06452954049576}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 08:04:27,976] Trial 28 finished with value: 0.6883363341473114 and parameters: {'n_estimators': 500, 'learning_rate': 0.2668739132273037, 'min_split_gain': 0.024860349482894346, 'max_depth': 10, 'max_cat_threshold': 128, 'cat_l2': 57.86025358121932}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 08:04:49,700] Trial 29 finished with value: 0.6850963456979773 and parameters: {'n_estimators': 500, 'learning_rate': 0.2715328051527559, 'min_split_gain': 0.026638076293729412, 'max_depth': 10, 'max_cat_threshold': 128, 'cat_l2': 45.50685729096079}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 08:05:11,523] Trial 30 finished with value: 0.6849429259414302 and parameters: {'n_estimators': 500, 'learning_rate': 0.23351746772294044, 'min_split_gain': 0.022321699251720773, 'max_depth': 10, 'max_cat_threshold': 128, 'cat_l2': 31.12953399243125}. Best is trial 11 with value: 0.6888316297879802.




[I 2023-11-12 08:05:32,758] Trial 31 finished with value: 0.6908604431032936 and parameters: {'n_estimators': 500, 'learning_rate': 0.24922405732354058, 'min_split_gain': 0.03126564532038549, 'max_depth': 10, 'max_cat_threshold': 128, 'cat_l2': 55.69886937210401}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:05:54,125] Trial 32 finished with value: 0.6847464492024867 and parameters: {'n_estimators': 500, 'learning_rate': 0.25182191217578437, 'min_split_gain': 0.0304949725012875, 'max_depth': 10, 'max_cat_threshold': 128, 'cat_l2': 54.40352304481588}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:06:14,944] Trial 33 finished with value: 0.6847661775599188 and parameters: {'n_estimators': 500, 'learning_rate': 0.2480624100726809, 'min_split_gain': 0.01565288932844889, 'max_depth': 5, 'max_cat_threshold': 128, 'cat_l2': 57.40972683401317}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:06:35,761] Trial 34 finished with value: 0.686453917660242 and parameters: {'n_estimators': 500, 'learning_rate': 0.27474266717793877, 'min_split_gain': 0.024337668468409443, 'max_depth': 10, 'max_cat_threshold': 128, 'cat_l2': 39.45242853221886}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:06:55,883] Trial 35 finished with value: 0.6903773532478027 and parameters: {'n_estimators': 500, 'learning_rate': 0.23522873793241608, 'min_split_gain': 0.03308027298166509, 'max_depth': 10, 'max_cat_threshold': 64, 'cat_l2': 49.30582178324301}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:07:12,225] Trial 36 finished with value: 0.6663331004449585 and parameters: {'n_estimators': 500, 'learning_rate': 0.24397406977727487, 'min_split_gain': 0.03306961481297351, 'max_depth': 4, 'max_cat_threshold': 64, 'cat_l2': 49.86513637883123}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:07:25,581] Trial 37 finished with value: 0.6372290030065869 and parameters: {'n_estimators': 500, 'learning_rate': 0.23578214480494486, 'min_split_gain': 0.018447140999849045, 'max_depth': 3, 'max_cat_threshold': 64, 'cat_l2': 61.941990284141404}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:07:45,657] Trial 38 finished with value: 0.6854224250422934 and parameters: {'n_estimators': 500, 'learning_rate': 0.27863343227517356, 'min_split_gain': 0.0405731757133321, 'max_depth': 6, 'max_cat_threshold': 64, 'cat_l2': 48.80348010503195}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:08:05,581] Trial 39 finished with value: 0.6809370248370659 and parameters: {'n_estimators': 500, 'learning_rate': 0.2535932361319173, 'min_split_gain': 0.010524342503326432, 'max_depth': 10, 'max_cat_threshold': 64, 'cat_l2': 57.52026771018552}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:08:25,437] Trial 40 finished with value: 0.6861363901837978 and parameters: {'n_estimators': 500, 'learning_rate': 0.28468040621751023, 'min_split_gain': 0.028764077937874104, 'max_depth': 8, 'max_cat_threshold': 64, 'cat_l2': 64.81936510731106}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:08:46,284] Trial 41 finished with value: 0.6849564924099454 and parameters: {'n_estimators': 500, 'learning_rate': 0.2630018297129145, 'min_split_gain': 0.042713330381671785, 'max_depth': 10, 'max_cat_threshold': 128, 'cat_l2': 38.77774417877672}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:09:07,833] Trial 42 finished with value: 0.6865713264703474 and parameters: {'n_estimators': 500, 'learning_rate': 0.23366670391613745, 'min_split_gain': 0.034296996394270474, 'max_depth': 10, 'max_cat_threshold': 128, 'cat_l2': 51.515775794964824}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:09:25,436] Trial 43 finished with value: 0.6739807879727541 and parameters: {'n_estimators': 500, 'learning_rate': 0.29937087400462914, 'min_split_gain': 0.02650071865976559, 'max_depth': 4, 'max_cat_threshold': 128, 'cat_l2': 42.04013470764524}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:09:46,038] Trial 44 finished with value: 0.6853424558141417 and parameters: {'n_estimators': 500, 'learning_rate': 0.258669642450316, 'min_split_gain': 0.04225927472066242, 'max_depth': 10, 'max_cat_threshold': 16, 'cat_l2': 46.97873205933173}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:09:59,100] Trial 45 finished with value: 0.5964419383660295 and parameters: {'n_estimators': 500, 'learning_rate': 0.22005626360630157, 'min_split_gain': 0.021634375337918357, 'max_depth': 3, 'max_cat_threshold': 8, 'cat_l2': 60.53187757356105}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:10:18,731] Trial 46 finished with value: 0.6842572693265572 and parameters: {'n_estimators': 500, 'learning_rate': 0.24415306709113882, 'min_split_gain': 0.03033694971000647, 'max_depth': 5, 'max_cat_threshold': 64, 'cat_l2': 66.98535730841117}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:10:40,096] Trial 47 finished with value: 0.6837970401013098 and parameters: {'n_estimators': 500, 'learning_rate': 0.28309599963649246, 'min_split_gain': 0.03746490295254025, 'max_depth': 8, 'max_cat_threshold': 128, 'cat_l2': 54.08378351841454}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:11:01,136] Trial 48 finished with value: 0.6848375045392261 and parameters: {'n_estimators': 500, 'learning_rate': 0.23325467607171008, 'min_split_gain': 0.04554216295482416, 'max_depth': 10, 'max_cat_threshold': 128, 'cat_l2': 31.64065838786004}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:11:21,758] Trial 49 finished with value: 0.6831696465176165 and parameters: {'n_estimators': 500, 'learning_rate': 0.21142650000055518, 'min_split_gain': 0.03160404502326519, 'max_depth': 8, 'max_cat_threshold': 64, 'cat_l2': 23.143621119195824}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:11:43,547] Trial 50 finished with value: 0.6822607641170617 and parameters: {'n_estimators': 500, 'learning_rate': 0.16730832795858858, 'min_split_gain': 0.05243885783092306, 'max_depth': 10, 'max_cat_threshold': 16, 'cat_l2': 41.50867957028911}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:12:03,931] Trial 51 finished with value: 0.6870804312771306 and parameters: {'n_estimators': 500, 'learning_rate': 0.1965417624180153, 'min_split_gain': 0.03928705091633662, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 54.176400173285195}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:12:23,832] Trial 52 finished with value: 0.6835125859160193 and parameters: {'n_estimators': 500, 'learning_rate': 0.1900065376707153, 'min_split_gain': 0.04010570373263653, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 52.033199169494395}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:12:43,469] Trial 53 finished with value: 0.6890083267964284 and parameters: {'n_estimators': 500, 'learning_rate': 0.22609850714123206, 'min_split_gain': 0.035016698890970636, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 47.3158103097262}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:13:03,152] Trial 54 finished with value: 0.6872062260071989 and parameters: {'n_estimators': 500, 'learning_rate': 0.22452416149779536, 'min_split_gain': 0.03455535951854757, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 47.1980278431471}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:13:22,871] Trial 55 finished with value: 0.6850754290068245 and parameters: {'n_estimators': 500, 'learning_rate': 0.21303614326123163, 'min_split_gain': 0.026595222755241066, 'max_depth': 6, 'max_cat_threshold': 32, 'cat_l2': 34.07444635735972}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:13:42,549] Trial 56 finished with value: 0.6748240719106449 and parameters: {'n_estimators': 500, 'learning_rate': 0.2652735061860572, 'min_split_gain': 0.046996165711338196, 'max_depth': 8, 'max_cat_threshold': 8, 'cat_l2': 44.81699819463889}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:13:58,216] Trial 57 finished with value: 0.6596890093709031 and parameters: {'n_estimators': 500, 'learning_rate': 0.24574007324799108, 'min_split_gain': 0.030448216767598985, 'max_depth': 4, 'max_cat_threshold': 32, 'cat_l2': 38.48381521242359}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:14:19,508] Trial 58 finished with value: 0.6870879651684995 and parameters: {'n_estimators': 500, 'learning_rate': 0.2552372109978243, 'min_split_gain': 0.035611754153855875, 'max_depth': 10, 'max_cat_threshold': 128, 'cat_l2': 49.18209189309184}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:14:32,597] Trial 59 finished with value: 0.6246958621354206 and parameters: {'n_estimators': 500, 'learning_rate': 0.21169794470823253, 'min_split_gain': 0.044813256928213696, 'max_depth': 3, 'max_cat_threshold': 32, 'cat_l2': 59.73437816558882}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:14:54,063] Trial 60 finished with value: 0.6838698695179044 and parameters: {'n_estimators': 500, 'learning_rate': 0.22576546461823302, 'min_split_gain': 0.019466482706143963, 'max_depth': 10, 'max_cat_threshold': 128, 'cat_l2': 42.737716775235825}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:15:14,772] Trial 61 finished with value: 0.6890726944371459 and parameters: {'n_estimators': 500, 'learning_rate': 0.2254252911931832, 'min_split_gain': 0.033889540314170116, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 47.80815785855845}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:15:34,802] Trial 62 finished with value: 0.6875382221867425 and parameters: {'n_estimators': 500, 'learning_rate': 0.23949808302033038, 'min_split_gain': 0.03283951796337265, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 45.43143860168316}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:15:55,164] Trial 63 finished with value: 0.68777687575231 and parameters: {'n_estimators': 500, 'learning_rate': 0.22796153439126915, 'min_split_gain': 0.04155398765340046, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 55.507373601141}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:16:15,423] Trial 64 finished with value: 0.6884398910756395 and parameters: {'n_estimators': 500, 'learning_rate': 0.26270849426482606, 'min_split_gain': 0.028289323238869363, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 55.84510304025694}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:16:34,953] Trial 65 finished with value: 0.6831991903238717 and parameters: {'n_estimators': 500, 'learning_rate': 0.2652615636413784, 'min_split_gain': 0.02778458463458888, 'max_depth': 5, 'max_cat_threshold': 32, 'cat_l2': 51.92029926018643}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:16:55,440] Trial 66 finished with value: 0.6691687010471652 and parameters: {'n_estimators': 500, 'learning_rate': 0.2873595331546329, 'min_split_gain': 0.02421493309308701, 'max_depth': -1, 'max_cat_threshold': 32, 'cat_l2': 58.40866743025603}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:17:15,516] Trial 67 finished with value: 0.689001962538061 and parameters: {'n_estimators': 500, 'learning_rate': 0.24982572609502535, 'min_split_gain': 0.028390983718812837, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 49.428529443848134}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:17:35,837] Trial 68 finished with value: 0.6882819831066217 and parameters: {'n_estimators': 500, 'learning_rate': 0.2511841039468706, 'min_split_gain': 0.03670654256701842, 'max_depth': 8, 'max_cat_threshold': 32, 'cat_l2': 48.93797049374438}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:17:55,940] Trial 69 finished with value: 0.6900588543139661 and parameters: {'n_estimators': 500, 'learning_rate': 0.2399922188126666, 'min_split_gain': 0.028885630081828897, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 40.90237389807658}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:18:15,893] Trial 70 finished with value: 0.6860018214576069 and parameters: {'n_estimators': 500, 'learning_rate': 0.24257404117723202, 'min_split_gain': 0.03011226072914755, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 35.58907701211593}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:18:35,692] Trial 71 finished with value: 0.6882547226083615 and parameters: {'n_estimators': 500, 'learning_rate': 0.25773113186662594, 'min_split_gain': 0.03309474796663936, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 46.330390941101314}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:18:55,296] Trial 72 finished with value: 0.6873509434473166 and parameters: {'n_estimators': 500, 'learning_rate': 0.27403639116951295, 'min_split_gain': 0.02799631731259533, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 40.21887889296812}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:19:14,943] Trial 73 finished with value: 0.6871444347774278 and parameters: {'n_estimators': 500, 'learning_rate': 0.23806752764245073, 'min_split_gain': 0.021224397184882008, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 43.934588310375446}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:19:34,537] Trial 74 finished with value: 0.6887672140015644 and parameters: {'n_estimators': 500, 'learning_rate': 0.21892128848636672, 'min_split_gain': 0.03801286735873887, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 54.4609261815213}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:19:54,448] Trial 75 finished with value: 0.6866697593777125 and parameters: {'n_estimators': 500, 'learning_rate': 0.22987966414697378, 'min_split_gain': 0.03799102676781065, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 49.77830324712888}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:20:14,304] Trial 76 finished with value: 0.685071573765684 and parameters: {'n_estimators': 500, 'learning_rate': 0.21908875892284543, 'min_split_gain': 0.035317972374839496, 'max_depth': 6, 'max_cat_threshold': 32, 'cat_l2': 52.15649981666092}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:20:34,620] Trial 77 finished with value: 0.6870646289451706 and parameters: {'n_estimators': 500, 'learning_rate': 0.2011011320026631, 'min_split_gain': 0.0327480885017443, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 40.79931283864566}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:20:55,307] Trial 78 finished with value: 0.6727022345418903 and parameters: {'n_estimators': 500, 'learning_rate': 0.21424566299721892, 'min_split_gain': 0.04011040562785125, 'max_depth': 8, 'max_cat_threshold': 8, 'cat_l2': 47.7758009638346}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:21:16,396] Trial 79 finished with value: 0.6851011539545164 and parameters: {'n_estimators': 500, 'learning_rate': 0.2497142460869285, 'min_split_gain': 0.03647413965834458, 'max_depth': -1, 'max_cat_threshold': 16, 'cat_l2': 36.55047947728208}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:21:32,265] Trial 80 finished with value: 0.6660251246882829 and parameters: {'n_estimators': 500, 'learning_rate': 0.23761502391676564, 'min_split_gain': 0.04315390970424694, 'max_depth': 4, 'max_cat_threshold': 64, 'cat_l2': 43.80775100403931}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:21:52,551] Trial 81 finished with value: 0.6872810042687784 and parameters: {'n_estimators': 500, 'learning_rate': 0.2291409655820343, 'min_split_gain': 0.02860359049807236, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 55.07193464842077}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:22:12,626] Trial 82 finished with value: 0.6833095638262108 and parameters: {'n_estimators': 500, 'learning_rate': 0.2587821870656838, 'min_split_gain': 0.025457271353127902, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 54.03121056407595}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:22:32,213] Trial 83 finished with value: 0.6858704511218432 and parameters: {'n_estimators': 500, 'learning_rate': 0.24529907600948683, 'min_split_gain': 0.03138935156517045, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 50.334511769826}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:22:51,892] Trial 84 finished with value: 0.6890096739824143 and parameters: {'n_estimators': 500, 'learning_rate': 0.2712083045394424, 'min_split_gain': 0.028802642055411463, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 56.821912025413084}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:23:11,359] Trial 85 finished with value: 0.6850388174782395 and parameters: {'n_estimators': 500, 'learning_rate': 0.22249693558832812, 'min_split_gain': 0.023928406853461308, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 45.46155257300546}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:23:24,354] Trial 86 finished with value: 0.6252850090710655 and parameters: {'n_estimators': 500, 'learning_rate': 0.206363334952048, 'min_split_gain': 0.0340967331014372, 'max_depth': 3, 'max_cat_threshold': 32, 'cat_l2': 47.899955478089836}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:23:44,352] Trial 87 finished with value: 0.6843998569078256 and parameters: {'n_estimators': 500, 'learning_rate': 0.27709193848636127, 'min_split_gain': 0.031282583776497515, 'max_depth': 10, 'max_cat_threshold': 64, 'cat_l2': 52.907085080155696}. Best is trial 31 with value: 0.6908604431032936.




[I 2023-11-12 08:24:03,182] Trial 88 finished with value: 0.6783074742883776 and parameters: {'n_estimators': 500, 'learning_rate': 0.23778628667105395, 'min_split_gain': 0.03863277641214161, 'max_depth': 5, 'max_cat_threshold': 32, 'cat_l2': 60.726914347843845}. Best is trial 31 with value: 0.6908604431032936.




[W 2023-11-12 08:24:15,286] Trial 89 failed with parameters: {'n_estimators': 500, 'learning_rate': 0.27093410663966183, 'min_split_gain': 0.035938970211483005, 'max_depth': 10, 'max_cat_threshold': 32, 'cat_l2': 40.637649698069765} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Anaconda3\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Егор\AppData\Local\Temp\ipykernel_14824\4251613687.py", line 33, in objective
    for ti, di in StratifiedKFold(n_splits=5, shuffle=True).split(X, y):
  File "C:\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py", line 352, in split
    for train, test in super().split(X, y, groups):
  File "C:\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py", line 87, in split
    test_index = indices[test_index]
KeyboardInterrupt
[W 2023-11-12 08:24:15,309] Trial 89 failed with value None.


KeyboardInterrupt: 

In [268]:
study.best_params

{'n_estimators': 500,
 'learning_rate': 0.1653879638699648,
 'num_leaves': 60,
 'max_depth': 8,
 'max_cat_threshold': 32,
 'cat_l2': 25.2041278553189}

In [310]:
lgbm = LGBMClassifier(is_unbalance=True, num_threads = 8, force_row_wise=True, seed = 77, **{'n_estimators': 500, 
 'learning_rate': 0.13404981239009836, 
 'min_split_gain': 0.03842333239584914, 
 'max_depth': -1, 
 'max_cat_threshold': 32, 
 'cat_l2': 68.60685058667403})

lgbm.fit(X_train, y_train)

In [260]:
y_pred = lgbm.predict(X_test)

In [311]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred)

0.5083333333333333

In [228]:
sorted(list(zip(X_train.columns, lgbm.feature_importances_)), key=lambda x: x[1])[::-1]

[('probeg', 85000),
 ('time_before_srok_sl', 73727),
 ('age', 70465),
 ('under_load_percent', 66132),
 ('last_month_std', 62623),
 ('under_load_dist', 53323),
 ('time_before_remont', 49907),
 ('nearrepair', 41690),
 ('under_load_days', 39959),
 ('last_month_run', 36222),
 ('last_month_mean', 32189),
 ('cur_st_send', 28207),
 ('num_small_repairs', 13870),
 ('cur_st_dest', 10195),
 ('tormoz', 6178),
 ('kod_remont', 5781),
 ('kuzov', 5037),
 ('tara', 3113),
 ('ownertype', 2773),
 ('manage_type', 2388),
 ('rod_id', 1572),
 ('stat_remont', 1437),
 ('kod_vrab_tr_rems', 1173),
 ('model', 1081),
 ('zavod_build', 455),
 ('cnsi_probeg_kr', 204),
 ('tipvozd', 155),
 ('tippogl', 134),
 ('gruz', 10),
 ('norma_km', 0),
 ('telega', 0),
 ('cnsi_probeg_dr', 0),
 ('cnsi_volumek', 0),
 ('cnsi_gruz_capacity', 0)]