In [1]:
import re
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer

def _safe_div(a, b):
    return np.where(b != 0, a / b, 0.0)

def entropy_from_counts(cnts: np.ndarray) -> float:
    """Shannon entropy (natural logarithm) of the distribution implied by ``cnts``.

    Counts are normalised by their sum; zero-probability entries are skipped so
    that ``log(0)`` is never evaluated. A non-positive total gives 0.0.
    """
    mass = cnts.sum()
    if mass <= 0:
        return 0.0
    probs = cnts / mass
    support = probs[probs > 0]
    weighted_logs = support * np.log(support)
    return float(-weighted_logs.sum())

def hhi_from_counts(cnts: np.ndarray) -> float:
    """Herfindahl-Hirschman index: the sum of squared shares of ``cnts``.

    Returns 0.0 when the counts do not add up to a positive total.
    """
    total = cnts.sum()
    if total <= 0:
        return 0.0
    shares = cnts / total
    return float(np.square(shares).sum())

def gini(x: np.ndarray) -> float:
    """Gini coefficient of the absolute values of ``x``.

    An empty input, or one whose entries are all zero, yields 0.0.
    """
    values = np.asarray(x, dtype=np.float64)
    if values.size == 0 or not values.any():
        return 0.0
    magnitudes = np.abs(values)
    magnitudes.sort()
    count = magnitudes.size
    running = magnitudes.cumsum()
    # Sum of the running totals relative to the grand total.
    lorenz_sum = running.sum() / running[-1]
    return float((count + 1 - 2 * lorenz_sum) / count)

def as_float32(df):
    """Downcast numeric columns of ``df`` in place and return the same frame.

    Float columns become float32; integer columns become int32, except the
    ``cl_id`` column, which keeps its dtype. Other columns are left alone.
    """
    for col in df.columns:
        column = df[col]
        if pd.api.types.is_float_dtype(column):
            target = np.float32
        elif col != 'cl_id' and pd.api.types.is_integer_dtype(column):
            target = np.int32
        else:
            continue
        df[col] = column.astype(target)
    return df

# English three-letter month abbreviations -> month number.
_MONTHS = {'JAN':1,'FEB':2,'MAR':3,'APR':4,'MAY':5,'JUN':6,'JUL':7,'AUG':8,'SEP':9,'OCT':10,'NOV':11,'DEC':12}
# Matches e.g. "21OCT17:13:45:00": day, 3-letter month, 2-digit year and up to
# three optional ":NN" groups (hour, minute, second); case-insensitive.
_trxdt_re = re.compile(r'(?i)^\s*(\d{1,2})([A-Z]{3})(\d{2})(?::(\d{2}))?(?::(\d{2}))?(?::(\d{2}))?\s*$')

def parse_trdatetime(s: str):
    """Parse one raw ``TRDATETIME`` value.

    Parameters
    ----------
    s : str
        Raw value, e.g. ``"21OCT17:00:00:00"``. Missing values are accepted.

    Returns
    -------
    datetime.datetime | pandas.Timestamp | pandas.NaT
        ``datetime`` for strings in the ``DDMONYY[:HH[:MM[:SS]]]`` layout,
        whatever ``pd.to_datetime(..., errors='coerce', dayfirst=True)`` gives
        for any other layout, and ``pd.NaT`` for missing or invalid input.
    """
    if pd.isna(s):
        return pd.NaT
    s = str(s).strip()
    m = _trxdt_re.match(s)
    if m is None:
        # Not the compact bank layout: let pandas try its generic parser.
        return pd.to_datetime(s, errors='coerce', dayfirst=True, utc=False)

    day, mon, yy, hh, mm, ss = m.groups()
    month = _MONTHS.get(mon.upper())
    if month is None:
        # Unknown month abbreviation: report "no date" rather than silently
        # pretending the transaction happened in January.
        return pd.NaT

    yy = int(yy)
    # Two-digit year pivot: 00-69 -> 2000-2069, 70-99 -> 1970-1999.
    year = 2000 + yy if yy <= 69 else 1900 + yy
    try:
        return datetime(
            year, month, int(day),
            int(hh) if hh else 0,
            int(mm) if mm else 0,
            int(ss) if ss else 0,
        )
    except ValueError:
        # Impossible calendar/clock values such as 31FEB or hour 25.
        return pd.NaT

def _days_from_origin(dt_series: pd.Series, origin):
    origin = pd.to_datetime(origin)
    return (pd.to_datetime(dt_series) - origin).dt.days.astype('Int64')

# Upper-cased trx_category values that are counted as incoming money.
INCOME_CATS = {
    'DEPOSIT', 'SALARY',
    'C2C_IN', 'C2A_IN', 'P2P_IN',
    'REVERSAL_IN', 'REFUND', 'CREDITS_IN',
}
# Upper-cased trx_category values that are counted as outgoing money.
EXPENSE_CATS = {
    'POS', 'CASH',
    'WD_ATM_ROS', 'WD_ATM_IN', 'WD_ATM',
    'C2C_OUT', 'C2A_OUT', 'P2P_OUT',
    'COMMISSIONS', 'FEES',
}


class FeatureFactoryRosbank:
    def __init__(
        self,
        topk_mcc: int = 300,
        topk_trxcat: int = 50,
        topk_channel: int = 10,
        tfidf_max_features_mcc: int = 800,
        tfidf_max_features_trx: int = 300,
        use_dense_tfidf: bool = True,
        fixed_origin: str | None = None,
    ):
        self.topk_mcc = topk_mcc
        self.topk_trxcat = topk_trxcat
        self.topk_channel = topk_channel
        self.tfidf_max_features_mcc = tfidf_max_features_mcc
        self.tfidf_max_features_trx = tfidf_max_features_trx
        self.use_dense_tfidf = use_dense_tfidf
        self.fixed_origin = fixed_origin

        self.origin_ = None
        self.channel_map_ = None
        self.trxcat_map_ = None
        self.top_mcc_ = None
        self.top_trx_ = None
        self.top_chan_ = None
        self.tfidf_mcc_ = None
        self.tfidf_trx_ = None

    def _preprocess(self, df: pd.DataFrame, is_fit: bool):
        need_cols = {'cl_id','MCC','channel_type','currency','TRDATETIME','amount','trx_category'}
        missing = need_cols - set(df.columns)
        if missing:
            raise KeyError(f"Отсутствуют колонки: {missing}")

        d = df.copy()

        trdt = d['TRDATETIME'].apply(parse_trdatetime)
        d['trx_ts'] = pd.to_datetime(trdt)
        d = d[~d['trx_ts'].isna()].reset_index(drop=True)

        if is_fit:
            self.origin_ = pd.to_datetime(self.fixed_origin) if self.fixed_origin else pd.to_datetime(d['trx_ts'].min())

        d['trans_date'] = _days_from_origin(d['trx_ts'], self.origin_).fillna(0).astype(np.int32)

        d['cl_id']   = pd.to_numeric(d['cl_id'], errors='coerce').fillna(-1).astype(np.int32)
        d['MCC']     = pd.to_numeric(d['MCC'], errors='coerce').fillna(-1).astype(np.int32)
        d['currency']= pd.to_numeric(d['currency'], errors='coerce').fillna(-1).astype(np.int32)
        d['amount']  = pd.to_numeric(d['amount'], errors='coerce').fillna(0.0).astype(np.float32)

        if is_fit:
            chan_uniques = pd.Series(d['channel_type'].astype(str).fillna('unknown')).unique().tolist()
            trx_uniques  = pd.Series(d['trx_category'].astype(str).fillna('unknown')).unique().tolist()
            self.channel_map_ = {cat: i for i, cat in enumerate(chan_uniques)}
            self.trxcat_map_  = {cat: i for i, cat in enumerate(trx_uniques)}

        d['channel_code'] = pd.Series([self.channel_map_.get(str(x), -1) for x in d['channel_type']]).astype(np.int32)
        d['trxcat_code']  = pd.Series([self.trxcat_map_.get(str(x), -1) for x in d['trx_category']]).astype(np.int32)

        trx_upper = d['trx_category'].astype(str).str.upper()
        d['is_income']  = trx_upper.isin(INCOME_CATS).astype(np.int8)
        d['is_expense'] = trx_upper.isin(EXPENSE_CATS).astype(np.int8)
        d['amount_income']  = d['amount'] * d['is_income']
        d['amount_expense'] = d['amount'] * d['is_expense']

        return d

    def fit(self, df: pd.DataFrame):
        d = self._preprocess(df, is_fit=True)

        self.top_mcc_  = d['MCC'].value_counts().head(self.topk_mcc).index.tolist()
        self.top_trx_  = d['trxcat_code'].value_counts().head(self.topk_trxcat).index.tolist()
        self.top_chan_ = d['channel_code'].value_counts().head(self.topk_channel).index.tolist()

        seq_mcc = (
            d.sort_values(['cl_id','trans_date'])
             .groupby('cl_id')['MCC']
             .apply(lambda s: ' '.join(s.astype(str)))
        )
        self.tfidf_mcc_ = TfidfVectorizer(
            max_features=self.tfidf_max_features_mcc,
            dtype=np.float32,
            token_pattern=r'(?u)\b\w+\b'
        )
        self.tfidf_mcc_.fit(seq_mcc.values)

        seq_trx = (
            d.sort_values(['cl_id','trans_date'])
             .groupby('cl_id')['trxcat_code']
             .apply(lambda s: ' '.join(s.astype(str)))
        )
        self.tfidf_trx_ = TfidfVectorizer(
            max_features=self.tfidf_max_features_trx,
            dtype=np.float32,
            token_pattern=r'(?u)\b\w+\b'
        )
        self.tfidf_trx_.fit(seq_trx.values)

        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        if any(x is None for x in [self.origin_, self.channel_map_, self.trxcat_map_, self.top_mcc_, self.top_trx_, self.top_chan_, self.tfidf_mcc_, self.tfidf_trx_]):
            raise RuntimeError(".fit() firstly")

        d = self._preprocess(df, is_fit=False)

        agg = d.groupby('cl_id').agg(
            tx_cnt=('amount','size'),
            sum_amount=('amount','sum'),
            mean_amount=('amount','mean'),
            std_amount=('amount','std'),
            min_amount=('amount','min'),
            max_amount=('amount','max'),
            q25_amount=('amount', lambda x: np.percentile(x, 25)),
            q50_amount=('amount', lambda x: np.percentile(x, 50)),
            q75_amount=('amount', lambda x: np.percentile(x, 75)),
            sum_income=('amount_income','sum'),
            sum_expense=('amount_expense','sum'),
            cnt_income=('is_income','sum'),
            cnt_expense=('is_expense','sum'),
            first_day=('trans_date','min'),
            last_day=('trans_date','max'),
            active_days=('trans_date','nunique'),
            uniq_mcc=('MCC','nunique'),
            uniq_trxcat=('trxcat_code','nunique'),
            uniq_channel=('channel_code','nunique'),
            uniq_currency=('currency','nunique'),
        ).reset_index()

        agg['tx_cnt'] = agg['tx_cnt'].astype(np.int32)
        agg['period'] = (agg['last_day'] - agg['first_day'] + 1).astype(np.int32)
        agg['tx_per_day'] = _safe_div(agg['tx_cnt'].astype(np.float32), agg['period'].astype(np.float32))
        agg['days_ratio'] = _safe_div(agg['active_days'].astype(np.float32), agg['period'].astype(np.float32))
        agg['income_share']  = _safe_div(agg['sum_income'], agg['sum_amount'] + 1e-9)
        agg['expense_share'] = _safe_div(agg['sum_expense'], agg['sum_amount'] + 1e-9)

        day_agg = (
            d.groupby(['cl_id','trans_date'])
             .agg(day_sum=('amount','sum'),
                  day_cnt=('amount','size'),
                  day_income=('amount_income','sum'),
                  day_expense=('amount_expense','sum'))
             .reset_index()
        )
        daily = day_agg.groupby('cl_id').agg(
            day_sum_mean=('day_sum','mean'),
            day_sum_std =('day_sum','std'),
            day_cnt_mean=('day_cnt','mean'),
            day_cnt_std =('day_cnt','std'),
            day_sum_q90 =('day_sum', lambda x: np.percentile(x, 90)),
            day_income_mean=('day_income','mean'),
            day_expense_mean=('day_expense','mean'),
        ).reset_index()

        def _gap_stats(s):
            v = np.sort(s.values)
            if v.size <= 1:
                return pd.Series(dict(gap_mean=0.0, gap_median=0.0))
            gaps = np.diff(v)
            return pd.Series(dict(gap_mean=float(gaps.mean()), gap_median=float(np.median(gaps))))
        gaps = d.groupby('cl_id')['trans_date'].apply(_gap_stats).reset_index()
        gaps = gaps.pivot(index='cl_id', columns='level_1', values='trans_date').reset_index()

        def _dist_stats(df_g, col, prefix):
            cnt = df_g.groupby(['cl_id', col]).size().rename('cnt').reset_index()
            g = cnt.groupby('cl_id')['cnt']
            stats = pd.DataFrame({
                f'{prefix}_entropy': g.apply(lambda s: entropy_from_counts(s.to_numpy())),
                f'{prefix}_hhi':     g.apply(lambda s: hhi_from_counts(s.to_numpy())),
                f'{prefix}_top1_share': g.apply(lambda s: float(s.max() / s.sum()) if s.sum() > 0 else 0.0),
            }).reset_index()
            return stats

        mcc_stats   = _dist_stats(d, 'MCC', 'mcc')
        trxcat_stats= _dist_stats(d, 'trxcat_code', 'trxcat')
        chan_stats  = _dist_stats(d, 'channel_code', 'chan')

        def _top_pivots_fixed(df_g, col, top_vals, prefix, val_col='amount'):
            mask = df_g[col].isin(top_vals)
            p_cnt = df_g[mask].groupby(['cl_id', col]).size().unstack(fill_value=0)
            p_cnt = p_cnt.reindex(columns=top_vals, fill_value=0).add_prefix(f'{prefix}cnt_').reset_index()

            p_sum = df_g[mask].groupby(['cl_id', col])[val_col].sum().unstack(fill_value=0.0)
            p_sum = p_sum.reindex(columns=top_vals, fill_value=0.0).add_prefix(f'{prefix}sum_').reset_index()

            p_mean = df_g[mask].groupby(['cl_id', col])[val_col].mean().unstack()
            p_mean = p_mean.reindex(columns=top_vals).fillna(0.0).add_prefix(f'{prefix}mean_').reset_index()
            return p_cnt, p_sum, p_mean

        mcc_cnt, mcc_sum, mcc_mean    = _top_pivots_fixed(d, 'MCC',          self.top_mcc_,  'mcc_')
        trx_cnt, trx_sum, trx_mean    = _top_pivots_fixed(d, 'trxcat_code',  self.top_trx_,  'trx_')
        chan_cnt, chan_sum, chan_mean = _top_pivots_fixed(d, 'channel_code', self.top_chan_, 'chan_')

        seq_mcc = (
            d.sort_values(['cl_id','trans_date'])
             .groupby('cl_id')['MCC']
             .apply(lambda s: ' '.join(s.astype(str)))
        )
        X_mcc = self.tfidf_mcc_.transform(seq_mcc.values)
        mcc_cols = [f'tfidf_mcc_{t}' for t in self.tfidf_mcc_.get_feature_names_out()]
        mcc_df = pd.DataFrame(X_mcc.toarray(), columns=mcc_cols)
        mcc_df.insert(0, 'cl_id', seq_mcc.index.values.astype(np.int32))

        seq_trx = (
            d.sort_values(['cl_id','trans_date'])
             .groupby('cl_id')['trxcat_code']
             .apply(lambda s: ' '.join(s.astype(str)))
        )
        X_trx = self.tfidf_trx_.transform(seq_trx.values)
        trx_cols = [f'tfidf_trx_{t}' for t in self.tfidf_trx_.get_feature_names_out()]
        trx_df = pd.DataFrame(X_trx.toarray(), columns=trx_cols)
        trx_df.insert(0, 'cl_id', seq_trx.index.values.astype(np.int32))

        gini_series = (
            d.groupby('cl_id')['amount'].apply(lambda s: gini(s.to_numpy())).astype(np.float32)
        ).rename('amount_gini')

        feats = (agg
                 .merge(daily, on='cl_id', how='left')
                 .merge(gaps, on='cl_id', how='left')
                 .merge(mcc_stats, on='cl_id', how='left')
                 .merge(trxcat_stats, on='cl_id', how='left')
                 .merge(chan_stats, on='cl_id', how='left')
                 .merge(mcc_cnt, on='cl_id', how='left')
                 .merge(mcc_sum, on='cl_id', how='left')
                 .merge(mcc_mean, on='cl_id', how='left')
                 .merge(trx_cnt, on='cl_id', how='left')
                 .merge(trx_sum, on='cl_id', how='left')
                 .merge(trx_mean, on='cl_id', how='left')
                 .merge(chan_cnt, on='cl_id', how='left')
                 .merge(chan_sum, on='cl_id', how='left')
                 .merge(chan_mean, on='cl_id', how='left')
                 .merge(mcc_df, on='cl_id', how='left')
                 .merge(trx_df, on='cl_id', how='left')
                 .merge(gini_series.reset_index(), on='cl_id', how='left'))

        feats = feats.replace([np.inf, -np.inf], 0.0).fillna(0.0)
        feats = as_float32(feats)
        feats.attrs['origin_datetime'] = pd.Timestamp(self.origin_)
        return feats

    def save(self, path: str):
        data = dict(
            origin_=self.origin_,
            channel_map_=self.channel_map_,
            trxcat_map_=self.trxcat_map_,
            top_mcc_=self.top_mcc_,
            top_trx_=self.top_trx_,
            top_chan_=self.top_chan_,
            tfidf_mcc_=self.tfidf_mcc_,
            tfidf_trx_=self.tfidf_trx_,
            params=dict(
                topk_mcc=self.topk_mcc,
                topk_trxcat=self.topk_trxcat,
                topk_channel=self.topk_channel,
                tfidf_max_features_mcc=self.tfidf_max_features_mcc,
                tfidf_max_features_trx=self.tfidf_max_features_trx,
                use_dense_tfidf=self.use_dense_tfidf,
                fixed_origin=self.fixed_origin,
            ),
        )
        with open(path, "wb") as f:
            pickle.dump(data, f)

    @classmethod
    def load(cls, path: str) -> "FeatureFactoryRosbank":
        with open(path, "rb") as f:
            data = pickle.load(f)
        obj = cls(**data["params"])
        obj.origin_ = data["origin_"]
        obj.channel_map_ = data["channel_map_"]
        obj.trxcat_map_ = data["trxcat_map_"]
        obj.top_mcc_ = data["top_mcc_"]
        obj.top_trx_ = data["top_trx_"]
        obj.top_chan_ = data["top_chan_"]
        obj.tfidf_mcc_ = data["tfidf_mcc_"]
        obj.tfidf_trx_ = data["tfidf_trx_"]
        return obj

In [None]:
# Full transaction log plus the list of client ids reserved for the hold-out split.
train = pd.read_csv('/home/jovyan/zoloev-madvillainy/LATTE/data/rosbank/train.csv')
test_ids = pd.read_csv('/home/jovyan/zoloev-madvillainy/LATTE/data/rosbank/test_ids.csv')

# Split by client, so that all rows of one client end up on the same side.
is_holdout = train["cl_id"].isin(set(test_ids["cl_id"]))
train_df = train[~is_holdout].reset_index(drop=True)
test_df = train[is_holdout].reset_index(drop=True)

# Fit on the training clients only, then featurise every split with that state.
target_cols = ['target_flag', 'target_sum']
ff = FeatureFactoryRosbank(topk_mcc=400, topk_trxcat=80)
ff.fit(train_df)
X_train = ff.transform(train_df.drop(columns=target_cols))
X_test = ff.transform(test_df.drop(columns=target_cols))

# Separate test.csv file (transformed as-is, without dropping any columns).
soreva_test = pd.read_csv('/home/jovyan/zoloev-madvillainy/LATTE/data/rosbank/test.csv')
X_soreva_test = ff.transform(soreva_test)

train_test = pd.concat((X_train, X_test, X_soreva_test))

In [None]:
import lightgbm as lgb

# One label per client: positive if any of the client's rows is flagged.
labels = (
    train_df.groupby("cl_id")["target_flag"]
    .max()
    .reset_index()
)

# Keep the labelled frame under its own name. Overwriting X_train with its own
# merge made this cell non-idempotent: a second run merged target_flag again
# (-> target_flag_x / target_flag_y) and then failed on X_train['target_flag'].
X_train_labeled = X_train.merge(labels, on='cl_id')
y_train = X_train_labeled['target_flag']
X_train_features = X_train_labeled.drop(columns=['target_flag', 'cl_id'])

# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0, and
# that defaults to 0 — so row subsampling is presumably inactive here. Left
# unchanged so the selected features stay the same; confirm what was intended.
model = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.02,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train_features, y_train)

# Rank features by the model's importances and keep the 128 strongest.
importances = pd.Series(model.feature_importances_, index=X_train_features.columns)
top_features = importances.sort_values(ascending=False).head(128)

In [6]:
# Keep only the selected feature columns plus the client id.
Final = train_test[[*top_features.index, 'cl_id']]

# Embeddings and descriptions: merge the selected features with the description embeddings

In [8]:
# Per-client table from the embeddings folder, joined to the selected features.
descriptions = pd.read_parquet('data/rosbank/embeddings/rosbank_base.parquet')
# Inner join: clients missing from either side are dropped.
statisctics_and_descriptions = pd.merge(Final, descriptions, on='cl_id')

In [9]:
import lightgbm as lgb

# One label per client: positive if any of the client's rows is flagged.
labels = train_df.groupby("cl_id")["target_flag"].max().reset_index()

# Only clients that have a label (i.e. training clients) survive the inner join.
X_train = statisctics_and_descriptions.merge(labels, on='cl_id')
y_train = X_train['target_flag']
train_matrix = X_train.drop(columns=['target_flag', 'cl_id'])

lgb_params = dict(
    n_estimators=500,
    learning_rate=0.02,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
model = lgb.LGBMClassifier(**lgb_params)
model.fit(train_matrix, y_train)

# Rank columns by the model's importances and keep the 512 strongest.
importances = pd.Series(model.feature_importances_, index=train_matrix.columns)
top_features = importances.sort_values(ascending=False).head(512)

In [None]:
# Keep only the selected columns plus the client id.
Final = statisctics_and_descriptions[[*top_features.index, 'cl_id']]

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

id_col = "cl_id"

# `labels` is built from train_df, so these are the TRAINING client ids.
# They used to be stored as `ids_test` and the masks below were inverted, which
# fitted the scaler on the held-out clients and transformed the training ones.
ids_train = set(labels[id_col].values)

is_train = Final[id_col].isin(ids_train)
X_train_df = Final[is_train].reset_index(drop=True)
X_test_df  = Final[~is_train].reset_index(drop=True)

feat_cols = [c for c in Final.columns if c != id_col]

X_train = X_train_df[feat_cols].astype(np.float32).values
X_test  = X_test_df[feat_cols].astype(np.float32).values

# Standardisation statistics come from the training clients only.
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(X_train)

Z_train = scaler.transform(X_train).astype(np.float32)
Z_test  = scaler.transform(X_test).astype(np.float32)

# One row per client; "embs" holds that client's scaled feature vector.
train_embs = pd.DataFrame({
    id_col: X_train_df[id_col].astype(np.int32).values,
    "embs": list(Z_train),
})
test_embs = pd.DataFrame({
    id_col: X_test_df[id_col].astype(np.int32).values,
    "embs": list(Z_test),
})

scaled_final = pd.concat((train_embs, test_embs))

# NOTE(review): every "embs" cell is a NumPy array, so to_csv stores its printed
# text form; readers of this file have to parse that text back into numbers.
scaled_final.to_csv(
    'data/rosbank/embeddings/scaled_embs_and_descr.csv',
    index=False
)

In [None]:
# Load a pickled table that carries a `cl_id` column (presumably the LATTE-S
# embeddings, judging by the path — TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
latte_s = pd.read_pickle('data/rosbank/latte-s/con_embs_1.pickle')

In [11]:
# Align the join key dtype on both sides before merging.
# .assign() returns new frames instead of writing into the existing ones: `Final`
# is a column subset of another frame, and assigning a column on it in place
# triggers pandas' SettingWithCopyWarning.
latte_s = latte_s.assign(cl_id=latte_s['cl_id'].astype(int))
Final = Final.assign(cl_id=Final['cl_id'].astype(int))

# Inner join: only clients present in both tables are kept.
final_latte_s = Final.merge(latte_s, on='cl_id')

In [12]:
import lightgbm as lgb

# One label per client: positive if any of the client's rows is flagged.
labels = train_df.groupby("cl_id")["target_flag"].max().reset_index()

# Only clients that have a label (i.e. training clients) survive the inner join.
X_train = final_latte_s.merge(labels, on='cl_id')
y_train = X_train['target_flag']
train_matrix = X_train.drop(columns=['target_flag', 'cl_id'])

lgb_params = dict(
    n_estimators=500,
    learning_rate=0.02,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
model = lgb.LGBMClassifier(**lgb_params)
model.fit(train_matrix, y_train)

# Rank columns by the model's importances and keep the 512 strongest.
importances = pd.Series(model.feature_importances_, index=train_matrix.columns)
top_features = importances.sort_values(ascending=False).head(512)

In [None]:
# Keep only the selected columns plus the client id.
Final = final_latte_s[[*top_features.index, 'cl_id']]

---

In [14]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

id_col = "cl_id"

# `labels` is built from train_df, so these are the TRAINING client ids.
# They used to be stored as `ids_test` and the masks below were inverted, which
# fitted the scaler on the held-out clients and transformed the training ones.
ids_train = set(labels[id_col].values)

is_train = Final[id_col].isin(ids_train)
X_train_df = Final[is_train].reset_index(drop=True)
X_test_df  = Final[~is_train].reset_index(drop=True)

feat_cols = [c for c in Final.columns if c != id_col]

X_train = X_train_df[feat_cols].astype(np.float32).values
X_test  = X_test_df[feat_cols].astype(np.float32).values

# Standardisation statistics come from the training clients only.
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(X_train)

Z_train = scaler.transform(X_train).astype(np.float32)
Z_test  = scaler.transform(X_test).astype(np.float32)

# One row per client; "embs" holds that client's scaled feature vector.
train_embs = pd.DataFrame({
    id_col: X_train_df[id_col].astype(np.int32).values,
    "embs": list(Z_train),
})
test_embs = pd.DataFrame({
    id_col: X_test_df[id_col].astype(np.int32).values,
    "embs": list(Z_test),
})

In [15]:
# Stack both embedding tables row-wise (original row labels are kept).
scaled_final = pd.concat([train_embs, test_embs])

In [16]:
# Write the table without the index column.
scaled_final.to_csv('data/rosbank/embeddings/scaled_final.csv', index=False)