<a href="https://colab.research.google.com/github/yoyadima/DA/blob/main/Sber_gender_predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb
import re
import matplotlib.pyplot as plt
import seaborn as sns

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

from tqdm.notebook import tqdm_notebook
from warnings import filterwarnings

%matplotlib inline
filterwarnings('ignore')

In [5]:
tr_mcc_codes = pd.read_csv('/content/drive/MyDrive/Python_And_Data_Analysis/data/tr_mcc_codes.csv', sep=';', index_col='mcc_code')
tr_types = pd.read_csv('/content/drive/MyDrive/Python_And_Data_Analysis/data/tr_types.csv', sep=';', index_col='tr_type')

transactions = pd.read_csv('/content/drive/MyDrive/Python_And_Data_Analysis/data/transactions.csv', index_col='customer_id')
gender_train = pd.read_csv('/content/drive/MyDrive/Python_And_Data_Analysis/data/gender_train.csv', index_col='customer_id')
gender_test = pd.read_csv('/content/drive/MyDrive/Python_And_Data_Analysis/data/gender_test.csv', index_col='customer_id')
transactions_train = transactions.join(gender_train, how='inner')
transactions_test = transactions.join(gender_test, how='inner')

del transactions

In [6]:
params = {
    'max_depth': 4,
    'alpha': 0.6000000000000001, 'colsample_bytree': 0.66, 'eta': 0.025, 'gamma': 0.5, 'lambda': 0.2, 'min_child_weight': 6.0, 'subsample': 0.55,
    'eval_metric': 'auc',
    'objective': 'binary:logistic' ,
    'booster': 'gbtree',
    'njobs': -1,
    'tree_method': 'approx'
}

In [7]:
male_mcc = [1711, 1799, 2741, 3501, 4411, 5013, 5065, 5072, 5074, 5193, 5199,
       5451, 5511, 5532, 5533, 5542, 5571, 5733, 5734, 5816, 6010, 6051,
       6211, 7338, 7372, 7512, 7538, 7542, 7841, 7993, 7995, 8699, 9399]

In [8]:
tqdm_notebook.pandas(desc="Progress:")

for df in [transactions_train, transactions_test]:
    df['day'] = df['tr_datetime'].str.split().apply(lambda x: int(x[0]) % 7)
    df['hour'] = df['tr_datetime'].apply(lambda x: re.search(' \d*', x).group(0)).astype(int)
    df['night'] = ~df['hour'].between(6, 22).astype(int)
    df['mcc_code_tr_type'] = df['mcc_code'] + df['tr_type']
    df['amount_bucket'] = pd.qcut(df['amount'], q=5, labels=[0, 1, 2, 3, 4])

    df['tr_type+hour'] = df['hour'] + df['tr_type']
    df['mcc_code+hour'] = df['mcc_code'] + df['hour']
    df['spend/replenish'] = df['amount'].apply(lambda x: 0 if x < 0 else 1)
    df['nigth_repl'] = 0
    df.loc[(df['night'] == -1) & (df['spend/replenish'] == 1), 'nigth_repl'] = 1
    df['amount_bucket'] = 0
    df.loc[(df.amount > -41500301.141) & (df.amount <= -44918.32), 'amount_bucket'] = 1
    df.loc[(df.amount > -44918.32) & (df.amount <= -12744.67), 'amount_bucket'] = 2
    df.loc[(df.amount > -12744.67) & (df.amount <= -5794.46), 'amount_bucket'] = 3
    df.loc[(df.amount > -5794.46) & (df.amount <= -2245.92), 'amount_bucket'] = 4
    df.loc[(df.amount > -2245.92) & (df.amount <= -0.0001), 'amount_bucket'] = 5
    df['mcc_gender'] = 0
    df.loc[df.mcc_code.isin(male_mcc), 'mcc_gender'] = 1
    df['weekend'] = 0
    df.loc[(df.day == 6) | (df.day == 5),'weekend'] = 1

In [9]:
###TOP SUB
def features_creation_advanced(x): 
    features = []
    features.append(pd.Series(x['day'].value_counts(normalize=True).add_prefix('day_')))
    features.append(pd.Series(x['hour'].value_counts(normalize=True).add_prefix('hour_')))
    features.append(pd.Series(x['night'].value_counts(normalize=True).add_prefix('night_')))
    features.append(pd.Series(x[x['amount']>0]['amount'].agg(['min', 'max', 'mean', 'median', 'std', 'count'])\
                                                        .add_prefix('positive_transactions_')))
    features.append(pd.Series(x[x['amount']<0]['amount'].agg(['min', 'max', 'mean', 'median', 'std', 'count'])\
                                                        .add_prefix('negative_transactions_')))
    #new features
    
    features.append(pd.Series(x['mcc_code'].value_counts(normalize=True).add_prefix('mcc_code_')))

    for i in x['day'].unique():
      features.append(pd.Series(x[(x['amount']>0 & (x['day'] == i))]['amount'].agg(['sum'])\
                                                        .add_prefix('positive_transactions_day_'+str(i)+'_')))
      features.append(pd.Series(x[(x['amount']<0 & (x['day'] == i))]['amount'].agg(['sum'])\
                                                        .add_prefix('negative_transactions_day_'+str(i)+'_')))
    for i in x['night'].unique():
      features.append(pd.Series(x[(x['amount']>0 & (x['night'] == i))]['amount'].agg(['sum'])\
                                                        .add_prefix('positive_transactions_night_'+str(i)+'_')))
      features.append(pd.Series(x[(x['amount']<0 & (x['night'] == i))]['amount'].agg(['sum'])\
                                                        .add_prefix('negative_transactions_night_'+str(i)+'_')))
    '''for i in x['hour'].unique():
      features.append(pd.Series(x[(x['amount']>0 & (x['hour'] == i))]['amount'].agg(['sum'])\
                                                        .add_prefix('positive_transactions_hour_'+str(i)+'_')))
      features.append(pd.Series(x[(x['amount']<0 & (x['hour'] == i))]['amount'].agg(['sum'])\
                                                        .add_prefix('negative_transactions_hour_'+str(i)+'_')))

    features.append(pd.Series(x['tr_type'].value_counts().add_prefix('tr_')))
    #features.append(pd.Series(x['tr_type+hour'].value_counts().add_prefix('tr_type+hour_')))
    features.append(pd.Series(x['spend/replenish'].value_counts().add_prefix('spend/replenish_')))
    features.append(pd.Series(x['nigth_repl'].value_counts().add_prefix('nigth_repl_')))
    features.append(pd.Series(x['amount_bucket'].value_counts().add_prefix('amount_bucket_')))'''
    
    return pd.concat(features)

In [10]:
data_train = transactions_train.groupby(transactions_train.index)\
                               .progress_apply(features_creation_advanced).unstack(-1)

data_test = transactions_test.groupby(transactions_test.index)\
                             .progress_apply(features_creation_advanced).unstack(-1)


HBox(children=(FloatProgress(value=0.0, description='Progress:', max=8400.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Progress:', max=3600.0, style=ProgressStyle(description_w…




In [13]:
target = data_train.join(gender_train, how='inner')['gender']

In [14]:
data_train.fillna(0, inplace=True)
data_test.fillna(0, inplace=True)

for i in data_test.columns:
  data_test[i]=(data_test[i]-data_test[i].min())/(data_test[i].max()-data_test[i].min())

for i in data_train.columns:
  data_train[i]=(data_train[i]-data_train[i].min())/(data_train[i].max()-data_train[i].min())

In [18]:
def hyperopt_xgb_score(params):
  cv_res=xgb.cv(params, xgb.DMatrix(data_train, target),
                  early_stopping_rounds=10, maximize=True, 
                  num_boost_round=10000, nfold=5, stratified=True)
  index_argmax = cv_res['test-auc-mean'].argmax()
  
  return -cv_res.loc[index_argmax]['test-auc-mean']
 
params_xgb = {
            'eta': hp.quniform('eta', 0.001, 0.1, 0.025),
            'max_depth':  4,
            'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
            'colsample_bytree': hp.quniform('colsample_bytree', 0.01, 1, 0.01),

            'gamma': hp.quniform('gamma', 0.1, 2, 0.05),
            'lambda' : hp.quniform('lambda', 0.001, 2, 0.05),
            'alpha' : hp.quniform('alpha', 0.001, 2, 0.05),
            'min_child_weight': hp.quniform('min_child_weight', 1, 30, 1),

            'eval_metric': 'auc',
            'objective': 'binary:logistic' ,
            'booster': 'gbtree',
            'njobs': -1,
            'tree_method': 'approx'
            }
 
best = fmin(fn=hyperopt_xgb_score, space=params_xgb, algo=tpe.suggest, max_evals=150)
print('best:')
print(best)

100%|██████████| 150/150 [2:48:27<00:00, 67.38s/it, best loss: -0.8832628]
best:
{'alpha': 0.6000000000000001, 'colsample_bytree': 0.22, 'eta': 0.025, 'gamma': 0.6000000000000001, 'lambda': 0.5, 'min_child_weight': 5.0, 'subsample': 0.75}
