# Simple example in one Notebook


## Imports

In [24]:
import pandas as pd
import os
import requests
import numpy as np
import yaml
import logging
from sklearn.metrics import roc_auc_score
from crosspredict.iterator import Iterator
from crosspredict.crossval import CrossLightgbmModel
from crosspredict.target_encoder import CrossTargetEncoder, TargetEncoder
from category_encoders import WOEEncoder
logging.basicConfig(level=logging.INFO)

## Load data

In [11]:
# Download the training data once and cache it on disk, so re-running the
# notebook does not hit the network again.
file_url = 'https://boosters.pro/api/ch/files/pub/onetwotrip_challenge_train.csv'
file_path = '../tests/onetwotrip_challenge_train.csv'
if not os.path.isfile(file_path):
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(file_url, timeout=60)
    # Fail loudly on an HTTP error instead of caching an error page as the CSV.
    response.raise_for_status()
    # The context manager guarantees the file is flushed and closed before
    # pandas reads it back below.
    with open(file_path, 'wb') as csv_file:
        csv_file.write(response.content)

df = pd.read_csv(file_path)

# Hold out 20% of the distinct `userid` values (not 20% of the rows) as the
# test set, so all rows of a given user land on the same side of the split.
unique_clients = pd.Series(df['userid'].unique())
test_users = unique_clients.sample(frac=0.2, random_state=0)
val_idx = df['userid'].isin(test_users)
test = df[val_idx].copy()
train = df[~val_idx].copy()

## Example: cross-validated target encoding and LightGBM model

In [16]:
# Cross-validation iterator: 3 repeats of 10 splits, with `userid` as the
# client column and by-client splitting switched on.
iter_df = Iterator(
    n_repeats=3,
    n_splits=10,
    random_state=0,
    col_client='userid',
    cv_byclient=True,
)

# Columns that are encoded against the `goal1` target with WOEEncoder.
encoded_cols = ['field3', 'field2', 'field11', 'field23', 'field18', 'field20']

# NOTE(review): n_splits is 5 here while `iter_df` was built with 10 — confirm
# which of the two settings CrossTargetEncoder actually uses.
cross_encoder = CrossTargetEncoder(
    iterator=iter_df,
    encoder_class=WOEEncoder,
    n_splits=5,
    n_repeats=3,
    random_state=0,
    col_client='userid',
    cv_byclient=True,
    col_encoded='goal1',
    cols=encoded_cols,
)
cross_encoder.fit(train)

In [21]:
# Start from every column of the raw frame and drop the target (`goal1`) and
# the two identifier columns, which must not be used as model features.
feature_name = df.columns.values
for excluded_col in ('goal1', 'orderid', 'userid'):
    feature_name = np.delete(feature_name, np.argwhere(feature_name == excluded_col))

# Add the columns produced by the fitted target encoder.
feature_name = np.append(feature_name, cross_encoder._targetencoded_cols)

# LightGBM parameters handed to the model in the next cell. Every seed is
# pinned to 0 so repeated runs use the same random state.
params = {
    'bagging_fraction': 0.849285747554019,
    'bagging_freq': 5,
    'bagging_seed': 0,
    'boosting_type': 'gbdt',
    'data_random_seed': 0,
    'drop_seed': 0,
    'feature_fraction': 0.8212766928844304,
    'feature_fraction_seed': 0,
    'lambda_l1': 0.8955546599539566,
    'lambda_l2': 1.4423261095989717,
    'learning_rate': 0.03,
    'max_bin': 255,
    'max_depth': 43,
    'metric': 'auc',
    'min_data_in_leaf': 149,
    'min_sum_hessian_in_leaf': 1.804477623298885,
    'num_leaves': 363,
    'objective': 'binary',
    'seed': 0,
    'verbose': -1,
}

In [22]:
# Columns passed to the model as categorical features.
categorical_cols = ['field3', 'field2', 'field11', 'field23', 'field18', 'field20']

# Cross-validated LightGBM model that reuses the iterator and the fitted
# target encoder from the earlier cells; `goal1` is the target column.
model_class = CrossLightgbmModel(
    iterator=iter_df,
    feature_name=feature_name,
    params=params,
    cols_cat=categorical_cols,
    num_boost_round=9999,
    early_stopping_rounds=50,
    valid=True,
    random_state=0,
    col_target='goal1',
    cross_target_encoder=cross_encoder,
)

# `result` holds whatever `fit` returns; it is printed in the next cell.
result = model_class.fit(train)

INFO:crosspredict.crossval:{'bagging_fraction': 0.849285747554019, 'bagging_freq': 5, 'bagging_seed': 0, 'boosting_type': 'gbdt', 'data_random_seed': 0, 'drop_seed': 0, 'feature_fraction': 0.8212766928844304, 'feature_fraction_seed': 0, 'lambda_l1': 0.8955546599539566, 'lambda_l2': 1.4423261095989717, 'learning_rate': 0.03, 'max_bin': 255, 'max_depth': 43, 'metric': 'auc', 'min_data_in_leaf': 149, 'min_sum_hessian_in_leaf': 1.804477623298885, 'num_leaves': 363, 'objective': 'binary', 'seed': 0, 'verbose': -1}
INFO:crosspredict.crossval:REPEAT FOLDS 0 START
INFO:crosspredict.crossval:	CROSSVALIDATION FOLD 0 ENDS with best ROCAUC = 0.6734054055913427
INFO:crosspredict.crossval:	CROSSVALIDATION FOLD 1 ENDS with best ROCAUC = 0.7088453015540618
INFO:crosspredict.crossval:	CROSSVALIDATION FOLD 2 ENDS with best ROCAUC = 0.7035198128758369
INFO:crosspredict.crossval:	CROSSVALIDATION FOLD 3 ENDS with best ROCAUC = 0.7284390475678703
INFO:crosspredict.crossval:	CROSSVALIDATION FOLD 4 ENDS with 

In [23]:
# Score the held-out users and print the ROC AUC next to the value returned
# by `fit` in the previous cell.
test_target = test['goal1']
test_predictions = model_class.predict(test)
test_auc = roc_auc_score(test_target, test_predictions)
print(test_auc, result)

0.713787816937655 {'loss': -0.6981930628773918, 'status': 'ok', 'std': 0.013964530057458985, 'score_max': 0.6981930628773918, 'scores_all': [0.6734054055913427, 0.7088453015540618, 0.7035198128758369, 0.7284390475678703, 0.6839663850240538, 0.6907931579757912, 0.7155053278923382, 0.7064908613389076, 0.6960678210678211, 0.7255113927997716, 0.6992041198501873, 0.6824499130088217, 0.7127641955039515, 0.7084232166823495, 0.6872578921634346, 0.7109219739038242, 0.7008187692795468, 0.6890644922045386, 0.7187227959518332, 0.6884426696731287, 0.6880461956437558, 0.6990867800660959, 0.7042258124750294, 0.7130793833061525, 0.6823202954509903, 0.6939032969980355, 0.6922452990801009, 0.7185549977852164, 0.7021047835248356, 0.7129128739533513], 'num_boost': 51}
