# [Boosters] Raiffeisen Data Cup. Baseline
Общий подход:
- Добавляем к каждой транзакции столбец: is_work (если транзакция находится в пределах 0.02 от дома клиента)
- Добавляем к каждой транзакции столбец: is_home (если транзакция находится в пределах 0.02 от работы клиента)
- Обучаем классификатор предсказывающий вероятность (is_home == 1) для транзакции
- Обучаем классификатор предсказывающий вероятность (is_work == 1) для транзакции

Точность определения местоположения:
- для классификатора is_home: ~3x%
- для классификатора is_work: ~2x%
- общая оценка на Public Leaderboard: ???

Примечание
* Требуется Python версии 3.5
* Требуется библиотека xgboost (для обучения использовалась xgboost версии 0.7.post3)
* Требуются файлы: test_set.csv, train_set.csv в одном каталоге с данным скриптом
* Требования к памяти: должно работать с 2Гб свободного RAM
* Время работы: ~3 минуты (тестировалось на процессоре Intel Core i7-4770)

In [5]:
%load_ext autoreload
%autoreload 2

import sys
MODULES_PATH = '../code/'
if MODULES_PATH not in sys.path:
    sys.path.append(MODULES_PATH)
import mfuncs
    
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
pd.options.display.max_columns = 1000

import lightgbm as lgb


from sklearn.neighbors import NearestNeighbors
%pylab inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Populating the interactive namespace from numpy and matplotlib


In [6]:
# Определим типы колонок для экономии памяти
dtypes = {
    'transaction_date': str,
    'atm_address': str,
    'country': str,
    'city': str,
    'amount': np.float32,
    'currency': np.float32,
    'mcc': str,
    'customer_id': str,
    'pos_address': str,
    'atm_address': str,
    'pos_adress_lat': np.float32,
    'pos_adress_lon': np.float32,
    'pos_address_lat': np.float32,
    'pos_address_lon': np.float32,
    'atm_address_lat': np.float32,
    'atm_address_lon': np.float32,
    'home_add_lat': np.float32,
    'home_add_lon': np.float32,
    'work_add_lat': np.float32,
    'work_add_lon': np.float32,
}

# для экономии памяти будем загружать только часть атрибутов транзакций
usecols_train = ['customer_id','transaction_date','amount','country', 'city', 'currency', 'mcc', 'pos_adress_lat', 'pos_adress_lon', 'atm_address_lat', 'atm_address_lon','home_add_lat','home_add_lon','work_add_lat','work_add_lon']
usecols_test = ['customer_id','transaction_date','amount','country', 'city', 'currency', 'mcc', 'pos_address_lat', 'pos_address_lon', 'atm_address_lat', 'atm_address_lon']

## Читаем train_set, test_set, соединяем в один датасет

In [11]:
dtypes = {
    'transaction_date': str,
    'atm_address': str,
    'country': str,
    'city': str,
    'amount': np.float32,
    'currency': np.float32,
    'mcc': str,
    'customer_id': str,
    'pos_address': str,
    'atm_address': str,
    'pos_adress_lat': np.float32,
    'pos_adress_lon': np.float32,
    'pos_address_lat': np.float32,
    'pos_address_lon': np.float32,
    'atm_address_lat': np.float32,
    'atm_address_lon': np.float32,
    'home_add_lat': np.float32,
    'home_add_lon': np.float32,
    'work_add_lat': np.float32,
    'work_add_lon': np.float32,
}

rnm = {
    'atm_address_lat': 'atm_lat',
    'atm_address_lon': 'atm_lon',
    'pos_adress_lat': 'pos_lat',
    'pos_adress_lon': 'pos_lon',
    'home_add_lat': 'home_lat',
    'home_add_lon': 'home_lon',
    'work_add_lat': 'work_lat',
    'work_add_lon': 'work_lon',
}

In [12]:
df_train = pd.read_csv('../data/train_set.csv', dtype=dtypes)
df_test = pd.read_csv('../data/test_set.csv', dtype=dtypes)

df_train.rename(columns=rnm, inplace=True)
df_test.rename(columns=rnm, inplace=True)

# соединяем test/train в одном DataFrame
df_train['is_train'] = np.int32(1)
df_test['is_train'] = np.int32(0)
df_all = pd.concat([df_train, df_test])

del df_train, df_test

### Обрабатываем дату транзакции и категориальные признаки

In [14]:
df_all['currency'] = df_all['currency'].fillna(-1).astype(np.int32)
df_all['mcc'] = df_all['mcc'].apply(lambda x: int(x.replace(',', ''))).astype(np.int32)
df_all['city'] = df_all['city'].factorize()[0].astype(np.int32)
df_all['country'] = df_all['country'].factorize()[0].astype(np.int32)

# удаляем транзакции без даты
df_all = df_all[~df_all['transaction_date'].isnull()]
df_all['transaction_date'] =  pd.to_datetime(df_all['transaction_date'], format='%Y-%m-%d')

### Фичи для даты

In [16]:
df_all['month'] = df_all.transaction_date.dt.month
df_all['day'] = df_all.transaction_date.dt.day
df_all['dayofyear'] = df_all.transaction_date.dt.dayofyear
df_all['dayofweek'] = df_all.transaction_date.dt.dayofweek

### Приводим адрес транзакции для pos и atm-транзакций к единообразному виду
Просто объединяем в одну колонку и добавляем фичу - это атм или пос

In [17]:
df_all['is_atm'] = (~df_all['atm_lat'].isnull()).astype(np.int8)
df_all['is_pos'] = (~df_all['pos_lat'].isnull()).astype(np.int8)

df_all['add_lat'] = df_all['atm_lat'].fillna(0) + df_all['pos_lat'].fillna(0)
df_all['add_lon'] = df_all['atm_lon'].fillna(0) + df_all['pos_lon'].fillna(0)

df_all.drop(['atm_lat','atm_lon','pos_lat','pos_lon'], axis=1, inplace=True)

df_all = df_all[~((df_all['add_lon'] == 0) & (df_all['add_lon'] == 0))]

### Генерируем признаки is_home, is_work
TODO: удалить чуваков у которых несколько домов

In [18]:
lat = df_all['home_lat'] - df_all['add_lat']
lon = df_all['home_lon'] - df_all['add_lon']

df_all['is_home'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int8)
df_all['has_home'] = (~df_all['home_lon'].isnull()).astype(np.int8)

lat = df_all['work_lat'] - df_all['add_lat']
lon = df_all['work_lon'] - df_all['add_lon']
df_all['is_work'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int8)
df_all['has_work'] = (~df_all['work_lon'].isnull()).astype(np.int8)

df_all.drop(['work_lat','work_lon','home_lat','home_lon'], axis=1, inplace=True)

### Генерируем категориальный признак для адреса

In [21]:
df_all['address'] = df_all['add_lat'].apply(lambda x: "%.02f" % x) + ';' + df_all['add_lon'].apply(lambda x: "%.02f" % x)
df_all['address'] = df_all['address'].factorize()[0].astype(np.int32)

### Генерируем несколько абонентских фич

In [23]:
# количество транзакций каждого клиента
df_all = df_all.merge(df_all.groupby('customer_id')['amount'].count().reset_index(name='cid_trans_count'), how='left')
df_all['cid_trans_count'] = df_all['cid_trans_count'].astype(np.int32)

df_all = df_all.merge(df_all.groupby(['customer_id','address'])['amount'].count().reset_index(name='cid_add_trans_count'), 
                      how='left')
df_all['cid_add_trans_count'] = df_all['cid_add_trans_count'].astype(np.int32)

# какая часть транзакций клиента приходится на данный адрес
# TODO: БОЛЬШЕ ТАКИХ ФИЧ
df_all['ratio1'] = df_all['cid_add_trans_count'] / df_all['cid_trans_count']

## Вспомогательные функции для оценки точности классификатора

# LightGBM

In [100]:
df_all.head()

Unnamed: 0,amount,atm_address,city,country,currency,customer_id,is_train,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,month,day,dayofyear,dayofweek,is_atm,is_pos,add_lat,add_lon,is_home,has_home,is_work,has_work,address,cid_trans_count,cid_add_trans_count,ratio1
0,2.884034,,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,,,,11606fde0c814ce78e0d726e39a0a5ee,2017-07-15,7,15,196,5,0,1,59.844074,30.179153,0,1,1,1,0,70,23,0.328571
1,2.775633,,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,,,,e9647a5e1eacfb06713b6af755ccc595,2017-10-27,10,27,300,4,0,1,59.844074,30.179153,0,1,1,1,0,70,23,0.328571
2,3.708368,,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5992,"PR.MARSHALA ZHUKOVA,31St Petersburg190000 7...",,,df06c1fcd3718a514535ae822785f716,2017-10-03,10,3,276,1,0,1,59.8582,30.229023,1,1,0,1,1,70,10,0.142857
3,2.787498,,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,,,,6c5e5793ebc984fb72875feffff62854,2017-09-09,9,9,252,5,0,1,59.844074,30.179153,0,1,1,1,0,70,23,0.328571
4,2.89251,,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,,,,0576445d74e374c92c0902e612fca356,2017-07-06,7,6,187,3,0,1,59.844074,30.179153,0,1,1,1,0,70,23,0.328571


In [132]:
from sklearn.model_selection import train_test_split

ys = ['is_home', 'is_work']
drop_cols = ['atm_address', 'customer_id', 'pos_address', 'terminal_id', 'transaction_date', 
             'is_home' ,'has_home', 'is_work', 'has_work', 'is_train']


drop_cols += ['pred:is_home', 'pred:is_work']


y_cols = ['is_home', 'is_work']
cust_train = df_all[df_all['is_train']==1].groupby('customer_id')[y_col.replace('is_','has_')].max()
cust_train = cust_train[cust_train > 0].index

cust_train, cust_valid = train_test_split(cust_train, test_size=0.2, shuffle=True, random_state=111)

# df_train = df_all[df_all.customer_id.isin(cust_train)]
# df_valid = df_all[df_all.customer_id.isin(cust_valid)]

df_train = pd.DataFrame(cust_train, columns = ['customer_id']).merge(df_all, how='left')
df_valid = pd.DataFrame(cust_valid, columns = ['customer_id']).merge(df_all, how='left')

usecols = df_train.drop(drop_cols, 1, errors='ignore').columns

In [134]:
params = {
    'objective': 'binary',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'metric' : 'binary_logloss',
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'num_threads': 12,
    'verbose': 0,
}

model = {}

In [149]:
y_col = 'is_home'

lgb_train = lgb.Dataset(df_train[usecols], df_train[y_col])
lgb_valid = lgb.Dataset(df_valid[usecols], df_valid[y_col])

gbm_h = lgb.train(params,
                lgb_train,
                valid_sets=[lgb_valid],
                num_boost_round=2000,
                verbose_eval=30,
                early_stopping_rounds=300)

model[y_col] = gbm_h

Training until validation scores don't improve for 300 rounds.
[30]	valid_0's binary_logloss: 0.485937
[60]	valid_0's binary_logloss: 0.459724
[90]	valid_0's binary_logloss: 0.4526
[120]	valid_0's binary_logloss: 0.451434
[150]	valid_0's binary_logloss: 0.452001
[180]	valid_0's binary_logloss: 0.45209
[210]	valid_0's binary_logloss: 0.451627
[240]	valid_0's binary_logloss: 0.452041
[270]	valid_0's binary_logloss: 0.451442
[300]	valid_0's binary_logloss: 0.452191
[330]	valid_0's binary_logloss: 0.451919
[360]	valid_0's binary_logloss: 0.452594
[390]	valid_0's binary_logloss: 0.452038
[420]	valid_0's binary_logloss: 0.452356
[450]	valid_0's binary_logloss: 0.452787
[480]	valid_0's binary_logloss: 0.452926
Early stopping, best iteration is:
[200]	valid_0's binary_logloss: 0.450945


In [150]:
y_col = 'is_work'

lgb_train = lgb.Dataset(df_train[usecols], df_train[y_col])
lgb_valid = lgb.Dataset(df_valid[usecols], df_valid[y_col])

gbm_w = lgb.train(params,
                lgb_train,
                valid_sets=[lgb_valid],
                num_boost_round=2000,
                verbose_eval=30,
                early_stopping_rounds=300)

model[y_col] = gbm_w

Training until validation scores don't improve for 300 rounds.
[30]	valid_0's binary_logloss: 0.330132
[60]	valid_0's binary_logloss: 0.285479
[90]	valid_0's binary_logloss: 0.277591
[120]	valid_0's binary_logloss: 0.275894
[150]	valid_0's binary_logloss: 0.276834
[180]	valid_0's binary_logloss: 0.277931
[210]	valid_0's binary_logloss: 0.278973
[240]	valid_0's binary_logloss: 0.280465
[270]	valid_0's binary_logloss: 0.281767
[300]	valid_0's binary_logloss: 0.283068
[330]	valid_0's binary_logloss: 0.283879
[360]	valid_0's binary_logloss: 0.286295
[390]	valid_0's binary_logloss: 0.287128
Early stopping, best iteration is:
[108]	valid_0's binary_logloss: 0.275737


In [167]:
def _best(x):
    ret = None
    for col in ys:
        pred = ('pred:%s' % col)
        if pred in x:
            i = (x[pred].idxmax())
            cols = [pred, 'add_lat', 'add_lon']
            if col in x:
                cols.append(col)
            tmp = x.loc[i,cols]
            tmp.rename({
                'add_lat':'%s:add_lat' % col,
                'add_lon':'%s:add_lon' % col,
            }, inplace = True)
            if ret is None:
                ret = tmp
            else:
                ret = pd.concat([ret, tmp])
    return ret

def predict_proba(dt, ys=['is_home', 'is_work']):
    for col in ys:
        pred = ('pred:%s' % col)
        dt[pred] = model[col].predict(dt[usecols])
    return dt.groupby('customer_id').apply(_best).reset_index()

def score(dt, ys=['is_home', 'is_work']):
    dt_ret = predict_proba(dt, ys)
    mean = 0.0
    for col in ys:
        col_mean = dt_ret[col].mean()
        mean += col_mean
    if len(ys) == 2:
        mean = mean / len(ys)
    return mean

In [163]:
print ("Train accuracy:", score(df_train, ys=['is_home']))
print ("Test accuracy:", score(df_valid, ys=['is_home']))

print ("Train accuracy:", score(df_train, ys=['is_work']))
print ("Test accuracy:", score(df_valid, ys=['is_work']))

Train accuracy: 0.440375
Test accuracy: 0.396
Train accuracy: 0.16875
Test accuracy: 0.1495


# Predict

In [168]:
cust_test = df_all[df_all['is_train'] == 0]['customer_id'].unique()
df_test = pd.DataFrame(cust_test, columns = ['customer_id']).merge(df_all, how = 'left')
df_test = predict_proba(df_test)
df_test.rename(columns = {
        'customer_id':'_ID_',
        'is_home:add_lat': '_HOME_LAT_',
        'is_home:add_lon': '_HOME_LON_',
        'is_work:add_lat': '_WORK_LAT_',
        'is_work:add_lon': '_WORK_LON_'}, inplace = True)
df_test = df_test[['_ID_', '_WORK_LAT_', '_WORK_LON_', '_HOME_LAT_', '_HOME_LON_']]

df_test.head()

Unnamed: 0,_ID_,_WORK_LAT_,_WORK_LON_,_HOME_LAT_,_HOME_LON_
0,00021683ccb416637fe9a4cd35e4606e,55.026001,82.915001,55.027,82.917
1,0002d0f8a642272b41c292c12ab6e602,44.034,42.835999,44.032001,42.837002
2,0004d182d9fede3ba2534b2d5e5ad27e,43.591,39.724998,43.584999,39.723
3,0008c2445518c9392cb356c5c3db3392,51.526001,46.019001,51.526001,46.019001
4,001611e3ac051a0ec91c88bbd9dbeb5a,56.998001,40.959,57.001999,40.963001


# Формируем submission-файл

In [173]:
# Заполняем пропуски
df_ = pd.read_csv('../data/test_set.csv', dtype=dtypes, usecols=['customer_id'])
submission = pd.DataFrame(df_['customer_id'].unique(), columns=['_ID_'])

submission = submission.merge(df_test, how='left').fillna(0)
# Пишем файл submission
submission.to_csv('../submissions/base_1.csv', index=None)