In [1]:
import pandas as pd
import numpy as np
import datetime

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
import sklearn

from sklearn.model_selection import train_test_split

In [2]:
# Определим типы колонок для экономии памяти
dtypes = {
    'transaction_date': str,
    'atm_address': str,
    'country': str,
    'city': str,
    'amount': np.float32,
    'currency': np.float32,
    'mcc': str,
    'customer_id': str,
    'pos_address': str,
    'atm_address': str,
    'pos_adress_lat': np.float32,
    'pos_adress_lon': np.float32,
    'pos_address_lat': np.float32,
    'pos_address_lon': np.float32,
    'atm_address_lat': np.float32,
    'atm_address_lon': np.float32,
    'home_add_lat': np.float32,
    'home_add_lon': np.float32,
    'work_add_lat': np.float32,
    'work_add_lon': np.float32,
}

# для экономии памяти будем загружать только часть атрибутов транзакций
usecols_train = ['customer_id','transaction_date','amount','country', 'city', 'currency', 'mcc', 'pos_adress_lat', 'pos_adress_lon', 'atm_address_lat', 'atm_address_lon','home_add_lat','home_add_lon','work_add_lat','work_add_lon']
usecols_test = ['customer_id','transaction_date','amount','country', 'city', 'currency', 'mcc', 'pos_address_lat', 'pos_address_lon', 'atm_address_lat', 'atm_address_lon']

## Читаем train_set, test_set, соединяем в один датасет

In [3]:
train = pd.read_csv('../0_data/train_set.csv', dtype = dtypes, usecols = usecols_train)
train.rename(columns = {'pos_adress_lat': 'pos_address_lat', 'pos_adress_lon': 'pos_address_lon'}, inplace = True)

test = pd.read_csv('../0_data/test_set.csv', dtype = dtypes, usecols = usecols_test)
submission = pd.DataFrame(test['customer_id'].unique(), columns = ['_ID_'])

# соединяем test/train в одном DataFrame
train['is_train'] = np.int32(1)
test['is_train'] = np.int32(0)
dt = pd.concat([train, test])

del train, test

In [4]:
dt.head(2)

Unnamed: 0,amount,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,is_train,mcc,pos_address_lat,pos_address_lon,transaction_date,work_add_lat,work_add_lon
0,2.884034,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,1,5261,59.844074,30.179153,2017-07-15,59.847,30.177
1,2.775633,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,1,5261,59.844074,30.179153,2017-10-27,59.847,30.177


### Обрабатываем дату транзакции и категориальные признаки

In [5]:
dt['currency'] = dt['currency'].fillna(-1).astype(np.int32)
dt['mcc'] = dt['mcc'].apply(lambda x: int(x.replace(',', ''))).astype(np.int32)
dt['city'] = dt['city'].factorize()[0].astype(np.int32)
dt['country'] = dt['country'].factorize()[0].astype(np.int32)

# удаляем транзакции без даты
dt.drop(dt[dt['transaction_date'].isnull()].index, axis = 0, inplace = True)
dt['transaction_date'] = dt['transaction_date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))

### Фичи для даты

In [6]:
dt['weekday'] = dt['transaction_date'].dt.weekday.astype(np.int32)

In [7]:
dt['month'] = dt['transaction_date'].dt.month.astype(np.int32)

In [8]:
dates = pd.read_csv('../0_data/calendar.csv', skiprows=1, index_col=0,
                    usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],  
                    names=['year', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'])

In [9]:
# предпраздничные дни
pre_holidays = set()
# праздничне дни
holidays = set()

for year in dates.index:
    for month in dates.columns:
        for day in dates.loc[year, month].split(','):
            if day.endswith('*'):
                pre_holidays.add(datetime.datetime(year, int(month), int(day[:len(day)-1]), 0, 0))
            else:
                holidays.add(datetime.datetime(year, int(month), int(day), 0, 0))

In [10]:
def determ(date, hol, pre_hol):
    if date in hol:
        return 'holiday'
    elif date in pre_hol:
        return 'pre-holiday'
    else: return 'workday'

In [11]:
dt['day_status'] = dt['transaction_date'].apply(lambda x: determ(x, holidays, pre_holidays))

In [12]:
dt = pd.concat([dt, pd.get_dummies(dt['day_status'], prefix="day_status")], axis=1)
dt.drop(['day_status'], axis=1, inplace=True)

### Приводим адрес транзакции для pos и atm-транзакций к единообразному виду

In [13]:
dt['is_atm'] = (~dt['atm_address_lat'].isnull()).astype(np.int32)
dt['is_pos'] = (~dt['pos_address_lat'].isnull()).astype(np.int32)

dt['address_lat'] = dt['atm_address_lat'].fillna(0) + dt['pos_address_lat'].fillna(0)
dt['address_lon'] = dt['atm_address_lon'].fillna(0) + dt['pos_address_lon'].fillna(0)

dt.drop(['atm_address_lat','atm_address_lon','pos_address_lat','pos_address_lon'], axis = 1, inplace = True)

# удалим транзакции без адреса
dt.drop(dt[((dt['address_lat'] == 0) & (dt['address_lon'] == 0))].index, axis = 0, inplace = True)

### Генерируем признаки is_home, is_work

In [14]:
lat = dt['home_add_lat'] - dt['address_lat']
lon = dt['home_add_lon'] - dt['address_lon']
dt['is_home'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int32)
dt['has_home'] = (~dt['home_add_lon'].isnull()).astype(np.int32)

lat = dt['work_add_lat'] - dt['address_lat']
lon = dt['work_add_lon'] - dt['address_lon']
dt['is_work'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int32)
dt['has_work'] = (~dt['work_add_lon'].isnull()).astype(np.int32)

dt.drop(['work_add_lat','work_add_lon','home_add_lat','home_add_lon'], axis = 1, inplace = True)

### Генерируем категориальный признак для адреса

In [15]:
dt.head(2)

Unnamed: 0,amount,city,country,currency,customer_id,is_train,mcc,transaction_date,weekday,month,...,day_status_pre-holiday,day_status_workday,is_atm,is_pos,address_lat,address_lon,is_home,has_home,is_work,has_work
0,2.884034,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-15,5,7,...,0,0,0,1,59.844074,30.179153,0,1,1,1
1,2.775633,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-10-27,4,10,...,0,1,0,1,59.844074,30.179153,0,1,1,1


In [16]:
dt['address'] = dt['address_lat'].apply(lambda x: "%.02f" % x) + ';' + dt['address_lon'].apply(lambda x: "%.02f" % x)
dt['address'] = dt['address'].factorize()[0].astype(np.int32)

### Генерируем несколько абонентских фич

In [17]:
# количество транзакций каждого клиента
dt = dt.merge(dt.groupby('customer_id')['amount'].count().reset_index(name = 'tx'), how = 'left')
dt['tx'] = dt['tx'].astype(np.int32)

dt = dt.merge(dt.groupby(['customer_id','address'])['amount'].count().reset_index(name = 'tx_cust_addr'), how = 'left')
dt['tx_cust_addr'] = dt['tx_cust_addr'].astype(np.int32)

# какая часть транзакций клиента приходится на данный адрес
dt['ratio1'] = dt['tx_cust_addr'] / dt['tx']

In [18]:
dt.head(2)

Unnamed: 0,amount,city,country,currency,customer_id,is_train,mcc,transaction_date,weekday,month,...,address_lat,address_lon,is_home,has_home,is_work,has_work,address,tx,tx_cust_addr,ratio1
0,2.884034,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-15,5,7,...,59.844074,30.179153,0,1,1,1,0,39,13,0.333333
1,2.775633,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-10-27,4,10,...,59.844074,30.179153,0,1,1,1,0,39,13,0.333333


In [19]:
d = dict()
for i in [5411, 5422, 5441, 5462, 5499, 5451, 5331, 5399, 5310, 5311]: d[i] = 'food'
    
for i in [5533, 5532, 5511, 5531, 5599, 7538, 7542,\
          7531, 7534, 5013, 5521, 5532, 5571,\
          7535, 7549, 7512]: d[i] = 'auto'

for i in [5542, 5541, 5983, 7523]: d[i] = 'parking_fuel'
for i in [4814, 4816, 4899, 4821, 4813]: d[i] = 'telephone'
for i in [7832, 7932, 7933, 7991, 7993, 7996,\
          7999, 7911, 7997, 7941, 7922, 7929,\
          7992,  7994,  7995,  7998]: d[i] = 'leisure_active'

for i in [2741, 5192, 5733, 5735, 5942, 5970,\
          5971, 5994, 7333, 7829, 7841]: d[i] = 'leisure_passive'

for i in [5811, 5812, 5813, 5814]: d[i] = 'restaurants'
for i in [5137, 5139, 5611, 5621, 5631, 5641,\
          5651, 5655, 5661, 5681, 5691, 5697,\
          5699, 5941, 5948, 5945]: d[i] = 'clothes'
for i in [4812, 5997, 5045, 5065, 5072, 5099,\
          5251, 5722, 5732, 5734, 5946]: d[i] = 'technics'
d[5912] = 'drugstore'
d[8099] = 'other_health'

for i in [7230, 7298, 8021, 8042, 8043, 8062,\
          4119, 5047, 5122, 5698, 5975, 5976,\
          5977, 7297, 8011, 8031, 8049, 8050, 8071]: d[i] = 'health'
for i in [3011,3026,3047,3351,3501,3503,3504,3509,3512,\
          3530,3533,3543,3553,3579,3586,3604,3616,3634,\
          3640,3642,3649,3665,3690,3692,3710,\
          3750,3778, 7011]: d[i] = 'travel'
for i in [9211,  9222,  9311,  9399,  9402]: d[i] = 'gos_uslugi'
for i in [8111,  8211,  8220,  8244,  8249,  8299,  8351,\
          8398,  8661,  8675,  8699, 8911,\
          8999, 7311,  7338,  7349,  7379,\
          7392,  7393,  7394,  7395,\
          7399]: d[i] = 'bussiness_uslugi'
for i in [7622,  7623,  7629,  7692,  7699,\
         7012,  7032,  7210,  7211,  7216,\
          7221,  7251,  7261,  7278,  7296,\
          7299, 742,  763,  780,  1520,  1711,  1731,  1740,\
          1750,  1761,  1799,  2842,  4111,  4112,  4121,  4131,\
          4214,  4215,  4225,  4411,  4457,  4511,  4582,  4722,\
          4784,  4789,  4900]: d[i] = 'other'
for i in [6010, 6011, 6300, 6513] : d[i] = 'Manual Cash'
for i in [5816,  5921,  5931,  5932,  5933,  5937,\
          5940,  5943,  5944,  5947,  5949,  5950,\
          5960,  5963,  5965,  5969,  5972,  5973,\
          5978,  5992,  5993,  5995,  5996,  5998,\
          5999, 5712, 5713, 5714, 5718, 5719, 5551,\
          5561, 5598, 5200,  5211,  5231,  5261,\
          5271,  5300,  5309, 5111,  5131,  5169,\
          5172,  5193,  5198,  5199, 5021,  5039,\
          5044,  5046,  5051,  5074,  5085,  5094]: d[i] = 'sales'

In [20]:
dt['mcc_category'] = dt['mcc'].apply(lambda x: d[x] if x in d else 0)

In [21]:
dt = pd.concat([dt, pd.get_dummies(dt['mcc_category'], prefix="mcc")], axis=1)
dt.drop(['mcc_category'], axis=1, inplace=True)

In [22]:
dt.drop(['mcc'], axis=1, inplace=True)

In [23]:
dt.head(2)

Unnamed: 0,amount,city,country,currency,customer_id,is_train,transaction_date,weekday,month,day_status_holiday,...,mcc_leisure_active,mcc_leisure_passive,mcc_other,mcc_other_health,mcc_parking_fuel,mcc_restaurants,mcc_sales,mcc_technics,mcc_telephone,mcc_travel
0,2.884034,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,2017-07-15,5,7,1,...,0,0,0,0,0,0,1,0,0,0
1,2.775633,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,2017-10-27,4,10,0,...,0,0,0,0,0,0,1,0,0,0


## Вспомогательные функции для оценки точности классификатора

In [24]:
def _best(x):
    ret = None
    for col in ys:
        pred = ('pred:%s' % col)
        if pred in x:
            i = (x[pred].idxmax())
            cols = [pred,'address_lat','address_lon']
            if col in x:
                cols.append(col)
            tmp = x.loc[i,cols]
            tmp.rename({
                'address_lat':'%s:add_lat' % col,
                'address_lon':'%s:add_lon' % col,
            }, inplace = True)
            if ret is None:
                ret = tmp
            else:
                ret = pd.concat([ret, tmp])
    return ret

In [25]:
def predict_proba(dt, ys = ['is_home', 'is_work']):
    for col in ys:
        pred = ('pred:%s' % col)
        dt[pred] = model[col].predict_proba(dt[xs])[:,1]
    return dt.groupby('customer_id').apply(_best).reset_index()

In [26]:
def score(dt, ys = ['is_home', 'is_work']):
    dt_ret = predict_proba(dt, ys)
    mean = 0.0
    for col in ys:
        col_mean = dt_ret[col].mean()
        mean += col_mean
    if len(ys) == 2:
        mean = mean / len(ys)
    return mean

### Признаки, на которых будем обучать модель

In [27]:
dt.head(2)

Unnamed: 0,amount,city,country,currency,customer_id,is_train,transaction_date,weekday,month,day_status_holiday,...,mcc_leisure_active,mcc_leisure_passive,mcc_other,mcc_other_health,mcc_parking_fuel,mcc_restaurants,mcc_sales,mcc_technics,mcc_telephone,mcc_travel
0,2.884034,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,2017-07-15,5,7,1,...,0,0,0,0,0,0,1,0,0,0
1,2.775633,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,2017-10-27,4,10,0,...,0,0,0,0,0,0,1,0,0,0


In [28]:
xs = ['amount','currency','city','country','is_atm','is_pos','ratio1', \
      'month', 'tx', 'tx_cust_addr', 'has_home', 'has_work']\
+ [x for x in dt.columns if 'mcc' in x] + [x for x in dt.columns if 'day' in x]
ys = ['is_home', 'is_work']

# Создаем классификаторы
**Hint**: можно поигратьcя с гиперпараметрами для лучшего результата :)

In [29]:
model0 = {
    'is_home': xgb.XGBClassifier(n_estimators = 200, n_jobs = -1),
    'is_work': xgb.XGBClassifier(n_estimators = 200, n_jobs = -1)
}

# Обучаем классификаторы

In [30]:
model = {}

# последовательно обучаем два классификатора
for col in ['is_home', 'is_work']:
    
    #выберем для обучение транзакции только тех клиентов из train, у которых хоть в одной транзакции указано место работы/жительства
    cust_train = dt[dt['is_train'] == 1].groupby('customer_id')[col.replace('is_','has_')].max()
    cust_train = cust_train[cust_train > 0].index
    
    #разобъем train на train/valid для валидации
    cust_train, cust_valid = train_test_split(cust_train, test_size = 0.1, shuffle = True, random_state = 2)
    
    train = pd.DataFrame(cust_train, columns = ['customer_id']).merge(dt, how = 'left')
    valid = pd.DataFrame(cust_valid, columns = ['customer_id']).merge(dt, how = 'left')

    print ("Training:", col)
    clf = sklearn.base.clone(model0[col])
    clf.fit(train[xs], train[col], eval_metric = 'logloss', eval_set = [(train[xs], train[col]), (valid[xs], valid[col])], verbose=10)
    
    model[col] = clf
    print ("Train accuracy:", score(train, ys = [col]))
    print ("Test accuracy:", score(valid, ys = [col]))
    print ()


Training: is_home
[0]	validation_0-logloss:0.659603	validation_1-logloss:0.657997
[10]	validation_0-logloss:0.520081	validation_1-logloss:0.511797
[20]	validation_0-logloss:0.491167	validation_1-logloss:0.481404
[30]	validation_0-logloss:0.482091	validation_1-logloss:0.472707
[40]	validation_0-logloss:0.477887	validation_1-logloss:0.469179
[50]	validation_0-logloss:0.475095	validation_1-logloss:0.466964
[60]	validation_0-logloss:0.47229	validation_1-logloss:0.464794
[70]	validation_0-logloss:0.470541	validation_1-logloss:0.463494
[80]	validation_0-logloss:0.468537	validation_1-logloss:0.462229
[90]	validation_0-logloss:0.466777	validation_1-logloss:0.461416
[100]	validation_0-logloss:0.465373	validation_1-logloss:0.461113
[110]	validation_0-logloss:0.464231	validation_1-logloss:0.460327
[120]	validation_0-logloss:0.462787	validation_1-logloss:0.459672
[130]	validation_0-logloss:0.461901	validation_1-logloss:0.459252
[140]	validation_0-logloss:0.46063	validation_1-logloss:0.458543
[150]

# Predict

In [31]:
cust_test = dt[dt['is_train'] == 0]['customer_id'].unique()
test = pd.DataFrame(cust_test, columns = ['customer_id']).merge(dt, how = 'left')
test = predict_proba(test)
test.rename(columns = {
        'customer_id':'_ID_',
        'is_home:add_lat': '_HOME_LAT_',
        'is_home:add_lon': '_HOME_LON_',
        'is_work:add_lat': '_WORK_LAT_',
        'is_work:add_lon': '_WORK_LON_'}, inplace = True)
test = test[['_ID_', '_WORK_LAT_', '_WORK_LON_', '_HOME_LAT_', '_HOME_LON_']]

# Формируем submission-файл

In [32]:
# Заполняем пропуски
submission = submission.merge(test, how = 'left').fillna(0)

# Пишем файл submission
submission.to_csv('baseline-very-simple-ver_4.csv', index = False)