In [4]:
import pandas as pd
import numpy as np
import datetime
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import pycountry

import xgboost as xgb
import sklearn

from sklearn.model_selection import train_test_split

import urllib.request as urllib2
import json

from tqdm import tqdm, tqdm_pandas

import reverse_geocoder as rg

%matplotlib inline

tqdm.pandas()

In [5]:
def save_state(df, name):
    df.to_csv(name, index = False)

In [6]:
# Определим типы колонок для экономии памяти
dtypes = {
    'transaction_date': str,
    'atm_address': str,
    'country': str,
    'city': str,
    'amount': np.float32,
    'currency': np.float32,
    'mcc': str,
    'customer_id': str,
    'pos_address': str,
    'atm_address': str,
    'pos_adress_lat': np.float32,
    'pos_adress_lon': np.float32,
    'pos_address_lat': np.float32,
    'pos_address_lon': np.float32,
    'atm_address_lat': np.float32,
    'atm_address_lon': np.float32,
    'home_add_lat': np.float32,
    'home_add_lon': np.float32,
    'work_add_lat': np.float32,
    'work_add_lon': np.float32,
}

# для экономии памяти будем загружать только часть атрибутов транзакций
usecols_train = ['customer_id','transaction_date','amount','country', 'city', 'currency', 'mcc', 'pos_adress_lat', 'pos_adress_lon', 'atm_address_lat', 'atm_address_lon','home_add_lat','home_add_lon','work_add_lat','work_add_lon']
usecols_test = ['customer_id','transaction_date','amount','country', 'city', 'currency', 'mcc', 'pos_address_lat', 'pos_address_lon', 'atm_address_lat', 'atm_address_lon']

## Читаем train_set, test_set, соединяем в один датасет

In [7]:
train = pd.read_csv('train_set.csv', dtype = dtypes, usecols = usecols_train)
train.rename(columns = {'pos_adress_lat': 'pos_address_lat', 'pos_adress_lon': 'pos_address_lon'}, inplace = True)

test = pd.read_csv('test_set.csv', dtype = dtypes, usecols = usecols_test)

# соединяем test/train в одном DataFrame
train['is_train'] = np.int32(1)
test['is_train'] = np.int32(0)
dt = pd.concat([train, test])

del train, test

### Приводим адрес транзакции для pos и atm-транзакций к единообразному виду

In [8]:
dt['is_atm'] = (~dt['atm_address_lat'].isnull()).astype(np.int32)
dt['is_pos'] = (~dt['pos_address_lat'].isnull()).astype(np.int32)

dt['address_lat'] = dt['atm_address_lat'].fillna(0) + dt['pos_address_lat'].fillna(0)
dt['address_lon'] = dt['atm_address_lon'].fillna(0) + dt['pos_address_lon'].fillna(0)

dt.drop(['atm_address_lat','atm_address_lon','pos_address_lat','pos_address_lon'], axis = 1, inplace = True)

# удалим транзакции без адреса
dt.drop(dt[((dt['address_lat'] == 0) & (dt['address_lon'] == 0))].index, axis = 0, inplace = True)

### Генерируем признаки is_home, is_work

In [9]:
lat = dt['home_add_lat'] - dt['address_lat']
lon = dt['home_add_lon'] - dt['address_lon']
dt['is_home'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int32)
dt['has_home'] = (~dt['home_add_lon'].isnull()).astype(np.int32)

lat = dt['work_add_lat'] - dt['address_lat']
lon = dt['work_add_lon'] - dt['address_lon']
dt['is_work'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int32)
dt['has_work'] = (~dt['work_add_lon'].isnull()).astype(np.int32)

In [10]:
dt.drop(['work_add_lat','work_add_lon','home_add_lat','home_add_lon'], axis = 1, inplace = True)

## Создаем словарь для приведения стран к одному формату (2 символа)

In [11]:
dict_countries = {x.alpha_3: x.alpha_2 for x in pycountry.countries}
dict_countries['ROM'] = 'RO'

In [12]:
dt = dt.replace({'country': dict_countries})
dt['country'] = dt['country'].progress_apply(lambda x: x.strip())

100%|████████████████████████████████████████████████████████████████████| 2172472/2172472 [00:02<00:00, 951331.38it/s]


In [13]:
dt['country'].value_counts()

RU    2172472
Name: country, dtype: int64

In [14]:
dt.drop(['country'], axis=1, inplace=True)

### Работаем над городами

In [15]:
fix_city = {'MOSKVA' : 'MOSCOW',
'MOSKOW' : 'MOSCOW',
'G MOSKVA' : 'MOSCOW',
'MOSKVA G' : 'MOSCOW',
'SANKT-PETERBU' : 'ST. PETERSBURG',
'SANKT-PETEBUR' : 'ST. PETERSBURG',
'ST-PETERSBURG' : 'ST. PETERSBURG',
'ST PETERSBURG' : 'ST. PETERSBURG',
'ST-PETERBURG' : 'ST. PETERSBURG',
'ST PETERBURG' : 'ST. PETERSBURG',
'St Petersburg' : 'ST. PETERSBURG',
'SAINT PETERSB' : 'ST. PETERSBURG',
'ST.PETERSBURG' : 'ST. PETERSBURG',
'SANKT-PETERSB' : 'ST. PETERSBURG',
'SAINT-PETERSB' : 'ST. PETERSBURG',
'ST.-PETERSBUR' : 'ST. PETERSBURG',
'SANKT-PETERS' : 'ST. PETERSBURG',
'S-PETERSBURG' : 'ST. PETERSBURG',
'ST-PETERB.' : 'ST. PETERSBURG',
'SPB' : 'ST. PETERSBURG',
'SPETERSBURG' : 'ST. PETERSBURG',
'SANKT PETERBU' : 'ST. PETERSBURG',
'S-PETERBURG' : 'ST. PETERSBURG',
'S.PETERBURG' : 'ST. PETERSBURG',
'SAINT-PETERBU' : 'ST. PETERSBURG',
'NVSIBR' : 'NOVOSIBIRSK',
'N NOVGOROD' : 'N.NOVGOROD',
'NIZHNY NOVGOR' : 'N.NOVGOROD',
'NIZJNIY NOVGO' : 'N.NOVGOROD',
'G. NIGNIY NOV' : 'N.NOVGOROD',
'NIZHNIY NO' : 'N.NOVGOROD',
'NIZHNIY NOVGO' : 'N.NOVGOROD',
'NIZH NOVGOROD' : 'N.NOVGOROD',
'CHEREPOVEC' : 'CHEREPOVETS',
'ROSTOV NA DON' : 'ROSTOV-NA-DON',
'VORONEJ' : 'VORONEZH',
'MO' : 'MOSCOW REGION',
'MOSKOVSKIY' : 'MOSCOW REGION',
'MOSKOVSKAYA O' : 'MOSCOW REGION',
'MO BALASHIHA' : "BALASHIHA",
'MO,BALASHIKHA' : "BALASHIHA",
'MO,KHIMKI' : "KHIMKI",
'NIZHNIY-NOVGO' : 'N.NOVGOROD',
'ETATERINBURG' : 'EKATERINBURG',
'EKATERINB' : 'EKATERINBURG',
'EKATERINBURG,' : 'EKATERINBURG',
'YEKATR' : 'EKATERINBURG',
'KRASNO-SK' : 'KRASNOYARSK',
'ODINCOVO' : 'ODINTSOVO',
'MO, SERPUKHOV' : 'SERPUKHOV',
'MO, LYUBERETS' : 'LYUBERETSY',
'LYBERTSY' : 'LYUBERETSY',
'MO LYUBERCI' : 'LYUBERETSY',
'MOSKV*' : 'MOSCOW',
'*MOSCOW' : 'MOSCOW',
'G TYUMEN, NP' : 'TYUMEN',
'MOSKVA,G. MOS' : 'MOSCOW',
'MOS. OBL' : 'MOSCOW REGION',
'MOS. OBL.' : 'MOSCOW REGION',
'G SARATOV' : 'SARATOV',
'IRKUTS' : 'IRKUTSK',
'SERPUHOV' : 'SERPUKHOV',
'R-N-DONU' : 'ROSTOV-NA-DON',
'LIPECK' : 'LIPETSK',
'DOMODEDOVO, S' : 'DOMODEDOVO',
'ROS-NA-DONU' : 'ROSTOV-NA-DON',
'ROSTOV' : 'ROSTOV-NA-DON',
'MYTISHI' : 'MYTISHCHI',
'MYTISCHI' : 'MYTISHCHI',
'ST. PETERSBUR' : 'ST. PETERSBURG', 
'NOVOROSSIISK' : 'NOVOROSSIJSK',
'NOVOROSSIIYSK' : 'NOVOROSSIJSK',
'BRJANSK' : 'BRYANSK',
'NOVOMICHURINS' : 'NOVOMICHURINSK',
'MOSKVA,G. ZEL' : 'MOSCOW'}

In [16]:
dt['city'] = dt['city'].apply(str)
dt['city'] = dt['city'].apply(lambda x: x.upper())
dt['city'] = dt['city'].apply(lambda x: x.strip())

In [17]:
dt = dt.replace({'city': fix_city})

#### Backup block beginning

In [18]:
save_state(dt, 'state1.csv')

In [49]:
dt = pd.read_csv('state1.csv')

  interactivity=interactivity, compiler=compiler, result=result)


#### Backup block ending

In [19]:
tuples = [tuple(x) for x in dt[['address_lat', 'address_lon']].values]

In [20]:
results = rg.search(tuples)

Loading formatted geocoded file...


In [21]:
names = []
admin1s = []
for ordered_dict in results:
    names.append(ordered_dict['name'])
    admin1s.append(ordered_dict['admin1'])

In [22]:
dt['names'] = pd.Series(names).values
dt['admin1s'] = pd.Series(admin1s).values

In [23]:
dt.head()

Unnamed: 0,amount,city,currency,customer_id,is_train,mcc,transaction_date,is_atm,is_pos,address_lat,address_lon,is_home,has_home,is_work,has_work,names,admin1s
0,2.884034,ST. PETERSBURG,643.0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-15,0,1,59.844074,30.179153,0,1,1,1,Uritsk,St.-Petersburg
1,2.775633,ST. PETERSBURG,643.0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-10-27,0,1,59.844074,30.179153,0,1,1,1,Uritsk,St.-Petersburg
2,3.708368,ST. PETERSBURG,643.0,0dc0137d280a2a82d2dc89282450ff1b,1,5992,2017-10-03,0,1,59.8582,30.229023,1,1,0,1,Dachnoye,St.-Petersburg
3,2.787498,ST. PETERSBURG,643.0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-09-09,0,1,59.844074,30.179153,0,1,1,1,Uritsk,St.-Petersburg
4,2.89251,ST. PETERSBURG,643.0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-06,0,1,59.844074,30.179153,0,1,1,1,Uritsk,St.-Petersburg


In [24]:
save_state(dt, 'state2.csv')

In [25]:
cities = dt[['city', 'names', 'admin1s']]

In [26]:
cities.head(100)

Unnamed: 0,city,names,admin1s
0,ST. PETERSBURG,Uritsk,St.-Petersburg
1,ST. PETERSBURG,Uritsk,St.-Petersburg
2,ST. PETERSBURG,Dachnoye,St.-Petersburg
3,ST. PETERSBURG,Uritsk,St.-Petersburg
4,ST. PETERSBURG,Uritsk,St.-Petersburg
5,ST. PETERSBURG,Uritsk,St.-Petersburg
6,ST. PETERSBURG,Uritsk,St.-Petersburg
7,ST. PETERSBURG,Uritsk,St.-Petersburg
8,ST. PETERSBURG,Uritsk,St.-Petersburg
9,ST. PETERSBURG,Uritsk,St.-Petersburg


In [27]:
dt.drop(['city','names'], axis = 1, inplace = True)

In [28]:
dt.rename(columns={'admin1s': 'city'}, inplace=True)

In [29]:
dt.head()

Unnamed: 0,amount,currency,customer_id,is_train,mcc,transaction_date,is_atm,is_pos,address_lat,address_lon,is_home,has_home,is_work,has_work,city
0,2.884034,643.0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-15,0,1,59.844074,30.179153,0,1,1,1,St.-Petersburg
1,2.775633,643.0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-10-27,0,1,59.844074,30.179153,0,1,1,1,St.-Petersburg
2,3.708368,643.0,0dc0137d280a2a82d2dc89282450ff1b,1,5992,2017-10-03,0,1,59.8582,30.229023,1,1,0,1,St.-Petersburg
3,2.787498,643.0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-09-09,0,1,59.844074,30.179153,0,1,1,1,St.-Petersburg
4,2.89251,643.0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-06,0,1,59.844074,30.179153,0,1,1,1,St.-Petersburg


In [30]:
cities_dict = list(dt['city'].value_counts().keys())

In [31]:
dt['city'] = dt['city'].apply(lambda x: cities_dict.index(x))

### Обрабатываем дату транзакции и категориальные признаки

In [32]:
dt['currency'] = dt['currency'].fillna(-1).astype(np.int32)
dt['mcc'] = dt['mcc'].apply(lambda x: int(x.replace(',', ''))).astype(np.int32)
#dt['city'] = dt['city'].factorize()[0].astype(np.int32)
#dt['country'] = dt['country'].factorize()[0].astype(np.int32)

# удаляем транзакции без даты
dt.drop(dt[dt['transaction_date'].isnull()].index, axis = 0, inplace = True)
dt['transaction_date'] = dt['transaction_date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))

### Фичи для даты

In [47]:
dt['day'] = dt['transaction_date'].dt.day.astype(np.int32)
dt['month'] = dt['transaction_date'].dt.month.astype(np.int32)
dt['year'] = dt['transaction_date'].dt.year.astype(np.int32)
dt['is_year_end'] = dt['transaction_date'].dt.is_year_end.astype(np.int32)
dt['dayofweek'] = dt['transaction_date'].dt.dayofweek.astype(np.int32)

In [48]:
cal = calendar()
holidays = cal.holidays(start=dt['transaction_date'].min(), end=dt['transaction_date'].max())

In [49]:
dt['is_holiday'] = dt['transaction_date'].isin(holidays).astype(np.int32)

In [50]:
dt['year_month'] =  100 * dt['year'] + dt['month']

In [51]:
dt.head()

Unnamed: 0,amount,currency,customer_id,is_train,mcc,transaction_date,is_atm,is_pos,address_lat,address_lon,...,has_work,city,mcc2,day,is_year_end,dayofweek,is_holiday,month,year,year_month
0,2.884034,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-15,0,1,59.844074,30.179153,...,1,2,0,15,0,5,0,7,2017,201707
1,2.775633,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-10-27,0,1,59.844074,30.179153,...,1,2,0,27,0,4,0,10,2017,201710
2,3.708368,643,0dc0137d280a2a82d2dc89282450ff1b,1,5992,2017-10-03,0,1,59.8582,30.229023,...,1,2,1,3,0,1,0,10,2017,201710
3,2.787498,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-09-09,0,1,59.844074,30.179153,...,1,2,0,9,0,5,0,9,2017,201709
4,2.89251,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-06,0,1,59.844074,30.179153,...,1,2,0,6,0,3,0,7,2017,201707


### Преобразуем валюты

In [52]:
dt['currency'].unique()

array([643, 978, 784, 840], dtype=int64)

In [53]:
currency_dict = dict()
currency_dict[643] = 'RUB'
currency_dict[978] = 'EUR'
currency_dict[784] = 'AED'
currency_dict[840] = 'USD'

In [54]:
dt = dt.replace({'currency': currency_dict})

In [55]:
currency_list = list(dt['currency'].value_counts().keys())

In [56]:
dt['currency'] = dt['currency'].apply(lambda x: currency_list.index(x))

### Плотность транзакций юзера для каждой транзакции в кольцах (0.02, 0.05)

In [57]:
data = pd.read_csv('dd.csv')

In [58]:
data.columns

Index(['Unnamed: 0', 'density002', 'distance', 'min_dist', 'median_dist',
       'x_median', 'y_median'],
      dtype='object')

In [63]:
dt['density002'] = data['density002']
dt['distance'] = data['distance']
dt['min_dist'] = data['min_dist']
dt['median_dist'] = data['median_dist']
dt['x_median'] = data['x_median']
dt['y_median'] = data['y_median']

In [64]:
dt.head()

Unnamed: 0,amount,currency,customer_id,is_train,mcc,transaction_date,is_atm,is_pos,address_lat,address_lon,...,is_holiday,month,year,year_month,density002,distance,min_dist,median_dist,x_median,y_median
0,2.884034,0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-15,0,1,59.844074,30.179153,...,0,7,2017,201707,0.410256,0.000719,0.0,0.004338,59.848,30.181
1,2.775633,0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-10-27,0,1,59.844074,30.179153,...,0,10,2017,201710,0.410256,0.000719,0.0,0.004338,59.848,30.181
2,3.708368,0,0dc0137d280a2a82d2dc89282450ff1b,1,5992,2017-10-03,0,1,59.8582,30.229023,...,0,10,2017,201710,0.307692,0.00752,0.0002,0.049095,59.848,30.181
3,2.787498,0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-09-09,0,1,59.844074,30.179153,...,0,9,2017,201709,0.410256,0.000719,0.0,0.004338,59.848,30.181
4,2.89251,0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-06,0,1,59.844074,30.179153,...,0,7,2017,201707,0.410256,0.000719,0.0,0.004338,59.848,30.181


In [65]:
dt['density002'] = dt['density002'].fillna(-1). 
dt['distance'] = dt['distance'].fillna(-1)
dt['min_dist'] = dt['min_dist'].fillna(-1)
dt['median_dist'] = dt['median_dist'].fillna(-1)
dt['x_median'] = dt['x_median'].fillna(-1) 
dt['y_median'] = dt['y_median'].fillna(-1)

SyntaxError: invalid syntax (<ipython-input-65-2eef049ec93f>, line 1)

In [66]:
dt.head()

Unnamed: 0,amount,currency,customer_id,is_train,mcc,transaction_date,is_atm,is_pos,address_lat,address_lon,...,is_holiday,month,year,year_month,density002,distance,min_dist,median_dist,x_median,y_median
0,2.884034,0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-15,0,1,59.844074,30.179153,...,0,7,2017,201707,0.410256,0.000719,0.0,0.004338,59.848,30.181
1,2.775633,0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-10-27,0,1,59.844074,30.179153,...,0,10,2017,201710,0.410256,0.000719,0.0,0.004338,59.848,30.181
2,3.708368,0,0dc0137d280a2a82d2dc89282450ff1b,1,5992,2017-10-03,0,1,59.8582,30.229023,...,0,10,2017,201710,0.307692,0.00752,0.0002,0.049095,59.848,30.181
3,2.787498,0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-09-09,0,1,59.844074,30.179153,...,0,9,2017,201709,0.410256,0.000719,0.0,0.004338,59.848,30.181
4,2.89251,0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-06,0,1,59.844074,30.179153,...,0,7,2017,201707,0.410256,0.000719,0.0,0.004338,59.848,30.181


### Генерируем категориальный признак для адреса

In [67]:
dt['address'] = dt['address_lat'].apply(lambda x: "%.02f" % x) + ';' + dt['address_lon'].apply(lambda x: "%.02f" % x)
dt['address'] = dt['address'].factorize()[0].astype(np.int32)

### Генерируем несколько абонентских фич

In [68]:
# количество транзакций каждого клиента
dt = dt.merge(dt.groupby('customer_id')['amount'].count().reset_index(name = 'tx'), how = 'left')
dt['tx'] = dt['tx'].astype(np.int32)

dt = dt.merge(dt.groupby(['customer_id','address'])['amount'].count().reset_index(name = 'tx_cust_addr'), how = 'left')
dt['tx_cust_addr'] = dt['tx_cust_addr'].astype(np.int32)

# какая часть транзакций клиента приходится на данный адрес
dt['ratio1'] = dt['tx_cust_addr'] / dt['tx']

In [69]:
dt.head()

Unnamed: 0,amount,currency,customer_id,is_train,mcc,transaction_date,is_atm,is_pos,address_lat,address_lon,...,density002,distance,min_dist,median_dist,x_median,y_median,address,tx,tx_cust_addr,ratio1
0,2.884034,0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-15,0,1,59.844074,30.179153,...,0.410256,0.000719,0.0,0.004338,59.848,30.181,0,39,13,0.333333
1,2.775633,0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-10-27,0,1,59.844074,30.179153,...,0.410256,0.000719,0.0,0.004338,59.848,30.181,0,39,13,0.333333
2,3.708368,0,0dc0137d280a2a82d2dc89282450ff1b,1,5992,2017-10-03,0,1,59.8582,30.229023,...,0.307692,0.00752,0.0002,0.049095,59.848,30.181,1,39,5,0.128205
3,2.787498,0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-09-09,0,1,59.844074,30.179153,...,0.410256,0.000719,0.0,0.004338,59.848,30.181,0,39,13,0.333333
4,2.89251,0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-06,0,1,59.844074,30.179153,...,0.410256,0.000719,0.0,0.004338,59.848,30.181,0,39,13,0.333333


### Новые координаты

In [78]:
dt["rot45_X"] = .707 * dt["address_lon"] + .707 * dt["address_lat"]
dt["rot45_Y"] = .707 * dt["address_lon"] - .707 * dt["address_lat"]

dt["rot30_X"] = (1.732/2) * dt["address_lat"] + (1./2) * dt["address_lon"] 
dt["rot30_Y"] = (1.732/2) * dt["address_lon"] - (1./2) * dt["address_lat"]

dt["rot60_X"] = (1./2) * dt["address_lat"] + (1.732/2) * dt["address_lon"] 
dt["rot60_Y"] = (1./2) * dt["address_lon"] - (1.732/2) * dt["address_lat"]

dt["radial_r"] = np.sqrt(np.power(dt["address_lon"],2) + np.power(dt["address_lat"],2))

## Вспомогательные функции для оценки точности классификатора

In [79]:
def _best(x):
    ret = None
    for col in ys:
        pred = ('pred:%s' % col)
        if pred in x:
            i = (x[pred].idxmax())
            cols = [pred,'address_lat','address_lon']
            if col in x:
                cols.append(col)
            tmp = x.loc[i,cols]
            tmp.rename({
                'address_lat':'%s:add_lat' % col,
                'address_lon':'%s:add_lon' % col,
            }, inplace = True)
            if ret is None:
                ret = tmp
            else:
                ret = pd.concat([ret, tmp])
    return ret

In [80]:
def predict_proba(dt, ys = ['is_home', 'is_work']):
    for col in ys:
        pred = ('pred:%s' % col)
        dt[pred] = model[col].predict_proba(dt[xs])[:,1]
    return dt.groupby('customer_id').apply(_best).reset_index()

In [81]:
def score(dt, ys = ['is_home', 'is_work']):
    dt_ret = predict_proba(dt, ys)
    mean = 0.0
    for col in ys:
        col_mean = dt_ret[col].mean()
        mean += col_mean
    if len(ys) == 2:
        mean = mean / len(ys)
    return mean

### Признаки, на которых будем обучать модель

In [82]:
dt.head()

Unnamed: 0,amount,currency,customer_id,is_train,mcc,transaction_date,is_atm,is_pos,address_lat,address_lon,...,tx,tx_cust_addr,ratio1,rot45_X,rot45_Y,rot30_X,rot30_Y,rot60_X,rot60_Y,radial_r
0,2.884034,0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-15,0,1,59.844074,30.179153,...,39,13,0.333333,63.646422,-20.973099,66.914545,-3.78689,56.057184,-36.735392,67.023094
1,2.775633,0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-10-27,0,1,59.844074,30.179153,...,39,13,0.333333,63.646422,-20.973099,66.914545,-3.78689,56.057184,-36.735392,67.023094
2,3.708368,0,0dc0137d280a2a82d2dc89282450ff1b,1,5992,2017-10-03,0,1,59.8582,30.229023,...,39,5,0.128205,63.691667,-20.947828,66.951713,-3.750766,56.107434,-36.72269,67.058167
3,2.787498,0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-09-09,0,1,59.844074,30.179153,...,39,13,0.333333,63.646422,-20.973099,66.914545,-3.78689,56.057184,-36.735392,67.023094
4,2.89251,0,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-06,0,1,59.844074,30.179153,...,39,13,0.333333,63.646422,-20.973099,66.914545,-3.78689,56.057184,-36.735392,67.023094


In [83]:
#xs = ['amount','currency','city','country','mcc','is_atm','is_pos','ratio1']
ys = ['is_home', 'is_work']
xs = list(set(dt.columns) - set(ys) - set(['is_train', 'address_lat', 'address_lon', \
                                           'transaction_date', 'tx', 'tx_cust_addr', 'customer_id', 'density002']))

In [84]:
xs

['has_home',
 'y_median',
 'month',
 'is_pos',
 'mcc2',
 'year_month',
 'is_atm',
 'city',
 'mcc',
 'dayofweek',
 'rot30_X',
 'ratio1',
 'year',
 'x_median',
 'is_holiday',
 'rot30_Y',
 'is_year_end',
 'has_work',
 'address',
 'rot45_Y',
 'currency',
 'median_dist',
 'radial_r',
 'distance',
 'day',
 'rot60_X',
 'amount',
 'rot45_X',
 'rot60_Y',
 'min_dist']

# Создаем классификаторы
**Hint**: можно поигратьcя с гиперпараметрами для лучшего результата :)

In [85]:
model0 = {
    'is_home': xgb.XGBClassifier(max_depth = 8, n_estimators = 100, learning_rate=0.1, n_jobs = -1),
    'is_work': xgb.XGBClassifier(max_depth = 8, n_estimators = 100, learning_rate=0.1, n_jobs = -1),
}

# Обучаем классификаторы

In [86]:
model = {}

# последовательно обучаем два классификатора
for col in ['is_home', 'is_work']:
    
    #выберем для обучение транзакции только тех клиентов из train, у которых хоть в одной транзакции указано место работы/жительства
    cust_train = dt[dt['is_train'] == 1].groupby('customer_id')[col.replace('is_','has_')].max()
    cust_train = cust_train[cust_train > 0].index
    
    #разобъем train на train/valid для валидации
    cust_train, cust_valid = train_test_split(cust_train, test_size = 0.1, shuffle = True, random_state = 2)
    
    train = pd.DataFrame(cust_train, columns = ['customer_id']).merge(dt, how = 'left')
    valid = pd.DataFrame(cust_valid, columns = ['customer_id']).merge(dt, how = 'left')

    print ("Training:", col)
    clf = sklearn.base.clone(model0[col])
    clf.fit(train[xs], train[col], eval_metric = 'logloss', eval_set = [(train[xs], train[col]), (valid[xs], valid[col])], verbose=10)
    model[col] = clf
    print ("Train accuracy:", score(train, ys = [col]))
    print ("Test accuracy:", score(valid, ys = [col]))
    print ()


Training: is_home
[0]	validation_0-logloss:0.654758	validation_1-logloss:0.655156
[10]	validation_0-logloss:0.487413	validation_1-logloss:0.496747
[20]	validation_0-logloss:0.443426	validation_1-logloss:0.463257
[30]	validation_0-logloss:0.421596	validation_1-logloss:0.453436
[40]	validation_0-logloss:0.40634	validation_1-logloss:0.449024
[50]	validation_0-logloss:0.395453	validation_1-logloss:0.446496
[60]	validation_0-logloss:0.385897	validation_1-logloss:0.446075
[70]	validation_0-logloss:0.378545	validation_1-logloss:0.44528
[80]	validation_0-logloss:0.369094	validation_1-logloss:0.445836
[90]	validation_0-logloss:0.361707	validation_1-logloss:0.445567
[99]	validation_0-logloss:0.355842	validation_1-logloss:0.445378
Train accuracy: 0.4752222222222222
Test accuracy: 0.412

Training: is_work
[0]	validation_0-logloss:0.63885	validation_1-logloss:0.642084
[10]	validation_0-logloss:0.396934	validation_1-logloss:0.425868
[20]	validation_0-logloss:0.330108	validation_1-logloss:0.379786
[3

# Predict

In [87]:
cust_test = dt[dt['is_train'] == 0]['customer_id'].unique()
test = pd.DataFrame(cust_test, columns = ['customer_id']).merge(dt, how = 'left')
test = predict_proba(test)
test.rename(columns = {
        'customer_id':'_ID_',
        'is_home:add_lat': '_HOME_LAT_',
        'is_home:add_lon': '_HOME_LON_',
        'is_work:add_lat': '_WORK_LAT_',
        'is_work:add_lon': '_WORK_LON_'}, inplace = True)
test = test[['_ID_', '_WORK_LAT_', '_WORK_LON_', '_HOME_LAT_', '_HOME_LON_']]

# Формируем submission-файл

In [88]:
test_set = pd.read_csv('test_set.csv', dtype = dtypes, usecols = usecols_test)
submission = pd.DataFrame(test_set['customer_id'].unique(), columns = ['_ID_'])
# Заполняем пропуски
submission = submission.merge(test, how = 'left').fillna(0)

# Пишем файл submission
submission.to_csv('baseline-very-simple.csv', index = False)