# [Boosters] Raiffeisen Data Cup. Baseline
Общий подход:
- Добавляем к каждой транзакции столбец: is_work (если транзакция находится в пределах 0.02 от дома клиента)
- Добавляем к каждой транзакции столбец: is_home (если транзакция находится в пределах 0.02 от работы клиента)
- Обучаем классификатор предсказывающий вероятность (is_home == 1) для транзакции
- Обучаем классификатор предсказывающий вероятность (is_work == 1) для транзакции

Точность определения местоположения:
- для классификатора is_home: ~3x%
- для классификатора is_work: ~2x%
- общая оценка на Public Leaderboard: ???

Примечание
* Требуется Python версии 3.5
* Требуется библиотека xgboost (для обучения использовалась xgboost версии 0.7.post3)
* Требуются файлы: test_set.csv, train_set.csv в одном каталоге с данным скриптом
* Требования к памяти: должно работать с 2Гб свободного RAM
* Время работы: ~3 минуты (тестировалось на процессоре Intel Core i7-4770)

In [1]:
import pandas as pd
import numpy as np
import datetime

import xgboost as xgb
import sklearn

from sklearn.model_selection import train_test_split

In [2]:
# Определим типы колонок для экономии памяти
dtypes = {
    'transaction_date': str,
    'atm_address': str,
    'country': str,
    'city': str,
    'amount': np.float32,
    'currency': np.float32,
    'mcc': str,
    'customer_id': str,
    'pos_address': str,
    'atm_address': str,
    'pos_adress_lat': np.float32,
    'pos_adress_lon': np.float32,
    'pos_address_lat': np.float32,
    'pos_address_lon': np.float32,
    'atm_address_lat': np.float32,
    'atm_address_lon': np.float32,
    'home_add_lat': np.float32,
    'home_add_lon': np.float32,
    'work_add_lat': np.float32,
    'work_add_lon': np.float32,
}

# для экономии памяти будем загружать только часть атрибутов транзакций
usecols_train = ['customer_id', 'terminal_id', 'transaction_date','amount','country', 'city', 'currency', 'mcc', 'pos_adress_lat', 'pos_adress_lon', 'atm_address_lat', 'atm_address_lon','home_add_lat','home_add_lon','work_add_lat','work_add_lon']
usecols_test = ['customer_id', 'terminal_id', 'transaction_date','amount','country', 'city', 'currency', 'mcc', 'pos_address_lat', 'pos_address_lon', 'atm_address_lat', 'atm_address_lon']

## Читаем train_set, test_set, соединяем в один датасет

In [3]:
train = pd.read_csv('train_set.csv', dtype = dtypes, usecols = usecols_train)
train.rename(columns = {'pos_adress_lat': 'pos_address_lat', 'pos_adress_lon': 'pos_address_lon'}, inplace = True)


test = pd.read_csv('test_set.csv', dtype = dtypes, usecols = usecols_test)
submission = pd.DataFrame(test['customer_id'].unique(), columns = ['_ID_'])

#### mcc

In [4]:
train['mcc'] = train['mcc'].apply(lambda x: int(x.replace(',', ''))).astype(np.int32)
test['mcc'] = test['mcc'].apply(lambda x: int(x.replace(',', ''))).astype(np.int32)

In [5]:
mcc_grouped = train.groupby('mcc')['customer_id'].count().sort_values(ascending=False).to_frame().reset_index()

In [6]:
lst = []

with open('mcc.csv', 'r') as f:
    for line in f.readlines():
        lst.append(line.strip().split(';'))

mcc_codes = pd.DataFrame(lst, columns=['mcc', 'description', 'group'])
mcc_codes['mcc'] = mcc_codes.mcc.astype(int)

In [7]:
# mcc_group_codes = mcc_grouped.merge(mcc_codes, how='left')

In [8]:
# all_trans = mcc_group_codes.customer_id.sum()
# mcc_group_codes['part'] = mcc_group_codes.customer_id.apply(lambda x: x / all_trans)

In [9]:
# train['address_lat'] = train['atm_address_lat'].fillna(0) + train['pos_address_lat'].fillna(0)
# train['address_lon'] = train['atm_address_lon'].fillna(0) + train['pos_address_lon'].fillna(0)

In [10]:
# for idx in mcc_group_codes.index:
#     current_mcc = mcc_group_codes.get_value(idx, 'mcc')
#     all_trans = mcc_group_codes.get_value(idx, 'customer_id')
    
#     df = train[train.mcc == current_mcc][['address_lat','address_lon', 'home_add_lat', 'home_add_lon']].dropna()
#     df['dist'] = ((df.address_lat - df.home_add_lat) ** 2 + (df.address_lon - df.home_add_lon) ** 2) ** 0.5
    
#     distances = df.dist.values
#     mcc_group_codes.set_value(idx, 'percent_near_home', len(distances[distances <= 0.02]) / all_trans)

In [11]:
# train.drop(['address_lat','address_lon'], axis = 1, inplace = True)

In [12]:
mcc_grouped.head()

Unnamed: 0,mcc,customer_id
0,5411,391635
1,6011,281885
2,5814,128771
3,5812,62407
4,5499,44703


In [13]:
# train = train.merge(mcc_group_codes[['mcc', 'part', 'percent_near_home']], how='left')
# test = test.merge(mcc_group_codes[['mcc', 'part', 'percent_near_home']], how='left')

In [14]:
# соединяем test/train в одном DataFrame
train['is_train'] = np.int32(1)
test['is_train'] = np.int32(0)
dt = pd.concat([train, test], ignore_index=True)

del train, test

In [15]:
# dt['part'] = dt['part'].fillna(-1).astype(np.float64)
# dt['percent_near_home'] = dt['percent_near_home'].fillna(-1).astype(np.float64)

In [16]:
dt = dt.merge(mcc_codes[['mcc', 'group']], how='left')

In [17]:
customer_cars = {x: 0 for x in dt[dt.group == 'Автомобили и транспортные средства']['customer_id'].unique()}

In [18]:
dt['has_car'] = dt.group.apply(lambda x: 1 if x in customer_cars else 0)

In [19]:
dt.drop(dt[dt.mcc == 5542].index, inplace=True)

In [20]:
from sklearn.preprocessing import OneHotEncoder

In [21]:
enc = OneHotEncoder(sparse=False)

In [22]:
dt['group'] = dt['group'].fillna(-1).factorize()[0].astype(np.int32)

In [23]:
encoded_group = pd.DataFrame(enc.fit_transform(dt[['group']]))
del dt['group']
dt = pd.concat([dt, encoded_group], axis=1)

### Обрабатываем дату транзакции и категориальные признаки

In [24]:
dt['currency'] = dt['currency'].fillna(-1).astype(np.int32)
# dt['mcc'] = dt['mcc'].apply(lambda x: int(x.replace(',', ''))).astype(np.int32)

dt_abroad = dt[~dt.country.isin(['RUS', 'RU '])]
dt_abroad_dict = {x: 0 for x in dt_abroad.customer_id.unique()}

del dt_abroad

dt['was_abroad'] = dt.customer_id.apply(lambda x: 1 if x in dt_abroad_dict else 0)

dt['country'] = dt['country'].apply(lambda x: 'RUS' if x == 'RU ' else x)
dt['country'] = dt['country'].factorize()[0].astype(np.int32)

# удаляем транзакции без даты
dt.drop(dt[dt['transaction_date'].isnull()].index, axis = 0, inplace = True)
dt['transaction_date'] = dt['transaction_date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))

### Города

In [25]:
# with open('cities.txt', 'w') as f:
#     for city in dt['city'].unique():
#         if type(city) == float:
#             continue
#         f.write(city + '\n')

In [26]:
# def convert_city(city):
#     if type(city) == float:
#         return city
#     elif 'PETER' in city.upper():
#         return 'ST PETERBURG'
#     elif 'MOSC' in city.upper() or 'MOSK' in city.upper():
#         return 'MOSCOW'
#     else:
#         return city

In [27]:
# dt['city'] = dt['city'].apply(lambda x: convert_city(x))

In [28]:
# import reverse_geocoder as rg

In [29]:
# coords_dict = {}

# for coords in dt[['pos_address_lat', 'pos_address_lon']].dropna().values:
#     coords = tuple(coords)

#     if coords not in coords_dict:
#         coords_dict[coords] = rg.search(coords)[0]['admin1'] 

        
# for coords in dt[['atm_address_lat', 'atm_address_lon']].dropna().values:
#     coords = tuple(coords)

#     if coords not in coords_dict:
#         coords_dict[coords] = rg.search(coords)[0]['admin1']      

In [30]:
# with open('coords_trans.csv', 'w') as f:
#     for coords, place in coords_dict.items():
#         f.write("{}, {}, {}\n".format(coords[0], coords[1], place))

In [31]:
df_coords = pd.read_csv('coords_trans.csv', names=['lat', 'lon', 'city'])

In [32]:
df_coords['city'] = df_coords['city'].apply(lambda x: 'St.-Petersburg' if x == 'Leningrad' else x)

In [33]:
coords_dict = {}

for val in df_coords.values:
    coords_dict[(val[0], val[1])] = val[2]

In [34]:
dt['cities_new'] = ''

In [35]:
for idx in dt.index:

    pos = dt.get_value(idx, 'pos_address_lat'), dt.get_value(idx, 'pos_address_lon')
    if pos in coords_dict:
        dt.set_value(idx, 'cities_new', coords_dict[pos])
        continue
    
    atm = tuple((dt.get_value(idx, 'atm_address_lat'), dt.get_value(idx, 'atm_address_lon')))
    if atm in coords_dict:
        dt.set_value(idx, 'cities_new', coords_dict[atm])
        continue

In [36]:
coords_dict_cutted = {}

for key, value in coords_dict.items():
    coords_dict_cutted[(np.float32(round(key[0], 6)), np.float32(round(key[1], 6)))] = value

In [37]:
for idx in dt[dt['cities_new'] == ''].index:

    pos = dt.get_value(idx, 'pos_address_lat'), dt.get_value(idx, 'pos_address_lon')
    if pos in coords_dict_cutted:
        dt.set_value(idx, 'cities_new', coords_dict_cutted[pos])
        continue
    
    atm = tuple((dt.get_value(idx, 'atm_address_lat'), dt.get_value(idx, 'atm_address_lon')))
    if atm in coords_dict_cutted:
        dt.set_value(idx, 'cities_new', coords_dict_cutted[atm])
        continue

In [38]:
# dt['cities_new'].unique()

In [39]:
# with open('left_cities.txt', 'w') as f:
#     for c in dt[dt['cities_new'] == '']['city'].dropna().unique():
#         f.write(c + '\n')

In [40]:
# add_dict = {}
# counter = 0

# for coords in dt[dt['cities_new'] == ''][['pos_address_lat', 'pos_address_lon']].dropna().drop_duplicates().values:
#     coords = tuple(coords)
#     counter += 1
#     if counter % 100 == 0:
#         print(counter)
    
#     if coords not in add_dict:
#         add_dict[coords] = rg.search(coords)[0]['admin1'] 

In [41]:
# with open('coords_trans.csv', 'a') as f:
#     for coords, place in add_dict_dict.items():
#         f.write("{}, {}, {}\n".format(coords[0], coords[1], place))

In [42]:
# lst = {}

# for coords in dt[['pos_address_lat', 'pos_address_lon']].values:
#     try:
#         lst.append(rg.search(tuple(coords))['admin1'])
#     except:
#         lst.append('')

In [43]:
dt['cities_new'] = dt['cities_new'].apply(lambda x: x[1:].upper())

In [44]:
dt['city'] = dt['city'].apply(lambda x: x.upper() if type(x) != float else x) 

In [45]:
map_old_new_cities = {
    'MOSKVA': 'MOSCOW',
    'MOSCOW': 'MOSCOW',
    'MOSKVA G': 'MOSCOW',
    'SANKT-PETERBU': 'ST.-PETERSBURG',
    'ST PETERSBURG': 'ST.-PETERSBURG',
    'ST-PETERSBURG': 'ST.-PETERSBURG',
    'SAINT PETERSB': 'ST.-PETERSBURG',
    'ST.PETERSBURG': 'ST.-PETERSBURG',
    'ST-PETERBURG': 'ST.-PETERSBURG',
    'ST PETERBURG': 'ST.-PETERSBURG',
    'EKATERINBURG': 'SVERDLOVSK',
    'KRASNODAR': 'KRASNODARSKIY',
    'NVSIBR': 'NOVOSIBIRSK',
    'NOVOSIBIRSK': 'NOVOSIBIRSK',
    'OMSK': 'OMSK',
    'KAZAN':'TATARSTAN',
    'VORONEZH': 'VORONEZJ',
    'MO': 'MOSKOVSKAYA',
    'PODOLSK': 'MOSKOVSKAYA',
    'KRASNOGORSK': 'MOSKOVSKAYA',
    'HIMKI': 'MOSKOVSKAYA',
    'KOROLEV': 'MOSKOVSKAYA',
    'UFA': 'BASHKORTOSTAN',
    'ROSTOV-NA-DON': 'ROSTOV',
    'CHEREPOVETS': 'VOLOGDA',
    'CHEREPOVEC': 'VOLOGDA',
    'SEVASTOPOL': "MISTO SEVASTOPOL'",
    'N NOVGOROD': 'NOVGOROD',
    'NIZHNIY NOVGO': 'NOVGOROD',
    'N.NOVGOROD': 'NOVGOROD',
    'OREL': 'ORJOL'
}

In [46]:
# sorted(dt['cities_new'].unique())

In [47]:
for idx in dt[dt['cities_new'] == ''].index:

    city = dt.get_value(idx, 'city')
    if city in map_old_new_cities:
        dt.set_value(idx, 'cities_new', map_old_new_cities[city])
    else:
        dt.set_value(idx, 'cities_new', city)

In [48]:
dt['city'] = dt['cities_new']
del dt['cities_new']

In [49]:
dt['city'] = dt['city'].factorize()[0].astype(np.int32)

### Фичи денег

In [50]:
df_pays = dt[['transaction_date', 'amount', 'customer_id']].groupby(['customer_id', 'transaction_date'])['amount'].count().to_frame().reset_index()

In [51]:
df_pays.rename(columns={'amount': 'num_of_pays'}, inplace=True)

In [52]:
dt = dt.merge(df_pays, on=['customer_id', 'transaction_date'], how='left')

In [53]:
df_pays = dt[['transaction_date', 'amount', 'customer_id']].groupby(['customer_id', 'transaction_date'])['amount'].sum().to_frame().reset_index()

In [54]:
df_pays.rename(columns={'amount': 'sum_of_pays'}, inplace=True)

In [55]:
dt = dt.merge(df_pays, on=['customer_id', 'transaction_date'], how='left')

In [56]:
del df_pays

### Фичи для даты

In [57]:
dt['weekday'] = dt['transaction_date'].dt.weekday.astype(np.int32)

In [58]:
dt['month'] = dt['transaction_date'].dt.month.astype(np.int32)

In [59]:
dt['is_holiday'] = dt['weekday'].apply(lambda x: 1 if x == 5 or x == 6 else 0)

In [60]:
vacances = [
    np.datetime64('2017-02-23'), 
    np.datetime64('2017-02-24'), 
    np.datetime64('2017-03-08'),
    np.datetime64('2017-05-01'), 
    np.datetime64('2017-05-08'),
    np.datetime64('2017-05-09'),
    np.datetime64('2017-06-12'),
    np.datetime64('2017-11-06')
]

dt['is_vacance'] = dt['transaction_date'].apply(lambda x: 1 if x in vacances else 0)
dt['is_holiday'] = dt['is_holiday'] + dt['is_vacance']

del dt['is_vacance']

In [61]:
def set_length_holiday(day):
    
    if day in (
        np.datetime64('2017-11-06'), 
        np.datetime64('2017-11-05'),
        np.datetime64('2017-11-04'),
        np.datetime64('2017-06-12'),
        np.datetime64('2017-06-11'),
        np.datetime64('2017-06-10'),
        np.datetime64('2017-05-01'),
        np.datetime64('2017-04-29'),
        np.datetime64('2017-04-30'),
    ):
        return 3
    
    elif day in (
        np.datetime64('2017-05-06'),
        np.datetime64('2017-05-07'),
        np.datetime64('2017-05-08'),
        np.datetime64('2017-05-09'),
    ):
        return 4
    elif day == np.datetime64('2017-03-08'):
        return 1
    
    return 2
    

In [63]:
dt['holidays_length'] = dt['transaction_date'].apply(lambda x: set_length_holiday(x))

### Приводим адрес транзакции для pos и atm-транзакций к единообразному виду

In [64]:
atm_df = dt[~dt.atm_address_lon.isnull()].groupby('terminal_id')[['atm_address_lat', 'atm_address_lon']].median()

In [65]:
atm_coords = {}

for idx in atm_df.index:
    atm_coords[idx] = (atm_df.get_value(idx, 'atm_address_lat'), atm_df.get_value(idx, 'atm_address_lon'))

In [66]:
del atm_df

In [67]:
for idx in dt[~dt.atm_address_lon.isnull()].index:
    term_id = dt.get_value(idx, 'terminal_id')
    dt.set_value(idx, 'atm_address_lat', atm_coords[term_id][0])
    dt.set_value(idx, 'atm_address_lon', atm_coords[term_id][1])

In [68]:
dt['is_atm'] = (~dt['atm_address_lat'].isnull()).astype(np.int32)
dt['is_pos'] = (~dt['pos_address_lat'].isnull()).astype(np.int32)

dt['address_lat'] = dt['atm_address_lat'].fillna(0) + dt['pos_address_lat'].fillna(0)
dt['address_lon'] = dt['atm_address_lon'].fillna(0) + dt['pos_address_lon'].fillna(0)

dt.drop(['atm_address_lat','atm_address_lon','pos_address_lat','pos_address_lon'], axis = 1, inplace = True)

# удалим транзакции без адреса
dt.drop(dt[((dt['address_lon'] == 0) & (dt['address_lon'] == 0))].index, axis = 0, inplace = True)

In [69]:
del dt['terminal_id']

### Генерируем признаки is_home, is_work

In [70]:
lat = dt['home_add_lat'] - dt['address_lat']
lon = dt['home_add_lon'] - dt['address_lon']
dt['is_home'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int32)
dt['has_home'] = (~dt['home_add_lon'].isnull()).astype(np.int32)

lat = dt['work_add_lat'] - dt['address_lat']
lon = dt['work_add_lon'] - dt['address_lon']
dt['is_work'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int32)
dt['has_work'] = (~dt['work_add_lon'].isnull()).astype(np.int32)

dt.drop(['work_add_lat','work_add_lon','home_add_lat','home_add_lon'], axis = 1, inplace = True)

### Генерируем категориальный признак для адреса

In [71]:
dt['address'] = dt['address_lat'].apply(lambda x: "%.02f" % x) + ';' + dt['address_lon'].apply(lambda x: "%.02f" % x)
dt['address'] = dt['address'].factorize()[0].astype(np.int32)

### Генерируем несколько абонентских фич

In [72]:
# количество транзакций каждого клиента
dt = dt.merge(dt.groupby('customer_id')['amount'].count().reset_index(name = 'tx'), how = 'left')
dt['tx'] = dt['tx'].astype(np.int32)

dt = dt.merge(dt.groupby(['customer_id','address'])['amount'].count().reset_index(name = 'tx_cust_addr'), how = 'left')
dt['tx_cust_addr'] = dt['tx_cust_addr'].astype(np.int32)

# какая часть транзакций клиента приходится на данный адрес
dt['ratio1'] = dt['tx_cust_addr'] / dt['tx']

## Вспомогательные функции для оценки точности классификатора

In [73]:
def _best(x):
    ret = None
    for col in ys:
        pred = ('pred:%s' % col)
        if pred in x:
            i = (x[pred].idxmax())
            cols = [pred,'address_lat','address_lon']
            if col in x:
                cols.append(col)
            tmp = x.loc[i,cols]
            tmp.rename({
                'address_lat':'%s:add_lat' % col,
                'address_lon':'%s:add_lon' % col,
            }, inplace = True)
            if ret is None:
                ret = tmp
            else:
                ret = pd.concat([ret, tmp])
    return ret

In [74]:
def predict_proba(dt, ys = ['is_home', 'is_work']):
    for col in ys:
        pred = ('pred:%s' % col)
        dt[pred] = model[col].predict_proba(dt[xs])[:,1]
    return dt.groupby('customer_id').apply(_best).reset_index()

In [75]:
def score(dt, ys = ['is_home', 'is_work']):
    dt_ret = predict_proba(dt, ys)
    mean = 0.0
    for col in ys:
        col_mean = dt_ret[col].mean()
        mean += col_mean
    if len(ys) == 2:
        mean = mean / len(ys)
    return mean

### Признаки, на которых будем обучать модель

In [76]:
dt.columns

Index([          'amount',             'city',          'country',
               'currency',      'customer_id',         'is_train',
                    'mcc', 'transaction_date',          'has_car',
                        0,                  1,                  2,
                        3,                  4,                  5,
                        6,                  7,                  8,
                        9,                 10,                 11,
                       12,                 13,                 14,
                       15,                 16,                 17,
                       18,                 19,                 20,
             'was_abroad',      'num_of_pays',      'sum_of_pays',
                'weekday',            'month',       'is_holiday',
        'holidays_length',           'is_atm',           'is_pos',
            'address_lat',      'address_lon',          'is_home',
               'has_home',          'is_work',         'has_wo

In [77]:
xs = list(set(dt.columns) - set(['is_atm', 'is_pos', 'address_lat', 'address_lon', 'is_home', 'has_home',
       'is_work', 'has_work', 'address', 'tx', 'tx_cust_addr', 'transaction_date',
                                'customer_id']))

# ['amount','currency','city', 'country', 'mcc','is_atm',
#       'is_pos','ratio1', 'weekday', 'is_holiday', 'was_abroad',
#      'is_vacance'
#      'num_of_pays', 'sum_of_pays',
#       'holidays_length', 'month',
#       'group'
#       'part', 'percent_near_home'
#      ]
ys = ['is_home', 'is_work']

In [78]:
xs

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 'has_car',
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 'ratio1',
 'sum_of_pays',
 'is_holiday',
 'weekday',
 'was_abroad',
 'city',
 'amount',
 'mcc',
 'num_of_pays',
 'country',
 'currency',
 'is_train',
 'month',
 'holidays_length']

# Создаем классификаторы
**Hint**: можно поигратьcя с гиперпараметрами для лучшего результата :)

In [79]:
model0 = {
    'is_home': xgb.XGBClassifier(n_estimators = 300, n_jobs = 3),
    'is_work': xgb.XGBClassifier(n_estimators = 300, n_jobs = 3),
}

# Обучаем классификаторы

In [80]:
model = {}

# последовательно обучаем два классификатора
for col in ['is_home', 'is_work']:
    
    #выберем для обучение транзакции только тех клиентов из train, у которых хоть в одной транзакции указано место работы/жительства
    cust_train = dt[dt['is_train'] == 1].groupby('customer_id')[col.replace('is_','has_')].max()
    cust_train = cust_train[cust_train > 0].index
    
    #разобъем train на train/valid для валидации
    cust_train, cust_valid = train_test_split(cust_train, test_size = 0.1, shuffle = True, random_state = 2)
    
    train = pd.DataFrame(cust_train, columns = ['customer_id']).merge(dt, how = 'left')
    valid = pd.DataFrame(cust_valid, columns = ['customer_id']).merge(dt, how = 'left')

    print ("Training:", col)
    clf = sklearn.base.clone(model0[col])
    clf.fit(train[xs], train[col], eval_metric = 'logloss', eval_set = [(train[xs], train[col]), (valid[xs], valid[col])], verbose=10)
    model[col] = clf
    print ("Train accuracy:", score(train, ys = [col]))
    print ("Test accuracy:", score(valid, ys = [col]))
    print ()


Training: is_home
[0]	validation_0-logloss:0.660289	validation_1-logloss:0.658774
[10]	validation_0-logloss:0.523509	validation_1-logloss:0.516517
[20]	validation_0-logloss:0.495826	validation_1-logloss:0.488683
[30]	validation_0-logloss:0.486924	validation_1-logloss:0.479656
[40]	validation_0-logloss:0.482567	validation_1-logloss:0.475493
[50]	validation_0-logloss:0.479972	validation_1-logloss:0.472878
[60]	validation_0-logloss:0.477583	validation_1-logloss:0.470726
[70]	validation_0-logloss:0.475585	validation_1-logloss:0.469196
[80]	validation_0-logloss:0.474339	validation_1-logloss:0.46875
[90]	validation_0-logloss:0.472966	validation_1-logloss:0.467723
[100]	validation_0-logloss:0.471905	validation_1-logloss:0.46703
[110]	validation_0-logloss:0.471126	validation_1-logloss:0.466606
[120]	validation_0-logloss:0.470241	validation_1-logloss:0.465815
[130]	validation_0-logloss:0.469324	validation_1-logloss:0.465695
[140]	validation_0-logloss:0.46838	validation_1-logloss:0.465333
[150]	

# Predict

In [81]:
cust_test = dt[dt['is_train'] == 0]['customer_id'].unique()
test = pd.DataFrame(cust_test, columns = ['customer_id']).merge(dt, how = 'left')
test = predict_proba(test)
test.rename(columns = {
        'customer_id':'_ID_',
        'is_home:add_lat': '_HOME_LAT_',
        'is_home:add_lon': '_HOME_LON_',
        'is_work:add_lat': '_WORK_LAT_',
        'is_work:add_lon': '_WORK_LON_'}, inplace = True)
test = test[['_ID_', '_WORK_LAT_', '_WORK_LON_', '_HOME_LAT_', '_HOME_LON_']]

# Формируем submission-файл

In [82]:
# Заполняем пропуски
submission = submission.merge(test, how = 'left').fillna(0)

# Пишем файл submission
submission.to_csv('baseline-very-simple.csv', index = False)

In [83]:
submission.columns

Index(['_ID_', '_WORK_LAT_', '_WORK_LON_', '_HOME_LAT_', '_HOME_LON_'], dtype='object')

In [84]:
submission[submission._WORK_LAT_ == 0]

Unnamed: 0,_ID_,_WORK_LAT_,_WORK_LON_,_HOME_LAT_,_HOME_LON_


In [85]:
def check_submit(path_to_csv):
    """
    Dummy checking of submission
    
    :param path_to_csv: path to your submission file
    """
    df = pd.read_csv(path_to_csv)
    assert df.shape == (9997, 5), u'Мало или много строк'
    # несмотря на то, что названия не имеют особого значения, правильный порядк колонок позволит не запутаться в широте-долготе
    assert list(df.columns) == ['_ID_', '_WORK_LAT_', '_WORK_LON_', '_HOME_LAT_', '_HOME_LON_'], u'Неверные названия столбцов'
    assert np.any(df['_ID_'].duplicated()) == False, u'Одному клиенту соответствует больше одной записи'
    for col_name in df.columns:
        if col_name != '_ID_':
            assert df[col_name].dtype in (np.float, np.int), u'В колонке {col_name} есть NULL'.format(col_name=col_name)
        assert df[col_name].isnull().sum() == 0, u'В колонке {col_name} есть NULL'.format(col_name=col_name)

In [86]:
check_submit('baseline-very-simple.csv')