# Новые фичи
Цифры по mcc  
Погода по месту  
расстояние до дальнейшего соседа  
максимальная продолжительность приобретений в данной точке по дням

# [Boosters] Raiffeisen Data Cup. Baseline
Общий подход:
- Добавляем к каждой транзакции столбец: is_work (если транзакция находится в пределах 0.02 от дома клиента)
- Добавляем к каждой транзакции столбец: is_home (если транзакция находится в пределах 0.02 от работы клиента)
- Обучаем классификатор предсказывающий вероятность (is_home == 1) для транзакции
- Обучаем классификатор предсказывающий вероятность (is_work == 1) для транзакции

Точность определения местоположения:
- для классификатора is_home: ~3x%
- для классификатора is_work: ~2x%
- общая оценка на Public Leaderboard: ???

Примечание
* Требуется Python версии 3.5
* Требуется библиотека xgboost (для обучения использовалась xgboost версии 0.7.post3)
* Требуются файлы: test_set.csv, train_set.csv в одном каталоге с данным скриптом
* Требования к памяти: должно работать с 2Гб свободного RAM
* Время работы: ~3 минуты (тестировалось на процессоре Intel Core i7-4770)

In [17]:
%load_ext autoreload
%autoreload 2

import sys
MODULES_PATH = '../code/'
if MODULES_PATH not in sys.path:
    sys.path.append(MODULES_PATH)
import mfuncs
    
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
pd.options.display.max_columns = 1000
pd.options.display.max_colwidth = -1

import lightgbm as lgb


from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth, AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score

import gmaps
API_KEY = 'AIzaSyCG_RL0_kavuEaJAqEN5xXbU4h0VJUbA9M'
gmaps.configure(api_key=API_KEY) # Your Google API key
%pylab inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [18]:
# Определим типы колонок для экономии памяти
dtypes = {
    'transaction_date': str,
    'atm_address': str,
    'country': str,
    'city': str,
    'amount': np.float32,
    'currency': np.float32,
    'mcc': str,
    'customer_id': str,
    'pos_address': str,
    'atm_address': str,
    'pos_lat': np.float32,
    'pos_lon': np.float32,
    'atm_lat': np.float32,
    'atm_lon': np.float32,
    'home_lat': np.float32,
    'home_lon': np.float32,
    'work_lat': np.float32,
    'work_lon': np.float32,
}
df_all = pd.read_csv('../data/df_all.csv', dtype=dtypes)

### Обрабатываем дату транзакции и категориальные признаки

In [19]:
df_all['currency'] = df_all['currency'].fillna(-1).astype(np.int32)
df_all['mcc'] = df_all['mcc'].apply(lambda x: int(x.replace(',', ''))).astype(np.int32)
df_all['city'] = df_all['city_name'].factorize()[0].astype(np.int32)
df_all['country'] = df_all['country'].factorize()[0].astype(np.int32)

### Фичи для даты

In [20]:
# удаляем транзакции без даты
df_all = df_all[~df_all['transaction_date'].isnull()]
df_all['transaction_date'] =  pd.to_datetime(df_all['transaction_date'], format='%Y-%m-%d')

In [21]:
df_all['month'] = df_all.transaction_date.dt.month
df_all['day'] = df_all.transaction_date.dt.day
df_all['dayofyear'] = df_all.transaction_date.dt.dayofyear
df_all['dayofweek'] = df_all.transaction_date.dt.dayofweek

In [22]:
# праздники
holidays_df = pd.read_csv('../data/internal/all_holidays.csv', header=None)
holidays_df[0] = pd.to_datetime(holidays_df[0])
holidays_df = holidays_df[holidays_df[0].dt.year == 2017]
holidays = holidays_df[0].dt.dayofyear.values
df_all['is_weekend'] = (df_all.dayofweek >= 6).astype(np.int8)
df_all['is_state_holiday'] = df_all['dayofyear'].isin(holidays).astype(np.int8)
df_all['is_holiday'] = df_all['is_weekend'] | df_all['is_state_holiday']

### Приводим адрес транзакции для pos и atm-транзакций к единообразному виду
Просто объединяем в одну колонку и добавляем фичу - это атм или пос

In [23]:
df_all['is_atm'] = (~df_all['atm_lat'].isnull()).astype(np.int8)
df_all['is_pos'] = (~df_all['pos_lat'].isnull()).astype(np.int8)

df_all['add_lat'] = df_all['atm_lat'].fillna(0) + df_all['pos_lat'].fillna(0)
df_all['add_lon'] = df_all['atm_lon'].fillna(0) + df_all['pos_lon'].fillna(0)

df_all.drop(['atm_lat','atm_lon','pos_lat','pos_lon'], axis=1, inplace=True)

df_all = df_all[~((df_all['add_lon'] == 0) & (df_all['add_lon'] == 0))]

In [24]:
%%time
# грязный хак, чтобы не учить КНН на новом юзере каждый раз
df_all['fake_customer_id'] = (pd.factorize(df_all.customer_id)[0] + 1) * 100

points = df_all[['fake_customer_id', 'add_lat', 'add_lon']].drop_duplicates().values
neigh = NearestNeighbors(2, radius=100000)

# расстояние до уникальных точек
# neigh.fit(np.unique(points, axis=1))
neigh.fit(points) 

distances, indices = neigh.kneighbors(df_all[['fake_customer_id', 'add_lat', 'add_lon']].values)
df_all['distance_to_nearest_point'] = distances[:, 1]
del df_all['fake_customer_id']

CPU times: user 4.5 s, sys: 75.6 ms, total: 4.58 s
Wall time: 4.57 s


In [25]:
# кластерные фичи
df_cluster = pd.read_csv('../data/df_cluster.csv')
df_cluster.reset_index(drop=True, inplace=True)
df_all.reset_index(drop=True, inplace=True)
df_all = pd.concat([df_all, df_cluster.iloc[:, 3:]], axis=1)
df_cluster.head()

Unnamed: 0,customer_id,add_lat,add_lon,cl_label,cl_score,cl_lat,cl_lon
0,0dc0137d280a2a82d2dc89282450ff1b,59.844074,30.179153,0.0,0.933796,59.841676,30.177239
1,0dc0137d280a2a82d2dc89282450ff1b,59.844074,30.179153,0.0,0.933796,59.841676,30.177239
2,0dc0137d280a2a82d2dc89282450ff1b,59.8582,30.229023,1.0,0.551394,59.865354,30.247539
3,0dc0137d280a2a82d2dc89282450ff1b,59.844074,30.179153,0.0,0.933796,59.841676,30.177239
4,0dc0137d280a2a82d2dc89282450ff1b,59.844074,30.179153,0.0,0.933796,59.841676,30.177239


### Генерируем признаки is_home, is_work
TODO: удалить чуваков у которых несколько домов

In [28]:
lat = df_all['home_lat'] - df_all['add_lat']
lon = df_all['home_lon'] - df_all['add_lon']

df_all['is_home'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int8)
df_all['has_home'] = (~df_all['home_lon'].isnull()).astype(np.int8)

lat = df_all['work_lat'] - df_all['add_lat']
lon = df_all['work_lon'] - df_all['add_lon']
df_all['is_work'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int8)
df_all['has_work'] = (~df_all['work_lon'].isnull()).astype(np.int8)

# df_all.drop(['work_lat','work_lon','home_lat','home_lon'], axis=1, inplace=True)

### Генерируем категориальный признак для адреса

In [29]:
df_all['address'] = df_all['add_lat'].apply(lambda x: "%.02f" % x) + ';' + df_all['add_lon'].apply(lambda x: "%.02f" % x)
df_all['address'] = df_all['address'].factorize()[0].astype(np.int32)

### Генерируем абонентские фичи отвечающие за соотношения между точками

In [30]:
df_all = df_all.merge(df_all.groupby('customer_id')['amount'].count().reset_index(name='cid_trans_count'), how='left')
df_all['cid_trans_count'] = df_all['cid_trans_count'].astype(np.int32)

df_all = df_all.merge(df_all.groupby('customer_id')['amount'].agg('sum').reset_index(name='cid_trans_sum'), how='left')
df_all['cid_trans_sum'] = df_all['cid_trans_sum'].astype(np.float32)

In [31]:
def add_count_sum_ratios(df_all, col):
    col_count = 'cid_{}_trans_count'.format(col)
    col_sum = 'cid_{}_trans_sum'.format(col)
    df_ = df_all.groupby(['customer_id', col])['amount'].count().reset_index(name=col_count)
    df_all = df_all.merge(df_, how='left')
    df_all[col_count] = df_all[col_count].astype(np.int32)
    df_all['ratio_{}_count'.format(col)] = df_all[col_count] / df_all['cid_trans_count']
    
    df_ = df_all.groupby(['customer_id', col])['amount'].agg('sum').reset_index(name=col_sum)
    df_all = df_all.merge(df_, how='left')
    df_all[col_sum] = df_all[col_sum].astype(np.float32)
    df_all['ratio_{}_sum'.format(col)] = df_all[col_sum] / df_all['cid_trans_sum']
    return df_all

In [32]:
df_all = add_count_sum_ratios(df_all, 'address')
df_all = add_count_sum_ratios(df_all, 'terminal_id')
df_all = add_count_sum_ratios(df_all, 'mcc')
df_all = add_count_sum_ratios(df_all, 'is_holiday')
df_all = add_count_sum_ratios(df_all, 'city')

## Мои фичи

In [35]:
# добавим признаки после групбая
df_gb = df_all[['customer_id','amount', 'add_lat', 'add_lon']].groupby('customer_id')
coord_stat_df = df_gb.agg(['mean', 'max', 'min'])
coord_stat_df['transactions_per_user'] = df_gb.agg('size')
coord_stat_df.columns = ['_'.join(col).strip() for col in coord_stat_df.columns.values]
coord_stat_df.reset_index(inplace=True)
df_all = pd.merge(df_all, coord_stat_df, on='customer_id', how='left')

In [36]:
cols = ['add_lat', 'add_lon']
types = ['min', 'max', 'mean']
for c in cols:
    for t in types:
        df_all['{}_diff_{}'.format(c, t)] = np.abs(df_all[c] - df_all['{}_{}'.format(c, t)])

In [37]:
# разности 
df_all['lat_diff_cluster_lat'] = np.abs(df_all['add_lat'] - df_all['cl_lat'])
df_all['lon_diff_cluster_lon'] = np.abs(df_all['add_lon'] - df_all['cl_lon'])

## Фичи mcc

In [38]:
# категории
df_all['mcc_str'] = df_all['mcc'].astype(str).str.rjust(4, '0')
df_mcc = pd.read_csv('../data/internal/mcc.csv')
df_mcc = df_mcc.iloc[1:, :3]
df_mcc.columns = ['mcc_str', 'mcc_cat1', 'mcc_cat2']
df_mcc.drop_duplicates(subset=['mcc_str'], inplace=True)
df_mcc['mcc_cat1'] = pd.factorize(df_mcc['mcc_cat1'])[0]
df_mcc['mcc_cat2'] = pd.factorize(df_mcc['mcc_cat2'])[0]
df_mcc.fillna('none', inplace=True)
df_all = pd.merge(df_all, df_mcc, on='mcc_str', how='left')
del df_all['mcc_str']
df_mcc.head()

Unnamed: 0,mcc_str,mcc_cat1,mcc_cat2
1,1,-1,0
2,2,-1,0
3,3,-1,0
4,4,-1,0
5,5,-1,0


In [49]:
df_mcc['mcc_cat1'].fillna(-1, inplace=True)
df_mcc['mcc_cat2'].fillna(-1, inplace=True)

df_all = add_count_sum_ratios(df_all, 'mcc_cat1')
df_all = add_count_sum_ratios(df_all, 'mcc_cat2')

In [51]:
# частота mcc
df_mcc = df_all['mcc'].value_counts(normalize=True).reset_index()
df_mcc.columns = ['mcc', 'mcc_freq']
df_all = pd.merge(df_all, df_mcc, on='mcc', how='left')

In [96]:
df_all = pd.concat([df_all, pd.get_dummies(df_all['mcc'], prefix='mcc')], axis=1)
del df_all['mcc']

In [98]:
df_all = pd.concat([df_all, pd.get_dummies(df_all['mcc_cat1'], prefix='mcc_cat1')], axis=1)
del df_all['mcc_cat1']

df_all = pd.concat([df_all, pd.get_dummies(df_all['mcc_cat2'], prefix='mcc_cat2')], axis=1)
del df_all['mcc_cat2']

In [105]:
# сделаем групбай какие вообще есть mcc у посетителя. Это поможет понять его привычки
mcc_cols = [c for c in df_all.columns if 'mcc' in c and 'cat' not in c]
df_mcc = df_all.groupby('customer_id')[mcc_cols].agg(['max', 'mean'])
df_mcc.columns = ['_'.join(col).strip() for col in df_mcc.columns.values]
df_mcc.reset_index(inplace=True)
df_mcc.head()
df_all = pd.merge(df_all, df_mcc, on='customer_id', how='left')

In [106]:
# сделаем групбай какие вообще есть mcc у посетителя. Это поможет понять его привычки
mcc_cols = [c for c in df_all.columns if 'mcc_cat1' in c]
df_mcc = df_all.groupby('customer_id')[mcc_cols].agg(['max', 'mean'])
df_mcc.columns = ['_'.join(col).strip() for col in df_mcc.columns.values]
df_mcc.reset_index(inplace=True)
df_mcc.head()
df_all = pd.merge(df_all, df_mcc, on='customer_id', how='left')

In [107]:
# сделаем групбай какие вообще есть mcc у посетителя. Это поможет понять его привычки
mcc_cols = [c for c in df_all.columns if 'mcc_cat2' in c]
df_mcc = df_all.groupby('customer_id')[mcc_cols].agg(['max', 'mean'])
df_mcc.columns = ['_'.join(col).strip() for col in df_mcc.columns.values]
df_mcc.reset_index(inplace=True)
df_mcc.head()
df_all = pd.merge(df_all, df_mcc, on='customer_id', how='left')

# Игрушки с адресами

In [86]:
df_all['string'].fillna('', inplace=True)
df_all['string'] = df_all['string'].str.lower()

In [87]:
df_all['has_street'] = df_all['string'].str.contains('улиц').fillna(0).astype(np.int8)
df_all['has_pereul'] = df_all['string'].str.contains('переул').fillna(0).astype(np.int8)
df_all['has_bulvar'] = df_all['string'].str.contains('бульв').fillna(0).astype(np.int8)
df_all['has_prospekt'] = df_all['string'].str.contains('проспект').fillna(0).astype(np.int8)
df_all['has_shosse'] = df_all['string'].str.contains('шосс').fillna(0).astype(np.int8)

df_all['has_torg'] = df_all['string'].str.contains('торгов').astype(np.int8)
df_all['has_bus'] = df_all['string'].str.contains('бизн').astype(np.int8)

# LightGBM

In [108]:
df_all.shape, df_all.columns.duplicated().sum()

((2294265, 1703), 0)

In [89]:
df_all = df_all.loc[:,~df_all.columns.duplicated()]

In [109]:
from sklearn.model_selection import train_test_split

ys = ['is_home', 'is_work']
drop_cols = ['atm_address', 'customer_id', 'pos_address', 'terminal_id', 'transaction_date', 
             'is_home' ,'has_home', 'is_work', 'has_work', 'is_train', 'city_name']
drop_cols += ['work_lat','work_lon','home_lat','home_lon', 'string']

drop_cols += ['pred:is_home', 'pred:is_work']
y_cols = ['is_home', 'is_work']
usecols = df_all.drop(drop_cols, 1, errors='ignore').columns

In [110]:
params = {
    'objective': 'binary',
    'num_leaves': 63,
    'learning_rate': 0.01,
    'metric' : 'binary_logloss',
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'num_threads': 12,
    'verbose': 0,
}

model = {}

In [111]:
y_col = 'is_home'

cust_train = df_all[df_all['is_train']==1].groupby('customer_id')[y_col.replace('is_','has_')].max()
cust_train = cust_train[cust_train > 0].index

cust_train, cust_valid = train_test_split(cust_train, test_size=0.2, shuffle=True, random_state=111)

df_train = pd.DataFrame(cust_train, columns=['customer_id']).merge(df_all, how='left')
df_valid = pd.DataFrame(cust_valid, columns=['customer_id']).merge(df_all, how='left')

lgb_train = lgb.Dataset(df_train[usecols], df_train[y_col])
lgb_valid = lgb.Dataset(df_valid[usecols], df_valid[y_col])

gbm_h = lgb.train(params,
                lgb_train,
                valid_sets=[lgb_valid],
                num_boost_round=2000,
                verbose_eval=30,
                early_stopping_rounds=300)

model[y_col] = gbm_h

Training until validation scores don't improve for 300 rounds.
[30]	valid_0's binary_logloss: 0.596722
[60]	valid_0's binary_logloss: 0.538552
[90]	valid_0's binary_logloss: 0.501204
[120]	valid_0's binary_logloss: 0.476618
[150]	valid_0's binary_logloss: 0.459465
[180]	valid_0's binary_logloss: 0.447449
[210]	valid_0's binary_logloss: 0.438473
[240]	valid_0's binary_logloss: 0.432042
[270]	valid_0's binary_logloss: 0.42723
[300]	valid_0's binary_logloss: 0.423608
[330]	valid_0's binary_logloss: 0.420977
[360]	valid_0's binary_logloss: 0.419002
[390]	valid_0's binary_logloss: 0.417224
[420]	valid_0's binary_logloss: 0.415874
[450]	valid_0's binary_logloss: 0.41505
[480]	valid_0's binary_logloss: 0.414468
[510]	valid_0's binary_logloss: 0.41379
[540]	valid_0's binary_logloss: 0.41319
[570]	valid_0's binary_logloss: 0.412762
[600]	valid_0's binary_logloss: 0.412645
[630]	valid_0's binary_logloss: 0.412296
[660]	valid_0's binary_logloss: 0.41221
[690]	valid_0's binary_logloss: 0.41179
[72

In [112]:
y_col = 'is_work'

cust_train = df_all[df_all['is_train']==1].groupby('customer_id')[y_col.replace('is_','has_')].max()
cust_train = cust_train[cust_train > 0].index

cust_train, cust_valid = train_test_split(cust_train, test_size=0.2, shuffle=True, random_state=111)



df_train = pd.DataFrame(cust_train, columns=['customer_id']).merge(df_all, how='left')
df_valid = pd.DataFrame(cust_valid, columns=['customer_id']).merge(df_all, how='left')

lgb_train = lgb.Dataset(df_train[usecols], df_train[y_col])
lgb_valid = lgb.Dataset(df_valid[usecols], df_valid[y_col])

gbm_w = lgb.train(params,
                lgb_train,
                valid_sets=[lgb_valid],
                num_boost_round=2000,
                verbose_eval=30,
                early_stopping_rounds=300)

model[y_col] = gbm_w

Training until validation scores don't improve for 300 rounds.
[30]	valid_0's binary_logloss: 0.576869
[60]	valid_0's binary_logloss: 0.507223
[90]	valid_0's binary_logloss: 0.462538
[120]	valid_0's binary_logloss: 0.432042
[150]	valid_0's binary_logloss: 0.410963
[180]	valid_0's binary_logloss: 0.396125
[210]	valid_0's binary_logloss: 0.385221
[240]	valid_0's binary_logloss: 0.377783
[270]	valid_0's binary_logloss: 0.372313
[300]	valid_0's binary_logloss: 0.36795
[330]	valid_0's binary_logloss: 0.364977
[360]	valid_0's binary_logloss: 0.362587
[390]	valid_0's binary_logloss: 0.360699
[420]	valid_0's binary_logloss: 0.359398
[450]	valid_0's binary_logloss: 0.358296
[480]	valid_0's binary_logloss: 0.3579
[510]	valid_0's binary_logloss: 0.3574
[540]	valid_0's binary_logloss: 0.356931
[570]	valid_0's binary_logloss: 0.356533
[600]	valid_0's binary_logloss: 0.356384
[630]	valid_0's binary_logloss: 0.356606
[660]	valid_0's binary_logloss: 0.356522
[690]	valid_0's binary_logloss: 0.356314
[7

In [None]:
lgb.plot_importance(gbm_h, max_num_features=15)

In [113]:
def _best(x):
    ret = None
    for col in ys:
        pred = ('pred:%s' % col)
        if pred in x:
            i = (x[pred].idxmax())
            cols = [pred, 'add_lat', 'add_lon']
            if col in x:
                cols.append(col)
            tmp = x.loc[i,cols]
            tmp.rename({
                'add_lat':'%s:add_lat' % col,
                'add_lon':'%s:add_lon' % col,
            }, inplace = True)
            if ret is None:
                ret = tmp
            else:
                ret = pd.concat([ret, tmp])
    return ret


def predict_proba(dt, ys=['is_home', 'is_work']):
    for col in ys:
        pred = ('pred:%s' % col)
        dt[pred] = model[col].predict(dt[usecols])
    return dt.groupby('customer_id').apply(_best).reset_index()

def score(dt, ys=['is_home', 'is_work'], return_df=False):
    dt_ret = predict_proba(dt, ys)
    if return_df:
        return dt_ret
    mean = 0.0
    for col in ys:
        col_mean = dt_ret[col].mean()
        mean += col_mean
    if len(ys) == 2:
        mean = mean / len(ys)
    return mean

In [114]:
print ("Train accuracy:", score(df_train, ys=['is_home']))
print ("Test accuracy:", score(df_valid, ys=['is_home']))

print ("Train accuracy:", score(df_train, ys=['is_work']))
print ("Test accuracy:", score(df_valid, ys=['is_work']))

Train accuracy: 0.5203587009209889
Test accuracy: 0.5232558139534884
Train accuracy: 0.3688802714493456
Test accuracy: 0.34205426356589147


до ohe
Train accuracy: 0.5087251575375666
Test accuracy: 0.5077519379844961
Train accuracy: 0.3637905962190984
Test accuracy: 0.33527131782945735
после ohe
Train accuracy: 0.5070285991274842
Test accuracy: 0.5077519379844961
Train accuracy: 0.3703344643722734
Test accuracy: 0.3391472868217054
после статистиик по всем mcc категориям
Train accuracy: 0.5203587009209889
Test accuracy: 0.5232558139534884
Train accuracy: 0.3688802714493456
Test accuracy: 0.34205426356589147

# Анализ False-Negative

In [None]:
# сколько вообще людей имеют хорошую точку
df_all[(df_all.is_train == 1)].groupby('customer_id')['is_work'].agg('max').mean()

In [None]:
df_pred = score(df_valid, ys=['is_home'], return_df=True)

In [None]:
df_pred.sample(5)

In [None]:
cid = 'bf66305d0ec05abb6e6a6358acb8c2a1'
cid = df_pred[df_pred.is_home == 0].sample(1)['customer_id'].values[0]

df_an = df_all[df_all.customer_id == cid]
center_home = df_an[['home_lat', 'home_lon']].drop_duplicates().values
center_work = df_an[['work_lat', 'work_lon']].drop_duplicates().values


predicted_home = df_pred[df_pred.customer_id == cid][['is_home:add_lat', 'is_home:add_lon']].drop_duplicates().values
predicted_work = df_pred[df_pred.customer_id == cid][['is_work:add_lat', 'is_work:add_lon']].drop_duplicates().values

points_pos = df_an[df_an.is_pos == 1][['add_lat', 'add_lon']].dropna().values
points_atm = df_an[df_an.is_pos == 0][['add_lat', 'add_lon']].dropna().values
print(center_home.shape, center_work.shape, points_pos.shape, points_atm.shape)

# синие - покупки
# красные - банкоматы
gmap = gmaps.Map()
if len(points_pos) > 0:
    gmap.add_layer(gmaps.symbol_layer(points_pos, hover_text='pos', 
                                      fill_color="blue", stroke_color="blue", scale=3))
if len(points_atm) > 0:
    gmap.add_layer(gmaps.symbol_layer(points_atm, hover_text='atm',
                                      fill_color="red", stroke_color="red",scale=3))

if not np.isnan(center_home)[0][0]:
    gmap.add_layer(gmaps.marker_layer(center_home, label='home'))
if not np.isnan(center_work)[0][0]:
    gmap.add_layer(gmaps.marker_layer(center_work, label='work'))

gmap.add_layer(gmaps.marker_layer(predicted_home, label='predicted_home'))
gmap.add_layer(gmaps.marker_layer(predicted_work, label='predicted_work'))
    
gmap

In [None]:
df_an

# Predict

In [115]:
cust_test = df_all[df_all['is_train'] == 0]['customer_id'].unique()
df_test = pd.DataFrame(cust_test, columns = ['customer_id']).merge(df_all, how = 'left')
df_test = predict_proba(df_test)
df_test.rename(columns = {
        'customer_id':'_ID_',
        'is_home:add_lat': '_HOME_LAT_',
        'is_home:add_lon': '_HOME_LON_',
        'is_work:add_lat': '_WORK_LAT_',
        'is_work:add_lon': '_WORK_LON_'}, inplace = True)
df_test = df_test[['_ID_', '_WORK_LAT_', '_WORK_LON_', '_HOME_LAT_', '_HOME_LON_']]

df_test.head()

Unnamed: 0,_ID_,_WORK_LAT_,_WORK_LON_,_HOME_LAT_,_HOME_LON_
0,00021683ccb416637fe9a4cd35e4606e,55.023354,82.914726,55.041771,82.984329
1,0002d0f8a642272b41c292c12ab6e602,44.033001,42.835999,44.032093,42.837608
2,0004d182d9fede3ba2534b2d5e5ad27e,43.585999,39.723999,43.57243,39.736073
3,0008c2445518c9392cb356c5c3db3392,51.528755,46.04015,51.537647,46.017811
4,000b373cc4969c0be8e0933c08da67e1,56.237175,43.463005,56.232037,43.458107


# Формируем submission-файл

In [117]:
# Заполняем пропуски
df_ = pd.read_csv('../data/test_set.csv', dtype=dtypes, usecols=['customer_id'])
submission = pd.DataFrame(df_['customer_id'].unique(), columns=['_ID_'])

submission = submission.merge(df_test, how='left').fillna(0)
# Пишем файл submission
submission.to_csv('../submissions/base_3_523_342.csv', index=None)