# [Boosters] Raiffeisen Data Cup. Baseline
Общий подход:
- Добавляем к каждой транзакции столбец: is_work (если транзакция находится в пределах 0.02 от дома клиента)
- Добавляем к каждой транзакции столбец: is_home (если транзакция находится в пределах 0.02 от работы клиента)
- Обучаем классификатор предсказывающий вероятность (is_home == 1) для транзакции
- Обучаем классификатор предсказывающий вероятность (is_work == 1) для транзакции

Точность определения местоположения:
- для классификатора is_home: ~3x%
- для классификатора is_work: ~2x%
- общая оценка на Public Leaderboard: ???

Примечание
* Требуется Python версии 3.5
* Требуется библиотека xgboost (для обучения использовалась xgboost версии 0.7.post3)
* Требуются файлы: test_set.csv, train_set.csv в одном каталоге с данным скриптом
* Требования к памяти: должно работать с 2Гб свободного RAM
* Время работы: ~3 минуты (тестировалось на процессоре Intel Core i7-4770)

In [1]:
%load_ext autoreload
%autoreload 2

import sys
MODULES_PATH = '../code/'
if MODULES_PATH not in sys.path:
    sys.path.append(MODULES_PATH)
import mfuncs
    
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
pd.options.display.max_columns = 1000

import lightgbm as lgb


from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth, AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score

import gmaps
API_KEY = 'AIzaSyCG_RL0_kavuEaJAqEN5xXbU4h0VJUbA9M'
gmaps.configure(api_key=API_KEY) # Your Google API key
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Определим типы колонок для экономии памяти
dtypes = {
    'transaction_date': str,
    'atm_address': str,
    'country': str,
    'city': str,
    'amount': np.float32,
    'currency': np.float32,
    'mcc': str,
    'customer_id': str,
    'pos_address': str,
    'atm_address': str,
    'pos_adress_lat': np.float32,
    'pos_adress_lon': np.float32,
    'pos_address_lat': np.float32,
    'pos_address_lon': np.float32,
    'atm_address_lat': np.float32,
    'atm_address_lon': np.float32,
    'home_add_lat': np.float32,
    'home_add_lon': np.float32,
    'work_add_lat': np.float32,
    'work_add_lon': np.float32,
}

# для экономии памяти будем загружать только часть атрибутов транзакций
usecols_train = ['customer_id','transaction_date','amount','country', 'city', 'currency', 'mcc', 'pos_adress_lat', 'pos_adress_lon', 'atm_address_lat', 'atm_address_lon','home_add_lat','home_add_lon','work_add_lat','work_add_lon']
usecols_test = ['customer_id','transaction_date','amount','country', 'city', 'currency', 'mcc', 'pos_address_lat', 'pos_address_lon', 'atm_address_lat', 'atm_address_lon']

## Читаем train_set, test_set, соединяем в один датасет

In [3]:
dtypes = {
    'transaction_date': str,
    'atm_address': str,
    'country': str,
    'city': str,
    'amount': np.float32,
    'currency': np.float32,
    'mcc': str,
    'customer_id': str,
    'pos_address': str,
    'atm_address': str,
    'pos_adress_lat': np.float32,
    'pos_adress_lon': np.float32,
    'pos_address_lat': np.float32,
    'pos_address_lon': np.float32,
    'atm_address_lat': np.float32,
    'atm_address_lon': np.float32,
    'home_add_lat': np.float32,
    'home_add_lon': np.float32,
    'work_add_lat': np.float32,
    'work_add_lon': np.float32,
}

rnm = {
    'atm_address_lat': 'atm_lat',
    'atm_address_lon': 'atm_lon',
    'pos_adress_lat': 'pos_lat',
    'pos_adress_lon': 'pos_lon',
    'pos_address_lat': 'pos_lat',
    'pos_address_lon': 'pos_lon',
    'home_add_lat': 'home_lat',
    'home_add_lon': 'home_lon',
    'work_add_lat': 'work_lat',
    'work_add_lon': 'work_lon',
}

In [4]:
df_train = pd.read_csv('../data/train_set.csv', dtype=dtypes)
df_test = pd.read_csv('../data/test_set.csv', dtype=dtypes)

df_train.rename(columns=rnm, inplace=True)
df_test.rename(columns=rnm, inplace=True)

In [None]:
# удалим чувак с множественными адресами
# print(df_train.shape)
# gb = df_train.groupby('customer_id')['work_lat'].agg('nunique') 
# cid_incorrect = gb[gb == 2].index
# df_train = df_train[~df_train.customer_id.isin(cid_incorrect.values)]
# print(df_train.shape)
# gb = df_train.groupby('customer_id')['home_lat'].agg('nunique') 
# cid_incorrect = gb[gb == 2].index
# df_train = df_train[~df_train.customer_id.isin(cid_incorrect.values)]
# print(df_train.shape)

In [5]:
# соединяем test/train в одном DataFrame
df_train['is_train'] = np.int32(1)
df_test['is_train'] = np.int32(0)
df_all = pd.concat([df_train, df_test])

del df_train, df_test

## Замена городов
Чето слегка ухудшило скор

In [6]:
city_replace = [
    ['peter|stpete|spb', 'SANKT-PETERBU'],
    ['moscow|moskva|mosocw|moskow', 'MOSCOW'],
    ['novosib|nvsibr', 'NOVOSIBIRSK'],
    ['kater', 'EKATERINBURG'],
    ['n.*novg', 'NIZHNIY NOV'],
    ['novg', 'VEL.NOVGOROD'],
    ['erep', 'CHEREPOVETS'],
    ['rasnod', 'KRASNODAR'],
    ['rasno[yj]', 'KRASNOYARSK'],
    ['sama', 'SAMARA'],
    ['kazan', 'KAZAN'],
    ['soch[iy]', 'SOCHI'],
    ['r[yj]aza', 'RYAZAN'],
    ['arza', 'ARZAMAS'],
    ['podol.?sk', 'PODOLSK'],
    ['himki', 'KHIMKI'],
    ['rostov', 'ROSTOV'], # will ovveride Rostov-Na-Don later
    ['rostov.*do', 'ROSTOV-NA-DON'],
    ['ufa', 'UFA'],
    ['^orel|ory[oe]l', 'OREL'],
    ['korol', 'KOROLEV'],
    ['vkar', 'SYKTYVKAR'],
    ['rozavo|rzavo', 'PETROZAVODSK'],
    ['c.*abinsk', 'CHELYABINSK'],
    ['g omsk|^omsk', 'OMSK'],
    ['tomsk', 'TOMSK'],
    ['vorone', 'VORONEZH'],
    ['[yj]arosl', 'YAROSLAVL'],
    ['novoros', 'NOVOROSSIYSK'],
    ['m[yie]t[yi]s', 'MYTISHCHI'],
    ['kal..?ga', 'KALUGA'],
    ['perm', 'PERM'],
    ['volgog|volgrd', 'VOLGOGRAD'],
    ['kirov[^a-z]|kirov$', 'KIROV'],
    ['krasnogo', 'KRASNOGORSK'],
    ['^mo\W+$|^mo$', 'MO'],
    ['irk', 'IRKUTSK'],
    ['balashi', 'BALASHIKHA'],
    ['kaliningrad', 'KALININGRAD'],
    ['anap', 'ANAPA'],
    ['surgut', 'SURGUT'],
    ['odin[tc]', 'ODINTSOVO'],
    ['kemer', 'KEMEROVO'],
    ['t[yuio].?men', 'TYUMEN'],
    ['sarat', 'SARATOV'],
    ['t[uoy]u?la', 'TULA'],
    ['bert', 'LYUBERTSY'],
    ['kotel', 'KOTELNIKI'],
    ['lipet', 'LIPETSK'],
    ['leznodor', 'ZHELEZNODOROZ'],
    ['domod', 'DOMODEDOVO'],
    ['br[yji][a]nsk|braynsk', 'BRYANSK'],
    ['saransk', 'SARANSK'],
    ['znogor', 'ZHELEZNOGORSK'],
    ['smol', 'SMOLENSK'],
    ['sevolo', 'VSEVOLOZHSK'],
    ['p[uy].*kino', 'PUSHKINO'],
    ['re..?tov', 'REUTOV'],
    ['kursk|koursk', 'KURSK'],
    ['belgorod', 'BELGOROD'],
    ['r[yj]azan', 'RYAZAN'],
    ['solnechno', 'SOLNECHNOGORS'],
    ['utorovsk', 'YALUTOROVSK'],
    ['tver', 'TVER'],
    ['barn', 'BARNAUL'],
    ['to.?l..?.?tt[iy]', 'TOLYATTI'],
    ['i[zjg].?evsk', 'IZHEVSK']
]

df_all['city'] = df_all['city'].str.lower()
df_all['city'].fillna('nan_city', inplace=True)
for city_reg, city_name in tqdm(city_replace):
    df_all.loc[df_all['city'].str.contains(city_reg), 'city'] = city_name

100%|██████████| 67/67 [00:58<00:00,  1.22it/s]


### Обрабатываем дату транзакции и категориальные признаки

In [7]:
df_all['currency'] = df_all['currency'].fillna(-1).astype(np.int32)
df_all['mcc'] = df_all['mcc'].apply(lambda x: int(x.replace(',', ''))).astype(np.int32)
df_all['city'] = df_all['city'].factorize()[0].astype(np.int32)
df_all['country'] = df_all['country'].factorize()[0].astype(np.int32)

### Фичи для даты

In [8]:
# удаляем транзакции без даты
df_all = df_all[~df_all['transaction_date'].isnull()]
df_all['transaction_date'] =  pd.to_datetime(df_all['transaction_date'], format='%Y-%m-%d')

In [9]:
df_all['month'] = df_all.transaction_date.dt.month
df_all['day'] = df_all.transaction_date.dt.day
df_all['dayofyear'] = df_all.transaction_date.dt.dayofyear
df_all['dayofweek'] = df_all.transaction_date.dt.dayofweek

In [10]:
# праздники
holidays_df = pd.read_csv('../data/internal/all_holidays.csv', header=None)
holidays_df[0] = pd.to_datetime(holidays_df[0])
holidays_df = holidays_df[holidays_df[0].dt.year == 2017]
holidays = holidays_df[0].dt.dayofyear.values
df_all['is_weekend'] = (df_all.dayofweek >= 6).astype(np.int8)
df_all['is_state_holiday'] = df_all['dayofyear'].isin(holidays).astype(np.int8)
df_all['is_holiday'] = df_all['is_weekend'] | df_all['is_state_holiday']

### Приводим адрес транзакции для pos и atm-транзакций к единообразному виду
Просто объединяем в одну колонку и добавляем фичу - это атм или пос

In [11]:
df_all['is_atm'] = (~df_all['atm_lat'].isnull()).astype(np.int8)
df_all['is_pos'] = (~df_all['pos_lat'].isnull()).astype(np.int8)

df_all['add_lat'] = df_all['atm_lat'].fillna(0) + df_all['pos_lat'].fillna(0)
df_all['add_lon'] = df_all['atm_lon'].fillna(0) + df_all['pos_lon'].fillna(0)

df_all.drop(['atm_lat','atm_lon','pos_lat','pos_lon'], axis=1, inplace=True)

df_all = df_all[~((df_all['add_lon'] == 0) & (df_all['add_lon'] == 0))]

In [12]:
%%time
# грязный хак, чтобы не учить КНН на новом юзере каждый раз
df_all['fake_customer_id'] = (pd.factorize(df_all.customer_id)[0] + 1) * 100

points = df_all[['fake_customer_id', 'add_lat', 'add_lon']].drop_duplicates().values
neigh = NearestNeighbors(2, radius=100000)

# расстояние до уникальных точек
# neigh.fit(np.unique(points, axis=1))
neigh.fit(points) 

distances, indices = neigh.kneighbors(df_all[['fake_customer_id', 'add_lat', 'add_lon']].values)
df_all['distance_to_nearest_point'] = distances[:, 1]
del df_all['fake_customer_id']

CPU times: user 4.65 s, sys: 164 ms, total: 4.82 s
Wall time: 4.82 s


# Кластерные признаки
Сохранены в df_cluster

In [None]:
# фичи с кластерами из тинькова
dfs = []
customers = df_all.customer_id.unique()
np_values = df_all[['customer_id', 'add_lat', 'add_lon']].values

for i in tqdm(range(len(customers))):
    customer = customers[i]
    points = np_values[np_values[:, 0] == customer][:, 1:]
    # оцениваем число кластеров
#     avgs = []
#     max_cluster = min(10,len(points))
#     for i in range(2,max_cluster):
#         kmeans = KMeans(n_clusters=i, random_state=2).fit(points)
#         labels = kmeans.labels_
#         silhouette_avg = silhouette_score(points, labels)
#         avgs.append(silhouette_avg)
        
#     if max_cluster == 2:
#         kmeans = KMeans(n_clusters=2, random_state=2).fit(points)
#         labels = kmeans.labels_
#         silhouette_avg = silhouette_score(points, labels)
#         avgs.append(silhouette_avg)
        
#     n_cluster = avgs.index(max(avgs)) + 2 # так как индексы с 0 а кластеры с 2
    # получаем лучший кластер
    if np.unique(points).size == 2:
        dfs.append(np.zeros((len(points), 4)))
        continue
    n_cluster = 2
    kmeans = KMeans(n_clusters=n_cluster, random_state=2).fit(points)
    #kmeans = AgglomerativeClustering(n_clusters=n_cluster,linkage='average').fit(points)
    labels = kmeans.labels_
    centers = kmeans.cluster_centers_
    silhouette_avg = silhouette_score(points, labels)
    # формируем датафрейм
    sample_silhouette_values = silhouette_samples(points, labels)
#     cluster_df = pd.DataFrame(data=np.vstack((labels, sample_silhouette_values)).T,columns=['label','score'])
#     cluster_df.label = cluster_df.label.astype(np.int32)
#     cluster_df['cluster_center_lat'] = cluster_df.apply(lambda row: centers[int(row['label'])][0], axis=1)
#     cluster_df['cluster_center_lon'] = cluster_df.apply(lambda row: centers[int(row['label'])][1], axis=1)
    arr_label_score = np.vstack((labels, sample_silhouette_values)).T
    arr_label_score = np.hstack([arr_label_score, centers[labels]])
    dfs.append(arr_label_score)

In [None]:
df_cluster = pd.DataFrame(np.vstack(dfs), columns=['cl_label','cl_score', 'cl_lat', 'cl_lon'])
df_all.reset_index(inplace=True, drop=True)
df_all = pd.concat([df_all, df_cluster], axis=1)

In [None]:
df_all.to_csv('../data/df_all_1.csv', index=None)

In [None]:
df_all = pd.read_csv('../data/df_all_1.csv')
df_all[['customer_id', 'add_lat', 'add_lon', 'cl_label', 
        'cl_score', 'cl_lat', 'cl_lon']].to_csv('../data/df_cluster.csv', index=None)
df_all.head()

## загружаем кластерные признаки

In [13]:
df_cluster = pd.read_csv('../data/df_cluster.csv')
df_cluster.reset_index(drop=True, inplace=True)
df_cluster.head()

Unnamed: 0,customer_id,add_lat,add_lon,cl_label,cl_score,cl_lat,cl_lon
0,0dc0137d280a2a82d2dc89282450ff1b,59.844074,30.179153,0.0,0.933796,59.841676,30.177239
1,0dc0137d280a2a82d2dc89282450ff1b,59.844074,30.179153,0.0,0.933796,59.841676,30.177239
2,0dc0137d280a2a82d2dc89282450ff1b,59.8582,30.229023,1.0,0.551394,59.865354,30.247539
3,0dc0137d280a2a82d2dc89282450ff1b,59.844074,30.179153,0.0,0.933796,59.841676,30.177239
4,0dc0137d280a2a82d2dc89282450ff1b,59.844074,30.179153,0.0,0.933796,59.841676,30.177239


In [14]:
df_all.reset_index(drop=True, inplace=True)

In [15]:
df_all = pd.concat([df_all, df_cluster.iloc[:, 3:]], axis=1)

### Генерируем признаки is_home, is_work
TODO: удалить чуваков у которых несколько домов

In [16]:
lat = df_all['home_lat'] - df_all['add_lat']
lon = df_all['home_lon'] - df_all['add_lon']

df_all['is_home'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int8)
df_all['has_home'] = (~df_all['home_lon'].isnull()).astype(np.int8)

lat = df_all['work_lat'] - df_all['add_lat']
lon = df_all['work_lon'] - df_all['add_lon']
df_all['is_work'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int8)
df_all['has_work'] = (~df_all['work_lon'].isnull()).astype(np.int8)

# df_all.drop(['work_lat','work_lon','home_lat','home_lon'], axis=1, inplace=True)

### Генерируем категориальный признак для адреса

In [17]:
df_all['address'] = df_all['add_lat'].apply(lambda x: "%.02f" % x) + ';' + df_all['add_lon'].apply(lambda x: "%.02f" % x)
df_all['address'] = df_all['address'].factorize()[0].astype(np.int32)

### Генерируем несколько абонентских фич

In [18]:
# количество транзакций каждого клиента
df_all = df_all.merge(df_all.groupby('customer_id')['amount'].count().reset_index(name='cid_trans_count'), how='left')
df_all['cid_trans_count'] = df_all['cid_trans_count'].astype(np.int32)

df_all = df_all.merge(df_all.groupby(['customer_id','address'])['amount'].count().reset_index(name='cid_add_trans_count'), 
                      how='left')
df_all['cid_add_trans_count'] = df_all['cid_add_trans_count'].astype(np.int32)

# какая часть транзакций клиента приходится на данный адрес
# TODO: БОЛЬШЕ ТАКИХ ФИЧ
df_all['ratio1'] = df_all['cid_add_trans_count'] / df_all['cid_trans_count']

## Мои фичи

In [19]:
# добавим признаки после групбая
df_gb = df_all[['customer_id','amount', 'add_lat', 'add_lon']].groupby('customer_id')
coord_stat_df = df_gb.agg(['mean', 'max', 'min'])
coord_stat_df['transactions_per_user'] = df_gb.agg('size')
coord_stat_df.columns = ['_'.join(col).strip() for col in coord_stat_df.columns.values]
coord_stat_df.reset_index(inplace=True)
df_all = pd.merge(df_all, coord_stat_df, on='customer_id', how='left')

In [20]:
cols = ['add_lat', 'add_lon']
types = ['min', 'max', 'mean']
for c in cols:
    for t in types:
        df_all['{}_diff_{}'.format(c, t)] = np.abs(df_all[c] - df_all['{}_{}'.format(c, t)])

In [21]:
# разности 
df_all['lat_diff_cluster_lat'] = np.abs(df_all['add_lat'] - df_all['cl_lat'])
df_all['lon_diff_cluster_lon'] = np.abs(df_all['add_lon'] - df_all['cl_lon'])

In [24]:
# частота mcc
df_mcc = df_all['mcc'].value_counts(normalize=True).reset_index()
df_mcc.columns = ['mcc', 'mcc_freq']
df_all = pd.merge(df_all, df_mcc, on='mcc', how='left')

In [25]:
df_all = pd.concat([df_all, pd.get_dummies(df_all['mcc'], prefix='mcc')], axis=1)
del df_all['mcc']

In [27]:
# сделаем групбай какие вообще есть mcc у посетителя. Это поможет понять его привычки
mcc_cols = [c for c in df_all.columns if 'mcc' in c]
df_mcc = df_all.groupby('customer_id')[mcc_cols].agg(['max', 'mean'])
df_mcc.columns = ['_'.join(col).strip() for col in df_mcc.columns.values]
df_mcc.reset_index(inplace=True)
df_mcc.head()
df_all = pd.merge(df_all, df_mcc, on='customer_id', how='left')

In [28]:
df_all.shape

(2294265, 848)

# LightGBM

In [29]:
df_all = df_all.loc[:,~df_all.columns.duplicated()]

In [30]:
from sklearn.model_selection import train_test_split

ys = ['is_home', 'is_work']
drop_cols = ['atm_address', 'customer_id', 'pos_address', 'terminal_id', 'transaction_date', 
             'is_home' ,'has_home', 'is_work', 'has_work', 'is_train']
drop_cols += ['work_lat','work_lon','home_lat','home_lon']

drop_cols += ['pred:is_home', 'pred:is_work']
y_cols = ['is_home', 'is_work']
usecols = df_all.drop(drop_cols, 1, errors='ignore').columns

In [31]:
params = {
    'objective': 'binary',
    'num_leaves': 63,
    'learning_rate': 0.01,
    'metric' : 'binary_logloss',
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'num_threads': 12,
    'verbose': 0,
}

model = {}

In [32]:
y_col = 'is_home'

cust_train = df_all[df_all['is_train']==1].groupby('customer_id')[y_col.replace('is_','has_')].max()
cust_train = cust_train[cust_train > 0].index

cust_train, cust_valid = train_test_split(cust_train, test_size=0.2, shuffle=True, random_state=111)

df_train = pd.DataFrame(cust_train, columns=['customer_id']).merge(df_all, how='left')
df_valid = pd.DataFrame(cust_valid, columns=['customer_id']).merge(df_all, how='left')

lgb_train = lgb.Dataset(df_train[usecols], df_train[y_col])
lgb_valid = lgb.Dataset(df_valid[usecols], df_valid[y_col])

gbm_h = lgb.train(params,
                lgb_train,
                valid_sets=[lgb_valid],
                num_boost_round=2000,
                verbose_eval=30,
                early_stopping_rounds=300)

model[y_col] = gbm_h

Training until validation scores don't improve for 300 rounds.
[30]	valid_0's binary_logloss: 0.598527
[60]	valid_0's binary_logloss: 0.539956
[90]	valid_0's binary_logloss: 0.502799
[120]	valid_0's binary_logloss: 0.477792
[150]	valid_0's binary_logloss: 0.46051
[180]	valid_0's binary_logloss: 0.448566
[210]	valid_0's binary_logloss: 0.439775
[240]	valid_0's binary_logloss: 0.43326
[270]	valid_0's binary_logloss: 0.428643
[300]	valid_0's binary_logloss: 0.425201
[330]	valid_0's binary_logloss: 0.422908
[360]	valid_0's binary_logloss: 0.421215
[390]	valid_0's binary_logloss: 0.419915
[420]	valid_0's binary_logloss: 0.418856
[450]	valid_0's binary_logloss: 0.417791
[480]	valid_0's binary_logloss: 0.417021
[510]	valid_0's binary_logloss: 0.416312
[540]	valid_0's binary_logloss: 0.415716
[570]	valid_0's binary_logloss: 0.415132
[600]	valid_0's binary_logloss: 0.414823
[630]	valid_0's binary_logloss: 0.414646
[660]	valid_0's binary_logloss: 0.41439
[690]	valid_0's binary_logloss: 0.414051


In [33]:
y_col = 'is_work'

cust_train = df_all[df_all['is_train']==1].groupby('customer_id')[y_col.replace('is_','has_')].max()
cust_train = cust_train[cust_train > 0].index

cust_train, cust_valid = train_test_split(cust_train, test_size=0.2, shuffle=True, random_state=111)



df_train = pd.DataFrame(cust_train, columns=['customer_id']).merge(df_all, how='left')
df_valid = pd.DataFrame(cust_valid, columns=['customer_id']).merge(df_all, how='left')

lgb_train = lgb.Dataset(df_train[usecols], df_train[y_col])
lgb_valid = lgb.Dataset(df_valid[usecols], df_valid[y_col])

gbm_w = lgb.train(params,
                lgb_train,
                valid_sets=[lgb_valid],
                num_boost_round=2000,
                verbose_eval=30,
                early_stopping_rounds=300)

model[y_col] = gbm_w

Training until validation scores don't improve for 300 rounds.
[30]	valid_0's binary_logloss: 0.578386
[60]	valid_0's binary_logloss: 0.508714
[90]	valid_0's binary_logloss: 0.462802
[120]	valid_0's binary_logloss: 0.431328
[150]	valid_0's binary_logloss: 0.410159
[180]	valid_0's binary_logloss: 0.395098
[210]	valid_0's binary_logloss: 0.383669
[240]	valid_0's binary_logloss: 0.376295
[270]	valid_0's binary_logloss: 0.371148
[300]	valid_0's binary_logloss: 0.366909
[330]	valid_0's binary_logloss: 0.364057
[360]	valid_0's binary_logloss: 0.361737
[390]	valid_0's binary_logloss: 0.359343
[420]	valid_0's binary_logloss: 0.357944
[450]	valid_0's binary_logloss: 0.356213
[480]	valid_0's binary_logloss: 0.355085
[510]	valid_0's binary_logloss: 0.354576
[540]	valid_0's binary_logloss: 0.353907
[570]	valid_0's binary_logloss: 0.353471
[600]	valid_0's binary_logloss: 0.353224
[630]	valid_0's binary_logloss: 0.353191
[660]	valid_0's binary_logloss: 0.353277
[690]	valid_0's binary_logloss: 0.3534

In [None]:
lgb.plot_importance(gbm_h, max_num_features=15)

In [34]:
def _best(x):
    ret = None
    for col in ys:
        pred = ('pred:%s' % col)
        if pred in x:
            i = (x[pred].idxmax())
            cols = [pred, 'add_lat', 'add_lon']
            if col in x:
                cols.append(col)
            tmp = x.loc[i,cols]
            tmp.rename({
                'add_lat':'%s:add_lat' % col,
                'add_lon':'%s:add_lon' % col,
            }, inplace = True)
            if ret is None:
                ret = tmp
            else:
                ret = pd.concat([ret, tmp])
    return ret


def predict_proba(dt, ys=['is_home', 'is_work']):
    for col in ys:
        pred = ('pred:%s' % col)
        dt[pred] = model[col].predict(dt[usecols])
    return dt.groupby('customer_id').apply(_best).reset_index()

def score(dt, ys=['is_home', 'is_work'], return_df=False):
    dt_ret = predict_proba(dt, ys)
    if return_df:
        return dt_ret
    mean = 0.0
    for col in ys:
        col_mean = dt_ret[col].mean()
        mean += col_mean
    if len(ys) == 2:
        mean = mean / len(ys)
    return mean

In [35]:
print ("Train accuracy:", score(df_train, ys=['is_home']))
print ("Test accuracy:", score(df_valid, ys=['is_home']))

print ("Train accuracy:", score(df_train, ys=['is_work']))
print ("Test accuracy:", score(df_valid, ys=['is_work']))

Train accuracy: 0.5227823557925352
Test accuracy: 0.5184108527131783
Train accuracy: 0.3647600581677169
Test accuracy: 0.34205426356589147


# Анализ False-Negative

In [None]:
# сколько вообще людей имеют хорошую точку
df_all[(df_all.is_train == 1)].groupby('customer_id')['is_work'].agg('max').mean()

In [None]:
df_pred = score(df_valid, ys=['is_home'], return_df=True)

In [None]:
df_pred.sample(5)

In [None]:
cid = 'bf66305d0ec05abb6e6a6358acb8c2a1'
cid = df_pred[df_pred.is_home == 0].sample(1)['customer_id'].values[0]

df_an = df_all[df_all.customer_id == cid]
center_home = df_an[['home_lat', 'home_lon']].drop_duplicates().values
center_work = df_an[['work_lat', 'work_lon']].drop_duplicates().values


predicted_home = df_pred[df_pred.customer_id == cid][['is_home:add_lat', 'is_home:add_lon']].drop_duplicates().values
predicted_work = df_pred[df_pred.customer_id == cid][['is_work:add_lat', 'is_work:add_lon']].drop_duplicates().values

points_pos = df_an[df_an.is_pos == 1][['add_lat', 'add_lon']].dropna().values
points_atm = df_an[df_an.is_pos == 0][['add_lat', 'add_lon']].dropna().values
print(center_home.shape, center_work.shape, points_pos.shape, points_atm.shape)

# синие - покупки
# красные - банкоматы
gmap = gmaps.Map()
if len(points_pos) > 0:
    gmap.add_layer(gmaps.symbol_layer(points_pos, hover_text='pos', 
                                      fill_color="blue", stroke_color="blue", scale=3))
if len(points_atm) > 0:
    gmap.add_layer(gmaps.symbol_layer(points_atm, hover_text='atm',
                                      fill_color="red", stroke_color="red",scale=3))

if not np.isnan(center_home)[0][0]:
    gmap.add_layer(gmaps.marker_layer(center_home, label='home'))
if not np.isnan(center_work)[0][0]:
    gmap.add_layer(gmaps.marker_layer(center_work, label='work'))

gmap.add_layer(gmaps.marker_layer(predicted_home, label='predicted_home'))
gmap.add_layer(gmaps.marker_layer(predicted_work, label='predicted_work'))
    
gmap

In [None]:
df_an

# Predict

In [36]:
cust_test = df_all[df_all['is_train'] == 0]['customer_id'].unique()
df_test = pd.DataFrame(cust_test, columns = ['customer_id']).merge(df_all, how = 'left')
df_test = predict_proba(df_test)
df_test.rename(columns = {
        'customer_id':'_ID_',
        'is_home:add_lat': '_HOME_LAT_',
        'is_home:add_lon': '_HOME_LON_',
        'is_work:add_lat': '_WORK_LAT_',
        'is_work:add_lon': '_WORK_LON_'}, inplace = True)
df_test = df_test[['_ID_', '_WORK_LAT_', '_WORK_LON_', '_HOME_LAT_', '_HOME_LON_']]

df_test.head()

Unnamed: 0,_ID_,_WORK_LAT_,_WORK_LON_,_HOME_LAT_,_HOME_LON_
0,00021683ccb416637fe9a4cd35e4606e,55.027,82.917999,55.038212,82.977364
1,0002d0f8a642272b41c292c12ab6e602,44.034,42.835999,44.032001,42.837002
2,0004d182d9fede3ba2534b2d5e5ad27e,43.585999,39.723999,43.586273,39.724274
3,0008c2445518c9392cb356c5c3db3392,51.528755,46.04015,51.537647,46.017811
4,000b373cc4969c0be8e0933c08da67e1,56.317917,43.925426,56.232037,43.458107


# Формируем submission-файл

In [37]:
# Заполняем пропуски
df_ = pd.read_csv('../data/test_set.csv', dtype=dtypes, usecols=['customer_id'])
submission = pd.DataFrame(df_['customer_id'].unique(), columns=['_ID_'])

submission = submission.merge(df_test, how='left').fillna(0)
# Пишем файл submission
submission.to_csv('../submissions/base_3_518_342.csv', index=None)