In [1]:
import pandas as pd
import numpy as np
import datetime
import re, time, json, gc, sys

import xgboost as xgb
import sklearn

from sklearn.model_selection import train_test_split
import reverse_geocoder as revgeo

import urllib.request as ur
from geopy.geocoders import Nominatim

In [2]:
# типы колонок для экономии памяти
dtypes = {
    'transaction_date': str,
    'atm_address': str,
    'country': str,
    'city': str,
    'amount': np.float32,
    'currency': np.float32,
    'mcc': str,
    'customer_id': str,
    'pos_address': str,
    'atm_address': str,
    'pos_adress_lat': np.float32,
    'pos_adress_lon': np.float32,
    'pos_address_lat': np.float32,
    'pos_address_lon': np.float32,
    'atm_address_lat': np.float32,
    'atm_address_lon': np.float32,
    'home_add_lat': np.float32,
    'home_add_lon': np.float32,
    'work_add_lat': np.float32,
    'work_add_lon': np.float32,
}


## Читаем train_set, test_set, соединяем в один датасет

In [3]:
train = pd.read_csv('data/train_set.csv', dtype = dtypes)
train.rename(columns = {'pos_adress_lat': 'pos_address_lat', 'pos_adress_lon': 'pos_address_lon'}, inplace = True)

test = pd.read_csv('data/test_set.csv', dtype = dtypes)
submission = pd.DataFrame(test['customer_id'].unique(), columns = ['_ID_'])

# соединяем test/train в один DataFrame
train['is_train'] = np.int32(1)
test['is_train'] = np.int32(0)
df = pd.concat([train, test])

del train, test

### Категориальные фичи для городов

In [4]:
pattern = re.compile("[^ a-zA-Z]")
def filter_city(city: str, min_len: int=2) -> list:
    t = pattern.sub(r"", city.lower())
    return " ".join([c for c in t.strip().split() if len(c) > min_len])

In [5]:
%%time
df.city = df.city.apply(lambda x: str.lower(x) if x is not np.NaN else "")
df.city = df.city.apply(filter_city)
df['city'] = df['city'].factorize()[0].astype(np.int32)

Wall time: 8.26 s


In [6]:
df.sample(5)

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,is_train,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
339073,2.520027,,,,2,RUS,643.0,330c24aa8d7954730112d8a15522fc2d,,,0,5411,2-2 DZERZHINSKOGO PR-TNOVOSIBIRSK630000 RUSRUS,55.0439,82.953415,552831749d9324d350029b63f5129d14,2017-05-13,,
437298,2.50316,,,,5,RUS,643.0,7a202de063223f507979320d96ae1209,,,0,5812,"-1, SCHELKOVSKOEMOSKVA105122 RUSRUS""",55.816242,37.784416,2e793c79923e9a9596b73838fe77f828,2017-09-06,,
1092759,3.1096,,,,3809,UA,980.0,48ba1e85f5c9bc6fbc685cccc4c51ce2,,,0,6011,,,,64f3b6ddcb5e927c788bf0dc71999bc2,2017-08-20,,
408315,2.17046,,,,1,RUS,643.0,259a2646186e3eb50f2be6c95cfc2cfb,,,0,5411,137 MOSKOVSKIY PR-TSANKT-PETERBU190000 RUSRUS,59.881889,30.315697,c4eee4e149ac4a953e01e1f8af69a90f,2017-08-28,,
324453,2.646865,,,,276,RUS,643.0,087f6bf2fb724c91b98a0b6d16bb6865,55.743999,37.778,1,5812,NOSOVIKHINSKOYE SH.REUTOV143969 RUSRUS,55.744431,37.84911,ce2db7b7f6a7b4b663f0c50f05775257,2017-10-28,,


### Обрабатываем дату транзакции и категориальные признаки

In [7]:
%%time
df['currency'] = df['currency'].fillna(-1).astype(np.int32)
df['mcc'] = df['mcc'].apply(lambda x: int(x.replace(',', ''))).astype(np.int32)
df['country'] = df['country'].factorize()[0].astype(np.int32)

# удаляем транзакции без даты
df.drop(df[df['transaction_date'].isnull()].index, axis = 0, inplace = True)
df['transaction_date'] = df['transaction_date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))

Wall time: 48.3 s


In [8]:
df.sample(5)

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,is_train,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
625499,3.309968,,,,5,0,643,c361ff802759d464efba33e0b2bd3022,,,0,5411,"147-1-2, PALEKHSKAYAMOSKVA129347 RUSRUS",55.879101,37.712822,2ef051659d4e349e67e008841a3abe2e,2017-07-22,,
729126,2.162127,,,,3,0,643,cc4fa790bab00bd050dfaa7b1cba7c20,55.681999,37.542,1,5499,30B BOLSHAYA CHERKIZOVSKAYAMOSCOW101000 RUSRUS,55.799835,37.740902,430c804e199bc06d1cc7aa8eea1ba73a,2017-10-07,55.655998,37.523998
1081003,2.701486,"Долгопрудный, пр. Лихачевский, д. 64",,,8,0,643,97a53a67cbbb7547329914f0f0925eca,,,0,6011,,,,5d0412d27f1904c32a69371c11b68b2a,2017-06-10,,
304972,2.355235,,,,223,0,643,a47488072e942ff35ae09eeb6929c673,55.799,38.471001,1,5541,NNOGINSK142400 RUSRUS,55.85535,38.441189,1804c7820d1851c1443bb7417c7b2149,2017-07-22,,
901683,1.656208,,,,134,0,643,1f08f8ce6827b7d38c44b6b53e9f4135,,,0,5814,1 BETANKURA STRN.NOVGOROD603000 RUSRUS,56.338295,43.954891,91df41bf71d616d328275ee9106672ac,2017-07-20,,


### Фичи для даты

In [9]:
df['weekday'] = df['transaction_date'].dt.weekday.astype(np.int32)

In [10]:
df.sample(5)

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,is_train,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon,weekday
1202415,4.307857,"Самара, ул. Гагарина, д. 109",53.199001,50.202,67,0,643,4fda294413690440be8da61bf6964455,53.216999,50.164001,1,6011,,,,d7fe17094fedb96008f087a38bd26d79,2017-02-18,55.808998,37.519001,5
904978,3.064329,,,,270,0,643,c1f7029422460dd021345db99afc6ad6,55.810001,37.409,1,5261,LAGOVSKOE S.P.SLAGOVSKOE S.142184 RUSRUS,55.309696,37.5406,950c650d83a6db29ef0eee5a10c4aac1,2017-04-30,55.745998,37.651001,6
95012,2.5814,,,,104,0,643,2604b2dd1153b3b6ed00e36d95415db5,59.112999,37.988998,1,5921,14 YUBILEYNAYA STRCHEREPOVETS162600 RUSRUS,59.117268,37.992489,76212462abc0fa09e1c648833491e04f,2017-10-20,,,4
361785,3.062847,,,,3,0,643,8200698319276bcd457a259272843c8f,55.754002,37.778,1,5921,"SAVVINSKAYA NAB., 19MOSCOW119435 RUSRUS",55.734222,37.564682,2105135786b6bb2a56c4cada498fa22b,2017-07-03,,,0
122493,2.901114,,,,16,0,643,6ed32308354b24b955562699802c4e71,,,0,5411,178 STASOVA STR BLD TRK GALAKRASNODAR350000 ...,45.029842,39.046127,9210b88591eb1fe056c9dbe0533390a6,2017-07-02,,,6


### Приводим адрес транзакции для pos и atm-транзакций к единообразному виду

In [11]:
%%time
df['is_atm'] = (~df['atm_address_lat'].isnull()).astype(np.int32)
df['is_pos'] = (~df['pos_address_lat'].isnull()).astype(np.int32)

df['address_lat'] = df['atm_address_lat'].fillna(0) + df['pos_address_lat'].fillna(0)
df['address_lon'] = df['atm_address_lon'].fillna(0) + df['pos_address_lon'].fillna(0)

df.drop(['atm_address_lat','atm_address_lon','pos_address_lat','pos_address_lon'], axis = 1, inplace = True)

# удалим транзакции без адреса
df.drop(df[((df['address_lon'] == 0) & (df['address_lon'] == 0))].index, axis = 0, inplace = True)

Wall time: 1.79 s


In [12]:
df.sample(5)

Unnamed: 0,amount,atm_address,city,country,currency,customer_id,home_add_lat,home_add_lon,is_train,mcc,pos_address,terminal_id,transaction_date,work_add_lat,work_add_lon,weekday,is_atm,is_pos,address_lat,address_lon
314076,2.65937,,158,0,643,333a9a49e305d9e1f857960facab45f7,43.911999,39.332001,1,5732,"4, TSIOLKOVSKOGOSOCHI354200 RUSRUS",e2affa59088091aeef608044b08503ba,2017-07-21,43.587002,39.726002,4,0,1,43.91906,39.317429
250278,2.160274,,104,0,643,1ce9847635f136fa5d542b061a71c4f5,59.123001,37.887001,1,5999,4 BARDINA STRCHEREPOVETS162600 RUSRUS,a084cea2ee0c9e484b47388a39de8c8c,2017-07-22,,,5,0,1,59.126484,37.882259
1150531,4.484488,"Новомосковск, ул. Орджоникидзе, д. 2а",222,0,643,6ce4cb5c1908c029059d9cfcdb840a62,55.804001,37.393002,1,6011,,7654bc12c79ca62ab979edc2ee98ecea,2017-04-06,55.616001,36.862999,3,1,0,54.035999,38.264999
883560,2.613703,,3,0,643,b0c2948315f70a2083346d9be9d7e3a7,55.854,37.564999,1,5945,89 DMITROVSKOE SHMOSCOW121059 RUSRUS,4b241ca30138231e5eb112ce81a08a6e,2017-10-28,,,5,0,1,55.863297,37.546581
98199,2.257897,,104,0,643,c6a12f71a7f2d79d9f458ddb507c89d0,59.145,37.966999,1,5411,"118, LENINACHEREPOVETS162610 RUSRUS",8dc9600234670193a4ff6d97454b1860,2017-08-27,,,6,0,1,59.129738,37.889091


### Генерируем признаки is_home, is_work

In [13]:
def custom_metrics(lat, lon, eps=0.02):
    return (np.sqrt((lat**2) + (lon**2)) <= eps).astype(np.int32)

In [14]:
%%time
lat_h = df['home_add_lat'] - df['address_lat']
lon_h = df['home_add_lon'] - df['address_lon']
df['is_home'] = custom_metrics(lat_h, lon_h)
df['has_home'] = (~df['home_add_lon'].isnull()).astype(np.int32)

lat_w = df['work_add_lat'] - df['address_lat']
lon_w = df['work_add_lon'] - df['address_lon']
df['is_work'] = custom_metrics(lat_w, lon_w)
df['has_work'] = (~df['work_add_lon'].isnull()).astype(np.int32)

df.drop(['work_add_lat','work_add_lon','home_add_lat','home_add_lon'], axis = 1, inplace = True)

Wall time: 551 ms


### Транзакции в день

In [15]:
%%time
df = df.merge(df.groupby(['customer_id', 'transaction_date'])['amount'].count().reset_index(name = 'trans_in_day'), how = 'left')

Wall time: 3.4 s


### Обработка адресов терминалов и категориальные фичи для них

In [16]:
%%time
df['address'] = df['address_lat'].apply(lambda x: "%.02f" % x) + ';' + df['address_lon'].apply(lambda x: "%.02f" % x)
df['address'] = df['address'].factorize()[0].astype(np.int32)
df.sample(5)

Wall time: 4.78 s


### Генерируем несколько абонентских фич

In [20]:
%%time
# количество транзакций каждого клиента
df = df.merge(df.groupby('customer_id')['amount'].count().reset_index(name = 'customer_trans'), how = 'left')
df['customer_trans'] = df['customer_trans'].astype(np.int32)

df = df.merge(df.groupby(['customer_id','address'])['amount'].count().reset_index(name = 'customer_trans_addr'), how = 'left')
df['customer_trans_addr'] = df['customer_trans_addr'].astype(np.int32)

# какая часть транзакций клиента приходится на данный адрес
df['ratio1'] = df['customer_trans_addr'] / df['customer_trans']

Wall time: 6.7 s


In [21]:
df.sample(5)

Unnamed: 0,amount,atm_address,city,country,currency,customer_id,is_train,mcc,pos_address,terminal_id,...,address_lon,is_home,has_home,is_work,has_work,trans_in_day,address,customer_trans,customer_trans_addr,ratio1
1292732,1.781931,,3,0,643,a1a71450df84f87fc8abe6a24fa4cac0,0,4111,KOMSOMOLSKAYA PL. D.5MOSCOW107140 RUSRUS,91cacef394ae3a4b6518883cb9284160,...,37.657352,0,0,0,0,1,69,537,77,0.143389
730928,2.6985,,0,0,643,56d03e98042d6cde5c028b581726fc6f,1,5261,,4d9db6fefdb843b31640e1af7ddf0d7a,...,30.301796,0,1,0,1,1,243,85,15,0.176471
1514886,2.773227,,3,0,643,8a553681c5dba3f88ab95ce9223988cd,0,5812,15 NOVYY ARBAT STRMOSCOW119019 RUSRUS,9134284de9197cebba1db3e4bdc8a081,...,37.592236,0,0,0,0,2,682,64,6,0.09375
1178607,3.350161,,3,0,643,3dfc208ef462606d0c8ae37a8a4dd923,0,5411,1 SIRENEVYY BLVD BLD 5MOSCOW141000 RUSRUS,6dd90251e5c567d2abe033889f2cc0f4,...,37.766235,0,0,0,0,1,2331,87,14,0.16092
1107189,1.937711,,2472,0,643,67dbe8c7dda2b42060682a8d19b4da8c,0,5411,17 DOBROSLAVINA PR-TDYATKOVO242600 RUSRUS,ec7790a916ef9a71d97c9d47dc8c3446,...,34.341263,0,0,0,0,2,13400,140,31,0.221429


### Нахождение района через обратную геолокацию

In [22]:
%%time
coordinates = tuple(np.hstack((df.address_lat.values.reshape(-1,1), df.address_lon.values.reshape(-1,1))))
coordinates = tuple(map(tuple, coordinates))
coordinates_info = revgeo.search(coordinates, verbose=False)
disctrict_name = list(map(lambda x: (x['name']), coordinates_info))
df['district'] = df['city'].apply(str)+'_'+disctrict_name
df['district'] = df['district'].factorize()[0].astype(np.int16)

Wall time: 15.6 s


In [23]:
df.sample(5)

Unnamed: 0,amount,atm_address,city,country,currency,customer_id,is_train,mcc,pos_address,terminal_id,...,is_home,has_home,is_work,has_work,trans_in_day,address,customer_trans,customer_trans_addr,ratio1,district
1531662,2.129702,,2,0,643,b7a8ead0e6f04ad478872b46796e794b,0,5941,107 VATUTINA STR BLD STTS MENOVOSIBIRSK630024 ...,2f5db832be4ab6dd3b8e3107a6f439df,...,0,0,0,0,2,500,80,4,0.05,4
1378724,1.677971,,1,0,643,ac2a72be12cddbb07b7a3f9a3740e70f,0,5411,154 SEDOVA STR BLD ASANKT-PETERBU190000 RUSRUS,4e590cc94b863d5e63acf1a80285b11e,...,0,0,0,0,1,2671,236,4,0.016949,205
1662148,1.772827,,3,0,643,61e6122eb7ce7d912dfd61da324e18bc,0,5411,"KUTUZOVSKI PR., D.8MOSCOW121248 RUS",d369602ea6116d2cf50a5d894d5d400e,...,0,0,0,0,6,1171,514,108,0.210117,37
406033,2.852081,,3,0,643,df4f3045faee9bbb8c1520e64d80b58b,1,5641,88 KUTUZOVSKIY PR-TMOSCOW141000 RUSRUS,a6794860ce420b7dfeb28b175d1d6351,...,0,1,0,0,1,967,80,3,0.0375,34
231909,1.846101,,125,0,643,53814b7cd2bc49de90cbf133e330e111,1,5499,ZVEREVOY UL D 1 KOR 8GATCHINA188300 47 RUS,4a6f8abd7596d9aeb15c7dbd6d128310,...,0,1,0,0,1,763,52,5,0.096154,403


### Сумма транзакций для клиента для каждого терминала

In [24]:
df = df.merge(df.groupby(['customer_id', 'address'])['amount'].sum().reset_index(name = 'pos_amount'), how = 'left')

### Категориальные фичи для ИД терминала

In [25]:
df['terminal_label'] = df['terminal_id'].factorize()[0].astype(np.int16)

### Подсчет частоты использования терминала

In [26]:
df = df.merge(df.groupby('terminal_id')['address'].count().reset_index(name='terminal_use_freq'), how='left')

## Вспомогательные функции для оценки точности классификатора

In [29]:
def _best(x):
    ret = None
    for col in ys:
        pred = ('pred:%s' % col)
        if pred in x:
            i = (x[pred].idxmax())
            cols = [pred,'address_lat','address_lon']
            if col in x:
                cols.append(col)
            tmp = x.loc[i,cols]
            tmp.rename({
                'address_lat':'%s:add_lat' % col,
                'address_lon':'%s:add_lon' % col,
            }, inplace = True)
            if ret is None:
                ret = tmp
            else:
                ret = pd.concat([ret, tmp])
    return ret

In [30]:
def predict_proba(df, ys = ['is_home', 'is_work']):
    for col in ys:
        pred = ('pred:%s' % col)
        df[pred] = model[col].predict_proba(df[xs])[:,1]
    return df.groupby('customer_id').apply(_best).reset_index()

In [31]:
def score(df, ys = ['is_home', 'is_work']):
    df_ret = predict_proba(df, ys)
    mean = 0.0
    for col in ys:
        col_mean = df_ret[col].mean()
        mean += col_mean
    if len(ys) == 2:
        mean = mean / len(ys)
    return mean

# Обучение

In [39]:
%%time
# фичи, на которых идет обучение
# 'is_pos', 'trans_in_day','country','currency'
xs = ['city','mcc','ratio1', 'district', 'address', 'weekday','terminal_use_freq', 'terminal_address']
ys = ['is_home', 'is_work']

# модели
model0 = {
    'is_home': xgb.XGBClassifier(max_depth=4, n_estimators=200, nthread = 3, seed=42),
    'is_work': xgb.XGBClassifier(n_estimators = 150, nthread = 3, seed=42),
}

model = {}

# последовательно обучаем два классификатора
for col in ['is_home', 'is_work']:
    
    #выберем для обучение транзакции только тех клиентов из train, у которых хоть в одной транзакции указано место работы/жительства
    cust_train = df[df['is_train'] == 1].groupby('customer_id')[col.replace('is_','has_')].max()
    cust_train = cust_train[cust_train > 0].index
    
    #разобъем train на train/valid для валидации
    cust_train, cust_valid = train_test_split(cust_train, test_size = 0.2, shuffle = True, random_state = 2)
    
    train = pd.DataFrame(cust_train, columns = ['customer_id']).merge(df, how = 'left')
    valid = pd.DataFrame(cust_valid, columns = ['customer_id']).merge(df, how = 'left')

    print ("Training:", col)
    clf = sklearn.base.clone(model0[col])
    clf.fit(train[xs], train[col], eval_metric = 'logloss', eval_set = [(train[xs], train[col]), (valid[xs], valid[col])], verbose=10)
    model[col] = clf
    print ("Train accuracy:", score(train, ys = [col]))
    print ("Test accuracy:", score(valid, ys = [col]))
    print ()
print(model['is_home'].feature_importances_)
print(model['is_work'].feature_importances_)

Training: is_home
[0]	validation_0-logloss:0.658649	validation_1-logloss:0.657755
[10]	validation_0-logloss:0.51282	validation_1-logloss:0.509981
[20]	validation_0-logloss:0.479313	validation_1-logloss:0.476438
[30]	validation_0-logloss:0.46695	validation_1-logloss:0.46565
[40]	validation_0-logloss:0.460062	validation_1-logloss:0.46066
[50]	validation_0-logloss:0.455729	validation_1-logloss:0.458579
[60]	validation_0-logloss:0.452927	validation_1-logloss:0.457373
[70]	validation_0-logloss:0.450226	validation_1-logloss:0.455687
[80]	validation_0-logloss:0.448018	validation_1-logloss:0.455401
[90]	validation_0-logloss:0.445527	validation_1-logloss:0.454466
[100]	validation_0-logloss:0.44338	validation_1-logloss:0.453722
[110]	validation_0-logloss:0.44102	validation_1-logloss:0.453383
[120]	validation_0-logloss:0.43925	validation_1-logloss:0.453033
[130]	validation_0-logloss:0.437491	validation_1-logloss:0.453147
[140]	validation_0-logloss:0.435192	validation_1-logloss:0.452649
[150]	vali

In [136]:
%%time
cust_test = df[df['is_train'] == 0]['customer_id'].unique()
test = pd.DataFrame(cust_test, columns = ['customer_id']).merge(df, how = 'left')
test = predict_proba(test)
test.rename(columns = {
        'customer_id':'_ID_',
        'is_home:add_lat': '_HOME_LAT_',
        'is_home:add_lon': '_HOME_LON_',
        'is_work:add_lat': '_WORK_LAT_',
        'is_work:add_lon': '_WORK_LON_'}, inplace = True)
test = test[['_ID_', '_WORK_LAT_', '_WORK_LON_', '_HOME_LAT_', '_HOME_LON_']]

Wall time: 59.4 s


# Формируем submission-файл

In [137]:
# Заполняем пропуски
submission = submission.merge(test, how = 'left').fillna(0)

# Пишем файл submission
submission.to_csv('submission.csv', index = False)