In [1]:
import pandas as pd
import numpy as np
import datetime
import re
import pickle
import sys
from collections import Counter

import xgboost as xgb
import sklearn
from sklearn.model_selection import train_test_split

import geocoder
import pycountry

from tqdm import tqdm_notebook, tqdm

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
dtypes = {
    'transaction_date': str,
    'atm_address': str,
    'country': str,
    'city': str,
    'amount': np.float32,
    'currency': np.float32,
    'mcc': str,
    'customer_id': str,
    'terminal_id': str,
    'pos_address': str,
    'atm_address': str,
    'pos_adress_lat': np.float32,
    'pos_adress_lon': np.float32,
    'pos_address_lat': np.float32,
    'pos_address_lon': np.float32,
    'atm_address_lat': np.float32,
    'atm_address_lon': np.float32,
    'home_add_lat': np.float32,
    'home_add_lon': np.float32,
    'work_add_lat': np.float32,
    'work_add_lon': np.float32,
}

# Загрузка данных

In [4]:
train = pd.read_csv('data/train_set.csv.gz', compression='gzip', dtype=dtypes)
train.rename(columns = {'pos_adress_lat': 'pos_address_lat', 'pos_adress_lon': 'pos_address_lon'}, inplace = True)

test = pd.read_csv('data/test_set.csv.gz', compression='gzip', dtype=dtypes)
submission = pd.DataFrame(test['customer_id'].unique(), columns = ['_ID_'])

# соединяем test/train в одном DataFrame
train['is_train'] = np.int32(1)
test['is_train'] = np.int32(0)
dt = pd.concat([train, test])

del train, test

In [4]:
# train = train[(train.country == 'RU ') | (train.country == 'RUS')]
# countries_all = set(train[~train.country.isnull()].country.apply(clean))

# Нормализация городов

In [5]:
def clean(entity):
    return re.sub('\s+', ' ', entity.strip())

def country_resolve(country):
    if not (3 >= len(country) >= 2):
        return None
    try:
        return pycountry.countries.lookup(country)
    except LookupError:    
        if len(country) != 3:
            return None
        try:
            return pycountry.countries.lookup(country[:2])
        except LookupError:
            return None
    
country_resolve('RU')

Country(alpha_2='RU', alpha_3='RUS', name='Russian Federation', numeric='643')

In [6]:
cities_all = set(
    dt[(~dt.city.isnull()) & (~dt.country.isnull())] \
        .apply(lambda x: (clean(x['city']), clean(x['country'])), axis=1))

In [7]:
cities_geocodes = pickle.load(open("cities_geocodes.p", "rb" ))

In [8]:
raise Exception('load cities from the pickle')

cities_geocodes = {}

for i in range(0, 10):
    print('>>>', i)
    
    chunk_size = 1000
    for key in tqdm_notebook(list(cities_all)[chunk_size*i : chunk_size*(i+1)]):
        #if idx % 10 == 0:
        #    print(idx, len(cities_all))
        #    sys.__stdout__.write("{} {}\n".format(idx, len(cities_all)))

        #if city is None or city == '':
        #    print(city, country)
        #if country is None or country == '':
        #    print(city, country)
        city, country = key
        if key in cities_geocodes and cities_geocodes[key]['status'] == 'OK':
            continue

        country_resolved = country_resolve(country)
        if country_resolved is None:
            continue
        country_resolved_name = country_resolved.name
        if country_resolved_name == 'Russian Federation':
            country_resolved_name = 'Russia'

        attempts = 12
        g = None
        while g is None or (attempts > 0 and g.status != 'OK'):
            g = geocoder.yandex(city + ', ' + country_resolved_name)
            attempts -= 1

        if g.status == 'OK':
            cities_geocodes.update({key: g.json})
    
    pickle.dump(cities_geocodes, open('cities_geocodes_{}.p'.format(i), 'wb'))

Exception: load cities from the pickle

In [9]:
dt.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,is_train,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
0,2.884034,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,1,5261,,59.844074,30.179153,11606fde0c814ce78e0d726e39a0a5ee,2017-07-15,59.847,30.177
1,2.775633,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,1,5261,,59.844074,30.179153,e9647a5e1eacfb06713b6af755ccc595,2017-10-27,59.847,30.177
2,3.708368,,,,St Petersburg,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,1,5992,"PR.MARSHALA ZHUKOVA,31St Petersburg190000 7...",59.8582,30.229023,df06c1fcd3718a514535ae822785f716,2017-10-03,59.847,30.177
3,2.787498,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,1,5261,,59.844074,30.179153,6c5e5793ebc984fb72875feffff62854,2017-09-09,59.847,30.177
4,2.89251,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,1,5261,,59.844074,30.179153,0576445d74e374c92c0902e612fca356,2017-07-06,59.847,30.177


In [10]:
def clean_country(country):
    cntr = country_resolve(clean(country))
    if cntr is None:
        return None
    return cntr.name

dt['country_clean'] = dt['country'].apply(clean_country)

In [11]:
def clean_city(row):
    country = clean(str(row['country']))
    city = clean(str(row['city']))
    key = (city, country)
    
    if key in cities_geocodes:
        val = cities_geocodes[key]['raw']['name']
        #if val not in {'Saint Petersburg', 'Novosibirsk', 'Moscow', 'Novorossiysk'}:
        #    print(city, '|', country, '||', val)
        return val
    else:
        return None

dt['city_clean'] = dt.apply(clean_city, axis=1)

# Нормализация MCC

In [12]:
dt['mcc'] = dt['mcc'].apply(lambda x: int(str(x).replace(',', ''))).astype(np.int32)

In [39]:
mcc_codes = pd.read_csv('data/mcc_codes.csv')

mcc_codes_dict = {}
for row in list(mcc_codes.iterrows()):
    mcc = row[1]['mcc']
    gr = row[1]['irs_description']
    if gr is None or gr == '' or (isinstance(gr, float) and np.isnan(gr)):
        continue
    #print(row[1]['mcc'])
    #print(row[1]['irs_description'])
    mcc_codes_dict.update({mcc: gr})

def mcc_group_do(mcc_code):
    if mcc_code == 5816:
        return 'Digital Goods: Games'
    else:
        return mcc_codes_dict[mcc_code]
    
dt['mcc_group'] = dt['mcc'].apply(mcc_group_do).factorize()[0].astype(np.int32)
    
del mcc_codes

# Факторизация города и страны

In [13]:
dt['city_clean_factorized'] = dt['city_clean'].factorize()[0].astype(np.int32)
dt['country_clean_factorized'] = dt['country_clean'].factorize()[0].astype(np.int32)

In [14]:
dt.shape

(2490204, 23)

# Удаляем транзакции без даты

In [15]:
dt.drop(dt[dt['transaction_date'].isnull()].index, axis = 0, inplace = True)
dt['transaction_date'] = dt['transaction_date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))

In [16]:
dt.shape

(2490114, 23)

### Фичи для даты

In [17]:
print(dt.transaction_date.min(), dt.transaction_date.max())

2017-01-27 00:00:00 2017-12-08 00:00:00


In [18]:
dt['weekday'] = dt['transaction_date'].dt.weekday.astype(np.int32)

In [19]:
months = ['Январь','Февраль','Март','Апрель','Май','Июнь','Июль','Август','Сентябрь','Октябрь','Ноябрь','Декабрь']
nonworking_days = set()

for month_idx, row in enumerate(pd.read_csv('data/data_nonworking_days_russia.csv').loc[18,months]):
    #print(month_idx, row.split(','))
    for day in row.split(','):
        if day[-1] == '*':
            continue
        nonworking_days.add(datetime.datetime(2017, month_idx + 1, int(day)))
    
del months

print(datetime.datetime(2017, 5, 9) in nonworking_days)

True


In [20]:
def is_working_day_transform(dtime):
    if dtime in nonworking_days:
        return 0
    else:
        return 1

dt['is_working_day'] = dt['transaction_date'].apply(is_working_day_transform).astype(np.int32)

### Приводим адрес транзакции для pos и atm-транзакций к единообразному виду

In [21]:
dt['is_atm'] = (~dt['atm_address_lat'].isnull() & ~dt['atm_address_lon'].isnull()).astype(np.int32)
dt['is_pos'] = (~dt['pos_address_lat'].isnull() & ~dt['pos_address_lon'].isnull()).astype(np.int32)

dt['address_lat'] = dt['atm_address_lat'].fillna(0) + dt['pos_address_lat'].fillna(0)
dt['address_lon'] = dt['atm_address_lon'].fillna(0) + dt['pos_address_lon'].fillna(0)

dt.drop(['atm_address_lat','atm_address_lon','pos_address_lat','pos_address_lon'], axis=1, inplace=True)

# удалим транзакции без адреса
dt.drop(dt[((dt['address_lon'] == 0) & (dt['address_lat'] == 0))].index, axis=0, inplace=True)

### Генерируем признаки is_home, is_work

In [22]:
lat = dt['home_add_lat'] - dt['address_lat']
lon = dt['home_add_lon'] - dt['address_lon']
dt['is_home'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int32)
dt['has_home'] = (~dt['home_add_lon'].isnull()).astype(np.int32)

lat = dt['work_add_lat'] - dt['address_lat']
lon = dt['work_add_lon'] - dt['address_lon']
dt['is_work'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int32)
dt['has_work'] = (~dt['work_add_lon'].isnull()).astype(np.int32)

dt.drop(['work_add_lat','work_add_lon','home_add_lat','home_add_lon'], axis=1, inplace=True)

### Генерируем категориальный признак для адреса

In [23]:
dt['address'] = dt['address_lat'].apply(lambda x: "%.02f" % x) + ';' + dt['address_lon'].apply(lambda x: "%.02f" % x)
dt['address'] = dt['address'].factorize()[0].astype(np.int32)

In [24]:
(dt['address_lat'].apply(lambda x: "%.02f" % x) + ';' + dt['address_lon'].apply(lambda x: "%.02f" % x)).to_csv('tmp.csv')

### Генерируем несколько абонентских фич

In [25]:
dt.currency.unique()

array([643., 978., 784., 840.])

In [30]:
average_rates = {
    643: 1.0, 
    978: 65.8714, 
    784: 16.0,
    840: 58.3086
}

def amount_clean_do(row):
    currency = row['currency']
    return row['amount'] * average_rates[currency]
dt['amount_clean'] = dt.apply(amount_clean_do, axis=1)

del average_rates

In [32]:
# количество транзакций каждого клиента
dt = dt.merge(dt.groupby('customer_id')['amount_clean'].count().reset_index(name = 'tx'), how = 'left')
dt['tx'] = dt['tx'].astype(np.int32)

dt = dt.merge(dt.groupby(['customer_id','address'])['amount_clean'].count().reset_index(name = 'tx_cust_addr'), how = 'left')
dt['tx_cust_addr'] = dt['tx_cust_addr'].astype(np.int32)

# какая часть транзакций клиента приходится на данный адрес
dt['ratio1'] = dt['tx_cust_addr'] / dt['tx']

In [33]:
dt = dt.merge(dt.groupby(['customer_id', 'mcc'])['amount_clean'].count().reset_index(name='mcc_distinct'), how='left')

## Вспомогательные функции для оценки точности классификатора

In [41]:
def _best(x):
    ret = None
    for col in ys:
        pred = ('pred:%s' % col)
        if pred in x:
            i = (x[pred].idxmax())
            cols = [pred,'address_lat','address_lon']
            if col in x:
                cols.append(col)
            tmp = x.loc[i,cols]
            tmp.rename({
                'address_lat':'%s:add_lat' % col,
                'address_lon':'%s:add_lon' % col,
            }, inplace = True)
            if ret is None:
                ret = tmp
            else:
                ret = pd.concat([ret, tmp])
    return ret

In [42]:
def predict_proba(dt, ys = ['is_home', 'is_work']):
    for col in ys:
        pred = ('pred:%s' % col)
        dt[pred] = model[col].predict_proba(dt[xs])[:,1]
    return dt.groupby('customer_id').apply(_best).reset_index()

In [43]:
def score(dt, ys = ['is_home', 'is_work']):
    dt_ret = predict_proba(dt, ys)
    mean = 0.0
    for col in ys:
        col_mean = dt_ret[col].mean()
        mean += col_mean
    if len(ys) == 2:
        mean = mean / len(ys)
    return mean

### Признаки, на которых будем обучать модель

In [44]:
dt.columns

Index(['amount', 'atm_address', 'city', 'country', 'currency', 'customer_id',
       'is_train', 'mcc', 'pos_address', 'terminal_id', 'transaction_date',
       'country_clean', 'city_clean', 'city_clean_factorized',
       'country_clean_factorized', 'weekday', 'is_working_day', 'is_atm',
       'is_pos', 'address_lat', 'address_lon', 'is_home', 'has_home',
       'is_work', 'has_work', 'address', 'amount_clean', 'tx', 'tx_cust_addr',
       'ratio1', 'mcc_distinct', 'mcc_group'],
      dtype='object')

In [45]:
xs = ['amount_clean','currency','city_clean_factorized','country_clean_factorized','mcc', 'mcc_group', 'mcc_distinct', 'is_atm',
      'is_pos','weekday','is_working_day','ratio1']
ys = ['is_home', 'is_work']

In [46]:
params = {
    'max_depth': 10,
    'n_estimators': 670,
    'learning_rate': 0.09,
    'colsample_bytree': 0.9,
    'colsample_bylevel': 0.6,
    'n_jobs': -1
}
model0 = {
    'is_home': xgb.XGBClassifier(params=params),
    'is_work': xgb.XGBClassifier(params=params),
}

In [47]:
model = {}

# последовательно обучаем два классификатора
for col in ['is_home', 'is_work']:
    
    #выберем для обучение транзакции только тех клиентов из train, 
    #у которых хоть в одной транзакции указано место работы/жительства
    cust_train = dt[dt['is_train'] == 1].groupby('customer_id')[col.replace('is_','has_')].max()
    cust_train = cust_train[cust_train > 0].index
    
    #разобъем train на train/valid для валидации
    cust_train, cust_valid = train_test_split(cust_train, test_size = 0.1, shuffle = True, random_state = 2)
    
    train = pd.DataFrame(cust_train, columns = ['customer_id']).merge(dt, how='left')
    valid = pd.DataFrame(cust_valid, columns = ['customer_id']).merge(dt, how='left')

    print("Training:", col)
    clf = sklearn.base.clone(model0[col])
    clf.fit(train[xs], train[col], eval_metric = 'logloss', eval_set = [(train[xs], train[col]), (valid[xs], valid[col])], verbose=10)
    model[col] = clf
    print("Train accuracy:", score(train, ys = [col]))
    print("Test accuracy:", score(valid, ys = [col]))
    print()

Training: is_home
[0]	validation_0-logloss:0.659771	validation_1-logloss:0.658203
[10]	validation_0-logloss:0.521018	validation_1-logloss:0.512803
[20]	validation_0-logloss:0.493036	validation_1-logloss:0.484286
[30]	validation_0-logloss:0.483494	validation_1-logloss:0.474773
[40]	validation_0-logloss:0.478438	validation_1-logloss:0.470081
[50]	validation_0-logloss:0.475763	validation_1-logloss:0.469084
[60]	validation_0-logloss:0.472877	validation_1-logloss:0.46817
[70]	validation_0-logloss:0.47062	validation_1-logloss:0.466971
[80]	validation_0-logloss:0.468469	validation_1-logloss:0.466239
[90]	validation_0-logloss:0.467252	validation_1-logloss:0.465948
[99]	validation_0-logloss:0.465932	validation_1-logloss:0.465253
Train accuracy: 0.3923333333333333
Test accuracy: 0.374

Training: is_work
[0]	validation_0-logloss:0.64721	validation_1-logloss:0.647118
[10]	validation_0-logloss:0.45376	validation_1-logloss:0.453329
[20]	validation_0-logloss:0.412395	validation_1-logloss:0.412821
[30

In [48]:
cust_test = dt[dt['is_train'] == 0]['customer_id'].unique()
test = pd.DataFrame(cust_test, columns = ['customer_id']).merge(dt, how = 'left')
test = predict_proba(test)
test.rename(columns = {
        'customer_id':'_ID_',
        'is_home:add_lat': '_HOME_LAT_',
        'is_home:add_lon': '_HOME_LON_',
        'is_work:add_lat': '_WORK_LAT_',
        'is_work:add_lon': '_WORK_LON_'}, inplace = True)
test = test[['_ID_', '_WORK_LAT_', '_WORK_LON_', '_HOME_LAT_', '_HOME_LON_']]

In [49]:
submission = submission.merge(test, how='left').fillna(0)

submission.to_csv('myltsev_submission.csv', index=False)