# Raiffeisen Data Cup 
### 112 LB / Final 116 of 548 LB 0.344339

На основе baseline от Антон Чикин 0.328625 (https://vk.com/boosters_official?w=wall-117459195_526)

Добавлены фичи про выходные дни и категории MCC (так как даже разные заправки имеют разный код), эта часть помечена как mktoid features
Запускалось на Google Colab 


In [1]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

downloaded = drive.CreateFile({'id': '1n3VrV6Hx_jXLW9fc7HGHEhBL3mo-qx_a'})
print('Downloaded content "{}"'.format((downloaded['title'])))
downloaded.GetContentFile('sample.csv')

downloaded = drive.CreateFile({'id': '1Plpet7kjTUQZvdKSV9bZUE1w6yXfEuQO'})
print('Downloaded content "{}"'.format((downloaded['title'])))
downloaded.GetContentFile(downloaded['title'])

downloaded = drive.CreateFile({'id': '1mtLiIBIanehQ46CkPkf9gtFQcCGBxk2Z'})
print('Downloaded content "{}"'.format((downloaded['title'])))
downloaded.GetContentFile(downloaded['title'])

downloaded = drive.CreateFile({'id': '1LT2noXNFFgm9iyIzaRngIvbiYXHrpeml'})
print('Downloaded content "{}"'.format((downloaded['title'])))
downloaded.GetContentFile(downloaded['title'])

downloaded = drive.CreateFile({'id': '1XIDBqKK2xsXzZLYn5Fo8NiSTR09TvEKr'})
print('Downloaded content "{}"'.format((downloaded['title'])))
downloaded.GetContentFile(downloaded['title'])


Downloaded content "sample.csv"
Downloaded content "test_set.csv"
Downloaded content "train_set.csv"
Downloaded content "Spravochnik_MCC_Cod w9l.xls"
Downloaded content "data-20171211T1403-structure-20171211T1403.csv"


In [0]:
# https://github.com/dmlc/xgboost
# This specific version is a work-around for a build issue in newer versions.
!pip install -q xgboost==0.4a30

In [0]:
import pandas as pd
import numpy as np
import datetime

import xgboost as xgb
import sklearn

from sklearn.model_selection import train_test_split

In [0]:
# Определим типы колонок для экономии памяти
dtypes = {
    'transaction_date': str,
    'atm_address': str,
    'country': str,
    'city': str,
    'amount': np.float32,
    'currency': np.float32,
    'mcc': str,
    'customer_id': str,
    'pos_address': str,
    'atm_address': str,
    'pos_adress_lat': np.float32,
    'pos_adress_lon': np.float32,
    'pos_address_lat': np.float32,
    'pos_address_lon': np.float32,
    'atm_address_lat': np.float32,
    'atm_address_lon': np.float32,
    'home_add_lat': np.float32,
    'home_add_lon': np.float32,
    'work_add_lat': np.float32,
    'work_add_lon': np.float32,
}

# для экономии памяти будем загружать только часть атрибутов транзакций
usecols_train = ['customer_id','transaction_date','amount','country', 'city', 'currency', 'mcc', 'pos_adress_lat', 'pos_adress_lon', 'atm_address_lat', 'atm_address_lon','home_add_lat','home_add_lon','work_add_lat','work_add_lon']
usecols_test = ['customer_id','transaction_date','amount','country', 'city', 'currency', 'mcc', 'pos_address_lat', 'pos_address_lon', 'atm_address_lat', 'atm_address_lon']

## Читаем train_set, test_set, соединяем в один датасет

In [0]:
train = pd.read_csv('train_set.csv', dtype = dtypes, usecols = usecols_train)
train.rename(columns = {'pos_adress_lat': 'pos_address_lat', 'pos_adress_lon': 'pos_address_lon'}, inplace = True)

test = pd.read_csv('test_set.csv', dtype = dtypes, usecols = usecols_test)
submission = pd.DataFrame(test['customer_id'].unique(), columns = ['_ID_'])

# соединяем test/train в одном DataFrame
train['is_train'] = np.int32(1)
test['is_train'] = np.int32(0)
dt = pd.concat([train, test])

del train, test

### Обрабатываем дату транзакции и категориальные признаки

In [0]:
dt['currency'] = dt['currency'].fillna(-1).astype(np.int32)
dt['mcc'] = dt['mcc'].apply(lambda x: int(x.replace(',', ''))).astype(np.int32)
dt['city'] = dt['city'].factorize()[0].astype(np.int32)
dt['country'] = dt['country'].factorize()[0].astype(np.int32)

# удаляем транзакции без даты
dt.drop(dt[dt['transaction_date'].isnull()].index, axis = 0, inplace = True)
dt['transaction_date'] = dt['transaction_date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))

### Фичи для даты

In [0]:
dt['weekday'] = dt['transaction_date'].dt.weekday.astype(np.int32)

### Приводим адрес транзакции для pos и atm-транзакций к единообразному виду

In [0]:
dt['is_atm'] = (~dt['atm_address_lat'].isnull()).astype(np.int32)
dt['is_pos'] = (~dt['pos_address_lat'].isnull()).astype(np.int32)

dt['address_lat'] = dt['atm_address_lat'].fillna(0) + dt['pos_address_lat'].fillna(0)
dt['address_lon'] = dt['atm_address_lon'].fillna(0) + dt['pos_address_lon'].fillna(0)

dt.drop(['atm_address_lat','atm_address_lon','pos_address_lat','pos_address_lon'], axis = 1, inplace = True)

# удалим транзакции без адреса
dt.drop(dt[((dt['address_lon'] == 0) & (dt['address_lon'] == 0))].index, axis = 0, inplace = True)

### Генерируем признаки is_home, is_work

In [0]:
lat = dt['home_add_lat'] - dt['address_lat']
lon = dt['home_add_lon'] - dt['address_lon']
dt['is_home'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int32)
dt['has_home'] = (~dt['home_add_lon'].isnull()).astype(np.int32)

lat = dt['work_add_lat'] - dt['address_lat']
lon = dt['work_add_lon'] - dt['address_lon']
dt['is_work'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int32)
dt['has_work'] = (~dt['work_add_lon'].isnull()).astype(np.int32)

dt.drop(['work_add_lat','work_add_lon','home_add_lat','home_add_lon'], axis = 1, inplace = True)

### Генерируем категориальный признак для адреса

In [0]:
dt['address'] = dt['address_lat'].apply(lambda x: "%.02f" % x) + ';' + dt['address_lon'].apply(lambda x: "%.02f" % x)
dt['address'] = dt['address'].factorize()[0].astype(np.int32)

### Генерируем несколько абонентских фич

In [0]:
# количество транзакций каждого клиента
dt = dt.merge(dt.groupby('customer_id')['amount'].count().reset_index(name = 'tx'), how = 'left')
dt['tx'] = dt['tx'].astype(np.int32)

dt = dt.merge(dt.groupby(['customer_id','address'])['amount'].count().reset_index(name = 'tx_cust_addr'), how = 'left')
dt['tx_cust_addr'] = dt['tx_cust_addr'].astype(np.int32)

# какая часть транзакций клиента приходится на данный адрес
dt['ratio1'] = dt['tx_cust_addr'] / dt['tx']

In [12]:
dt.sample()

Unnamed: 0,amount,city,country,currency,customer_id,is_train,mcc,transaction_date,weekday,is_atm,...,address_lat,address_lon,is_home,has_home,is_work,has_work,address,tx,tx_cust_addr,ratio1
1169967,2.625629,1217,0,643,2471e684f2bf5ddadbd61d3c199521e6,0,5411,2017-08-12,5,0,...,56.852627,35.849838,0,0,0,0,3167,84,3,0.035714


In [13]:
# mktoid features

!pip install xlrd

mcc_codes = pd.read_excel("Spravochnik_MCC_Cod w9l.xls") # data from http://www.banki.ru/wikibank/mcc-kod/
mcc_codes.drop(0, axis=0, inplace=True)

mcc_cats = mcc_codes[["Справочник МСС-кодов","Unnamed: 1", "Unnamed: 2"]]

mccc = {}

for index, row in mcc_cats.iterrows():
  mccc[int(row['Справочник МСС-кодов'])] = row["Unnamed: 2"]
  
def mcc2mcc_cat(mcc): #usage mcc2mcc_cat(5261)
  try:
    return mccc[int(mcc)]
  except:
    return 0
  
mcctypes = {}

for index, row in mcc_cats.iterrows():
  mcctypes[int(row['Справочник МСС-кодов'])] = row["Unnamed: 1"]

def mcc2mcc_type(mcc): #usage mcc2mcc_cat(5261)
  try:
    return mcctypes[int(mcc)]
  except:
    return 0
  
holidays = pd.read_csv('data-20171211T1403-structure-20171211T1403.csv') # data from data.gov.ru/opendata/7708660670-proizvcalendar

hol_cache = {}

def is_holiday(strdate):
  
  strdate = str(strdate)
  
  if strdate in hol_cache.keys():
    return hol_cache[strdate]

  try:
    year, month, day = strdate.split('-')
  except:
    return -1;
  
  colsindexes = {}
  i = 0
  for col in holidays.columns:
    colsindexes[i] = col
    i = i+1

  colsindexes[1]

  hol_in_mnt = holidays[holidays['Год/Месяц'] == int(year)][colsindexes[int(month)]].values[0]
  hols = hol_in_mnt.replace('*','').split(',')

  if str(int(day)) in hols:
    hol_cache[strdate] = 1
    return 1
  else:
    hol_cache[strdate] = 0
    return 0
  
from dateutil import parser

wd_cache = {}

def weekday(string):
  if string in wd_cache.keys():
    return wd_cache[string]
  try:
    dt = parser.parse(string)
    wd_cache[string] = dt.weekday()
    return dt.weekday()
  except:
    wd_cache[string] = -1
    return -1

Collecting xlrd
  Downloading xlrd-1.1.0-py2.py3-none-any.whl (108kB)
[K    100% |████████████████████████████████| 112kB 2.3MB/s 
[?25hInstalling collected packages: xlrd
Successfully installed xlrd-1.1.0


In [14]:
%%time

dt['mcc_type'] = dt['mcc'].apply(mcc2mcc_type).astype(str)

dt['mcc_group'] = dt['mcc'].apply(mcc2mcc_cat).astype(str)

dt['currency'] = dt['currency'].fillna(0).astype(int)

dt['is_russia'] = dt['country'].apply(lambda x: 1 if x == 'RUS' or x == 'RU' else 0)

dt['is_holiday'] = dt['transaction_date'].astype(str).apply(is_holiday)


CPU times: user 12.6 s, sys: 424 ms, total: 13.1 s
Wall time: 13.1 s


In [0]:
from sklearn.preprocessing import LabelEncoder

le_type = LabelEncoder()
le_group = LabelEncoder()

le_type.fit(dt['mcc_type'])
le_group.fit(dt['mcc_group'])

dt['mcc_type'] = le_type.transform(dt['mcc_type'])

dt['mcc_group'] = le_group.transform(dt['mcc_group'])

## Вспомогательные функции для оценки точности классификатора

In [0]:
def _best(x):
    ret = None
    for col in ys:
        pred = ('pred:%s' % col)
        if pred in x:
            i = (x[pred].idxmax())
            cols = [pred,'address_lat','address_lon']
            if col in x:
                cols.append(col)
            tmp = x.loc[i,cols]
            tmp.rename({
                'address_lat':'%s:add_lat' % col,
                'address_lon':'%s:add_lon' % col,
            }, inplace = True)
            if ret is None:
                ret = tmp
            else:
                ret = pd.concat([ret, tmp])
    return ret

In [0]:
def predict_proba(dt, ys = ['is_home', 'is_work']):
    for col in ys:
        pred = ('pred:%s' % col)
        dt[pred] = model[col].predict_proba(dt[xs])[:,1]
    return dt.groupby('customer_id').apply(_best).reset_index()

In [0]:
def score(dt, ys = ['is_home', 'is_work']):
    dt_ret = predict_proba(dt, ys)
    mean = 0.0
    for col in ys:
        col_mean = dt_ret[col].mean()
        mean += col_mean
    if len(ys) == 2:
        mean = mean / len(ys)
    return mean

### Признаки, на которых будем обучать модель

In [0]:
xs = ['amount','currency','city','country','is_atm','is_pos','ratio1','mcc_type','mcc_group','is_russia','is_holiday','weekday']
##xs = ['amount','currency','is_atm','is_pos','ratio1','mcc_type','mcc_group','is_russia','is_holiday','weekday']
ys = ['is_home', 'is_work']

# Создаем классификаторы
**Hint**: можно поигратьcя с гиперпараметрами для лучшего результата :)

In [0]:
model0 = {
    'is_home': xgb.XGBClassifier(),
    'is_work': xgb.XGBClassifier(),
}

# Обучаем классификаторы

In [21]:
model = {}

# последовательно обучаем два классификатора
for col in ['is_home', 'is_work']:
    
    #выберем для обучение транзакции только тех клиентов из train, у которых хоть в одной транзакции указано место работы/жительства
    cust_train = dt[dt['is_train'] == 1].groupby('customer_id')[col.replace('is_','has_')].max()
    cust_train = cust_train[cust_train > 0].index
    
    #разобъем train на train/valid для валидации
    cust_train, cust_valid = train_test_split(cust_train, test_size = 0.1, shuffle = True, random_state = 2)
    
    train = pd.DataFrame(cust_train, columns = ['customer_id']).merge(dt, how = 'left')
    valid = pd.DataFrame(cust_valid, columns = ['customer_id']).merge(dt, how = 'left')

    print ("Training:", col)
    clf = sklearn.base.clone(model0[col])
    clf.fit(train[xs], train[col], eval_metric = 'logloss', eval_set = [(train[xs], train[col]), (valid[xs], valid[col])], verbose=10)
    model[col] = clf
    print ("Train accuracy:", score(train, ys = [col]))
    print ("Test accuracy:", score(valid, ys = [col]))
    print ()


Training: is_home


[0]	validation_0-logloss:0.660081	validation_1-logloss:0.657865
[10]	validation_0-logloss:0.520271	validation_1-logloss:0.511804
[20]	validation_0-logloss:0.492357	validation_1-logloss:0.482038
[30]	validation_0-logloss:0.483562	validation_1-logloss:0.473762
[40]	validation_0-logloss:0.479480	validation_1-logloss:0.471127
[50]	validation_0-logloss:0.476409	validation_1-logloss:0.468769
[60]	validation_0-logloss:0.474438	validation_1-logloss:0.467155
[70]	validation_0-logloss:0.472759	validation_1-logloss:0.466255
[80]	validation_0-logloss:0.471724	validation_1-logloss:0.465397
[90]	validation_0-logloss:0.470486	validation_1-logloss:0.464576
[99]	validation_0-logloss:0.469717	validation_1-logloss:0.464712


Train accuracy: 0.38522222222222224
Test accuracy: 0.371

Training: is_work


[0]	validation_0-logloss:0.646982	validation_1-logloss:0.646691
[10]	validation_0-logloss:0.452797	validation_1-logloss:0.451160
[20]	validation_0-logloss:0.412228	validation_1-logloss:0.409166
[30]	validation_0-logloss:0.401046	validation_1-logloss:0.398479
[40]	validation_0-logloss:0.396597	validation_1-logloss:0.395533
[50]	validation_0-logloss:0.393499	validation_1-logloss:0.393837
[60]	validation_0-logloss:0.391409	validation_1-logloss:0.393847
[70]	validation_0-logloss:0.389794	validation_1-logloss:0.393996
[80]	validation_0-logloss:0.388342	validation_1-logloss:0.393981
[90]	validation_0-logloss:0.387277	validation_1-logloss:0.393525
[99]	validation_0-logloss:0.386290	validation_1-logloss:0.393387


Train accuracy: 0.2725118483412322
Test accuracy: 0.26744186046511625



# Predict

In [0]:
cust_test = dt[dt['is_train'] == 0]['customer_id'].unique()
test = pd.DataFrame(cust_test, columns = ['customer_id']).merge(dt, how = 'left')
test = predict_proba(test)
test.rename(columns = {
        'customer_id':'_ID_',
        'is_home:add_lat': '_HOME_LAT_',
        'is_home:add_lon': '_HOME_LON_',
        'is_work:add_lat': '_WORK_LAT_',
        'is_work:add_lon': '_WORK_LON_'}, inplace = True)
test = test[['_ID_', '_WORK_LAT_', '_WORK_LON_', '_HOME_LAT_', '_HOME_LON_']]

# Формируем submission-файл

In [0]:
# Заполняем пропуски
submission = submission.merge(test, how = 'left').fillna(0)

# Пишем файл submission
submission.to_csv('baseline-very-simple.csv', index = False)

In [0]:
from google.colab import files

files.download('baseline-very-simple.csv')