In [1]:
import pandas as pd
import numpy as np
import datetime
import lightgbm as xgb
import sklearn
import math
from sklearn.model_selection import train_test_split

In [2]:
# Определим типы колонок для экономии памяти
dtypes = {
    'transaction_date': str,
    'atm_address': str,
    'country': str,
    'city': str,
    'amount': np.float32,
    'currency': np.float32,
    'mcc': str,
    'customer_id': str,
    'pos_address': str,
    'atm_address': str,
    'pos_adress_lat': np.float32,
    'pos_adress_lon': np.float32,
    'pos_address_lat': np.float32,
    'pos_address_lon': np.float32,
    'atm_address_lat': np.float32,
    'atm_address_lon': np.float32,
    'home_add_lat': np.float32,
    'home_add_lon': np.float32,
    'work_add_lat': np.float32,
    'work_add_lon': np.float32,
    'mcckmeans_lat': np.float32,
    'mcckmeans_lon': np.float32,
    'kmeans_lat': np.float32,
    'kmeans_lon': np.float32,
}
                             
# для экономии памяти будем загружать только часть атрибутов транзакций
usecols_train = ['customer_id','transaction_date','amount','country', 'city', 'currency', 'mcc', 
                 'pos_adress_lat', 'pos_adress_lon', 'atm_address_lat', 'atm_address_lon','home_add_lat',
                 'home_add_lon','work_add_lat','work_add_lon']
usecols_test = ['customer_id','transaction_date','amount','country', 'city', 'currency', 'mcc', 
                'pos_address_lat', 'pos_address_lon', 'atm_address_lat', 'atm_address_lon']
usecols_dump = ['customer_id','transaction_date','amount','country', 'city', 'currency', 'mcc', 
                'pos_address_lat', 'pos_address_lon', 'atm_address_lat', 'atm_address_lon','home_add_lat',
                'home_add_lon','work_add_lat','work_add_lon','mcckmeans_lat','mcckmeans_lon','kmeans_lat','kmeans_lon']

In [3]:
train = pd.read_csv('train_set.csv', dtype = dtypes, usecols = usecols_train)
train.rename(columns = {'pos_adress_lat': 'pos_address_lat', 'pos_adress_lon': 'pos_address_lon'}, inplace = True)

test = pd.read_csv('test_set.csv', dtype = dtypes, usecols = usecols_test)
submission = pd.DataFrame(test['customer_id'].unique(), columns = ['_ID_'])

train['is_train'] = np.int32(1)
test['is_train'] = np.int32(0)
dt = pd.concat([train, test])

del train, test

In [4]:
dt['currency'] = dt['currency'].fillna(-1).astype(np.int32)
dt['mcc'] = dt['mcc'].apply(lambda x: int(x.replace(',', ''))).astype(np.int32)
dt['city'] = dt['city'].factorize()[0].astype(np.int32)
dt['country'] = dt['country'].factorize()[0].astype(np.int32)

dt.drop(dt[dt['transaction_date'].isnull()].index, axis = 0, inplace = True)
dt['transaction_date'] = dt['transaction_date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))

dt['weekday'] = dt['transaction_date'].dt.weekday.astype(np.int32)

In [5]:
dt['is_atm'] = (~dt['atm_address_lat'].isnull()).astype(np.int32)
dt['is_pos'] = (~dt['pos_address_lat'].isnull()).astype(np.int32)

dt['address_lat'] = dt['atm_address_lat'].fillna(0) + dt['pos_address_lat'].fillna(0)
dt['address_lon'] = dt['atm_address_lon'].fillna(0) + dt['pos_address_lon'].fillna(0)

dt.drop(['atm_address_lat','atm_address_lon','pos_address_lat','pos_address_lon'], axis = 1, inplace = True)

dt.drop(dt[((dt['address_lat'] == 0) & (dt['address_lon'] == 0))].index, axis = 0, inplace = True)

In [6]:
kmeanscentrs = set(('mcckmeans_lat', 'mcckmeans_lon','kmeans_lat', 'kmeans_lon'))
kmeanscentrs = sorted(list(kmeanscentrs))
dt['mcckmeans_lat']=dt['address_lat']
dt['mcckmeans_lon']=dt['address_lon']
dt['kmeans_lat']=dt['address_lat']
dt['kmeans_lon']=dt['address_lon']

customers = dt["customer_id"].value_counts().keys().tolist()

In [7]:
lat = dt['address_lat'] - dt['mcckmeans_lat']
lon = dt['address_lon'] - dt['mcckmeans_lon']
dt['dist_1'] = np.sqrt((lat ** 2) + (lon ** 2)).astype(np.float32)

lat = dt['address_lat'] - dt['kmeans_lat']
lon = dt['address_lon'] - dt['kmeans_lon']
dt['dist_2'] = np.sqrt((lat ** 2) + (lon ** 2)).astype(np.float32)

lat = dt['mcckmeans_lat'] - dt['kmeans_lat']
lon = dt['mcckmeans_lon'] - dt['kmeans_lon']
dt['dist_3'] = np.sqrt((lat ** 2) + (lon ** 2)).astype(np.float32)

In [8]:
lat = dt['home_add_lat'] - dt['address_lat']
lon = dt['home_add_lon'] - dt['address_lon']
dt['is_home'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.017).astype(np.int32)
dt['has_home'] = (~dt['home_add_lon'].isnull()).astype(np.int32)

lat = dt['work_add_lat'] - dt['address_lat']
lon = dt['work_add_lon'] - dt['address_lon']
dt['is_work'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.017).astype(np.int32)
dt['has_work'] = (~dt['work_add_lon'].isnull()).astype(np.int32)

dt.drop(['work_add_lat','work_add_lon','home_add_lat','home_add_lon'], axis = 1, inplace = True)

In [9]:
dt['address'] = dt['address_lat'].apply(lambda x: "%.02f" % x) + ';' + dt['address_lon'].apply(lambda x: "%.02f" % x)
dt['address'] = dt['address'].factorize()[0].astype(np.int32)

In [10]:
dt = dt.merge(dt.groupby('customer_id')['amount'].count().reset_index(name = 'tx'), how = 'left')
dt['tx'] = dt['tx'].astype(np.int32)

dt = dt.merge(dt.groupby(['customer_id','address'])['amount'].count().reset_index(name = 'tx_cust_addr'), how = 'left')
dt['tx_cust_addr'] = dt['tx_cust_addr'].astype(np.int32)

dt['ratio1'] = dt['tx_cust_addr'] / dt['tx']

In [11]:
def _best(x):
    ret = None
    for col in ys:
        pred = ('pred:%s' % col)
        if pred in x:
            i = (x[pred].idxmax())
            cols = [pred,'address_lat','address_lon']
            if col in x:
                cols.append(col)
            tmp = x.loc[i,cols]
            tmp.rename({
                'address_lat':'%s:add_lat' % col,
                'address_lon':'%s:add_lon' % col,
            }, inplace = True)
            if ret is None:
                ret = tmp
            else:
                ret = pd.concat([ret, tmp])
    return ret
    
def predict_proba(dt, ys = ['is_home', 'is_work']):
    for col in ys:
        pred = ('pred:%s' % col)
        dt[pred] = model[col].predict_proba(dt[xs])[:,1]
    return dt.groupby('customer_id').apply(_best).reset_index()

def score(dt, ys = ['is_home', 'is_work']):
    dt_ret = predict_proba(dt, ys)
    mean = 0.0
    for col in ys:
        col_mean = dt_ret[col].mean()
        mean += col_mean
    if len(ys) == 2:
        mean = mean / len(ys)
    return mean

In [13]:
xs = ['amount','city','country','currency','mcc','is_atm','is_pos','ratio1','dist_1',\
      'dist_2','dist_3']
ys = ['is_home', 'is_work']

model0 = {
    'is_home': xgb.LGBMClassifier(n_estimators = 77, n_jobs = 3),
    'is_work': xgb.LGBMClassifier(n_estimators = 15, n_jobs = 3),
}

from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas(desc="apply")

model = {}

for col in  tqdm_notebook(['is_home', 'is_work']):
    
    cust_train = dt[dt['is_train'] == 1].groupby('customer_id')[col.replace('is_','has_')].max()
    cust_train = cust_train[cust_train > 0].index
    
    cust_train, cust_valid = train_test_split(cust_train, test_size = 0.1, shuffle = True, random_state = 2)
    
    train = pd.DataFrame(cust_train, columns = ['customer_id']).merge(dt, how = 'left')
    valid = pd.DataFrame(cust_valid, columns = ['customer_id']).merge(dt, how = 'left')

    print ("Training:", col)
    clf = sklearn.base.clone(model0[col])
    clf.fit(train[xs], train[col], eval_metric = 'logloss', eval_set = [(train[xs], train[col]), (valid[xs], valid[col])], verbose=10)

    model[col] = clf
    print ("Train accuracy:", score(train, ys = [col]))
    print ("Test accuracy:", score(valid, ys = [col]))
    print ()

A Jupyter Widget

Training: is_home
[10]	valid_0's binary_logloss: 0.496329	valid_1's binary_logloss: 0.491254
[20]	valid_0's binary_logloss: 0.456101	valid_1's binary_logloss: 0.451867
[30]	valid_0's binary_logloss: 0.445082	valid_1's binary_logloss: 0.442113
[40]	valid_0's binary_logloss: 0.440008	valid_1's binary_logloss: 0.438959
[50]	valid_0's binary_logloss: 0.436569	valid_1's binary_logloss: 0.438134
[60]	valid_0's binary_logloss: 0.433866	valid_1's binary_logloss: 0.438818
[70]	valid_0's binary_logloss: 0.431187	valid_1's binary_logloss: 0.439429
Train accuracy: 0.3638888888888889
Test accuracy: 0.356

Training: is_work
[10]	valid_0's binary_logloss: 0.43444	valid_1's binary_logloss: 0.44627
Train accuracy: 0.23675140025850927
Test accuracy: 0.23449612403100775




In [14]:
cust_test = dt[dt['is_train'] == 0]['customer_id'].unique()
test = pd.DataFrame(cust_test, columns = ['customer_id']).merge(dt, how = 'left')
test = predict_proba(test)
test.rename(columns = {
        'customer_id':'_ID_',
        'is_home:add_lat': '_HOME_LAT_',
        'is_home:add_lon': '_HOME_LON_',
        'is_work:add_lat': '_WORK_LAT_',
        'is_work:add_lon': '_WORK_LON_'}, inplace = True)
test = test[['_ID_', '_WORK_LAT_', '_WORK_LON_', '_HOME_LAT_', '_HOME_LON_']]

In [15]:
submission = submission.merge(test, how = 'left').fillna(0)

submission.to_csv('submit13.csv', index = False)