In [1]:
import pandas as pd
import numpy as np
import re
from lightgbm import LGBMClassifier

from sklearn import metrics

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas(desc='apply')

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 300)

In [2]:
def dist(data , lat1 , lat2 , lon1 , lon2):
    return np.sqrt((data[lat1] - data[lat2]) ** 2 + (data[lon1] - data[lon2]) ** 2)

def dist_from_series(lat1 , lat2 , lon1 , lon2):
    return np.sqrt((lat1 - lat2) ** 2 + (lon1 - lon2) ** 2)


model_params = {
    'n_estimators':100, 
    'subsample':0.7,
    'random_state':42,
    'n_jobs':-1
}

def get_columns(data):
    to_drop = ['target_home', 'target_work', 'customer_id', 'transaction_date',
               'home_add_lat','home_add_lon','work_add_lat','work_add_lon', 
               'address_lat' , 'address_lon' , 'address']
    columns = list(set(data.columns) - set(to_drop) - set(data.dtypes[data.dtypes == object].keys()))
    return columns

def predict(model , data , columns , target):
    data = data.copy()
    data['predict_' + target] = model.predict_proba(data[columns])[:,1]
    tmp = data.groupby(['customer_id' , 'address_lat' , 'address_lon'])[['predict_' + target]].max()
    tmp = tmp.groupby(['customer_id']).idxmax()['predict_' + target].values
    predict = pd.DataFrame(tmp.tolist() , columns=['customer_id' , target + '_predict_lat' ,  target + '_predict_lon'])
    return predict

def get_good_data_for_valid(data):
    data = data.copy()
    columns = get_columns(data)
    
    data = data[data['home' + '_add_lat'].notnull() & data['work' + '_add_lat'].notnull()]
    data = data[data['address_lat'].notnull()]
    return data , columns

def validate_for_target(data , columns , cv):
    res = []
    
    for train_index , valid_index in tqdm_notebook(cv.split(X = data , groups=data['customer_id']) , total = cv.n_splits):
        x_tr , x_vl = data.iloc[train_index] , data.iloc[valid_index]
        
        res_for_fold = 0
        for target in ['home' , 'work']:
            lgbm = LGBMClassifier(**model_params)
            lgbm.fit(x_tr[columns] , x_tr['target_' + target])
            predict_data = predict(lgbm , x_vl , columns , target)

            real_answer = x_vl.groupby(['customer_id'] , as_index=False)[target + '_add_lat',target + '_add_lon'].mean()
            in_radius = dist_from_series(real_answer[target + '_add_lat'] , predict_data[target + '_predict_lat'] ,
                                         real_answer[target + '_add_lon'] , predict_data[target + '_predict_lon']) < 0.02
            res_for_fold += in_radius.mean()
        res.append(res_for_fold/2)
    return np.array(res)

def valid(data , n_splits = 5):
    data , columns = get_good_data_for_valid(data)
    k_fold = GroupKFold(n_splits = n_splits)
    res = validate_for_target(data  , columns,  k_fold)
    return res

In [3]:
train = pd.read_csv('data/train_set.csv').rename(columns={"pos_adress_lat": "pos_address_lat","pos_adress_lon": "pos_address_lon"})
test = pd.read_csv('data/test_set.csv')
sample = pd.read_csv('data/sample.csv')

### Clean data

In [4]:
train = train[train.work_add_lat.notnull() | train.home_add_lat.notnull()]
train['is_train'] = 1
test['mcc'] = test['mcc'].apply(lambda x: str(x).replace(',', '')).astype(np.int32)

all_data = pd.concat([train , test])

all_data['is_atm'] = all_data['atm_address_lat'].notnull().astype('int8')
all_data['is_pos'] = all_data['pos_address_lat'].notnull().astype('int8')

all_data['address_lat'] = all_data['atm_address_lat'].fillna(0) + all_data['pos_address_lat'].fillna(0)
all_data['address_lon'] = all_data['atm_address_lon'].fillna(0) + all_data['pos_address_lon'].fillna(0)
all_data['address'] = all_data['atm_address'].fillna('') + all_data['pos_address'].fillna('')

all_data.drop(['atm_address_lat', 'atm_address_lon', 'atm_address',
                  'pos_address_lat', 'pos_address_lon', 'pos_address'] , axis=1 , inplace=True)

In [5]:
good_terminal = all_data.groupby('terminal_id' , as_index=False)['address_lat','address_lon']
all_data = pd.merge(all_data , good_terminal.mean() , on='terminal_id' , how='left' , suffixes=('_old',''))
all_data = pd.merge(all_data , good_terminal.count()[['terminal_id', 'address_lat']] , on='terminal_id' , how='left' , suffixes=('','_count'))

all_data.drop(['address_lat_old' , 'address_lon_old'] , axis=1 , inplace=True)

### Target

In [6]:
train = all_data[all_data['is_train'].notnull()]
test = all_data[all_data['is_train'].isnull()]
train['target_home'] = (dist(train , 'home_add_lat' , 'address_lat' , 'home_add_lon' , 'address_lon') < 0.02).astype('int8')
train['target_work'] = (dist(train , 'work_add_lat' , 'address_lat' , 'work_add_lon' , 'address_lon') < 0.02).astype('int8')

### Features from address

In [7]:
russian_word = 'АБВГДЕЁЖЗИКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзиклмнопрстуфхцчшщъыьэюя'
number = '0123456789'
street = ['ul' , 'ул' , 'st' , 'pr-']
build = ['d' , 'д' , 'bld']

def count_number(row):
    res = 0
    for x in number:
        res += row.count(x)
    return res

def count_russian_letter(row):
    res = 0
    for x in russian_word:
        res += row.count(x)
    return res

def count_street_letter(row):
    res = 0
    for x in street:
        res += row.count(x)
    return res

def count_build_letter(row):
    res = 0
    for x in build:
        res += row.count(x)
    return res

def count_isupper(row):
    res = 0
    for x in row:
        res += x.isupper()
    return res

In [8]:
all_data = pd.concat([train , test])
all_data.drop('is_train' , axis=1 , inplace=True)
all_data['address'] = all_data['address'].astype(str)
all_data['len_address'] = all_data['address'].apply(len)
all_data['gap_count_address'] = all_data['address'].apply(lambda x: x.count(' '))
all_data['._count_address'] = all_data['address'].apply(lambda x: x.count('.'))
all_data[',_count_address'] = all_data['address'].apply(lambda x: x.count(','))
all_data['/_count_address'] = all_data['address'].apply(lambda x: x.count('/'))
all_data['\_count_address'] = all_data['address'].apply(lambda x: x.count('\''))
all_data['count_number_address'] = all_data['address'].apply(count_number)
all_data['count_russian_word_address'] = all_data['address'].apply(count_russian_letter)
all_data['count_isupper_address'] = all_data['address'].apply(count_isupper)
all_data['count_ul_in_address'] = all_data['address'].apply(count_street_letter)

### Features from data

In [9]:
all_data['transaction_date'] = pd.to_datetime(all_data['transaction_date'])
all_data['transaction_date'] = all_data[['transaction_date']].fillna(all_data['transaction_date'].min())

all_data['month'] = all_data['transaction_date'].dt.month.astype('int8')
all_data['week'] = all_data['transaction_date'].dt.week.astype('int8')
all_data['day'] = all_data['transaction_date'].dt.day.astype('int8')
all_data['day_of_week'] = all_data['transaction_date'].dt.dayofweek.astype('int8')
all_data['day_of_year'] = all_data['transaction_date'].dt.dayofyear.astype('int16')

### Distance to median

In [10]:
tmp = all_data.groupby(['customer_id'])['address_lat' , 'address_lon'].median().reset_index()
all_data = pd.merge(all_data , tmp , on=['customer_id'] , how='left' , suffixes=('' , '_median'))
all_data['dist_to_median'] = dist(all_data , 'address_lat' , 'address_lat_median',
                                             'address_lon' , 'address_lon_median')

In [11]:
count , diff = np.histogram(all_data.amount , 5)
all_data['amount_cat'] = 0
for i in range(len(diff) - 1):
    all_data.loc[(all_data['amount'] > diff[i]) & (all_data['amount'] < diff[i+1]) , 'amount_cat'] = i

### Distance to median group by categorical features

In [12]:
for col in tqdm_notebook(['month' , 'week' , 'day' , 'day_of_week' , 'mcc' , 'len_address' ,  'address_lat_count' , 
                         'amount_cat']):
    tmp = all_data.groupby(['customer_id' , col])['address_lat' , 'address_lon'].median().reset_index()
    all_data = pd.merge(all_data , tmp , on=['customer_id' , col] , how='left' , suffixes=('' , '_median_' + col))
    all_data['dist_to_' + col] = dist(all_data , 'address_lat' , 'address_lat_median_' + col,
                                                           'address_lon' , 'address_lon_median_' + col)




In [13]:
all_data['city_enc'] = all_data['city'].factorize()[0]
all_data['country'] = all_data['country'].factorize()[0]

In [14]:
train = all_data[all_data['target_home'].notnull()]
score = valid(train)
print (score.mean() , score.std())


0.37853677471259173 0.01056588179690113


In [15]:
test = all_data[all_data['target_home'].isnull()]

In [16]:
%%time

columns = get_columns(train)
    
model_home = LGBMClassifier(**model_params).fit(train[columns] , train['target_home'])
model_work = LGBMClassifier(**model_params).fit(train[columns] , train['target_work'])

model = {'home':model_home , 'work':model_work}

CPU times: user 4min 34s, sys: 2.05 s, total: 4min 36s
Wall time: 40.9 s


In [17]:
%%time
predict_test = pd.merge(predict(model['work'] , test , columns , 'work') , predict(model['home'] , test , columns , 'home') , on='customer_id' , how='left')
predict_test.columns = sample.columns
predict_test.replace({0 : np.nan} , inplace=True)
predict_test.fillna(predict_test.median() , inplace=True)
predict_test.to_csv('predict_good.csv' , index=False)

CPU times: user 36.9 s, sys: 2.36 s, total: 39.2 s
Wall time: 16.7 s
