In [1]:
import pandas as pd
import numpy as np
import re
from lightgbm import LGBMClassifier

import warnings
# Silence every Python warning for the remainder of the kernel session.
warnings.simplefilter("ignore")

# Allow up to 300 columns to be rendered before pandas elides the middle of a wide frame.
pd.set_option("display.max_columns", 300)

In [2]:
def predict(model , data , columns):
    """Pick, for every customer, the POS point most likely to be work and home.

    For each target in ('work', 'home') the matching classifier scores every
    row of ``data``; scores are reduced to the maximum per distinct
    (customer_id, pos_address_lat, pos_address_lon) point, and the
    highest-scoring point of each customer is kept (ties resolve to the first
    point in sorted key order).

    Parameters
    ----------
    model : mapping
        Must provide ``model['work']`` and ``model['home']``, each exposing
        ``predict_proba(X)`` that returns a 2-D array whose column 1 is the
        positive-class probability.
    data : pandas.DataFrame
        Must contain ``customer_id``, ``pos_address_lat``, ``pos_address_lon``
        and every name in ``columns``. The frame is copied, not modified.
    columns : list
        Feature columns passed to ``predict_proba``.

    Returns
    -------
    pandas.DataFrame
        One row per customer that has a best point for both targets, with
        columns ``customer_id``, ``work_predict_lat``, ``work_predict_lon``,
        ``home_predict_lat``, ``home_predict_lon``. Column dtypes are those of
        the corresponding input columns.
    """
    data = data.copy()
    key_cols = ['customer_id' , 'pos_address_lat' , 'pos_address_lon']
    predict_answer = {}
    for col in ['work' , 'home']:
        score_col = 'predict_' + col
        data[score_col] = model[col].predict_proba(data[columns])[:,1]

        # Best score per distinct point. as_index=False keeps the keys as ordinary
        # columns, so the winning rows can be selected by label below without
        # unpacking MultiIndex tuples (the old np.array round-trip turned the
        # coordinates into strings and needed the removed convert_objects()).
        point_scores = data.groupby(key_cols , as_index=False)[score_col].max()
        best_rows = point_scores.groupby('customer_id')[score_col].idxmax()

        best = point_scores.loc[best_rows.values , key_cols].reset_index(drop=True)
        best.columns = ['customer_id' , col + '_predict_lat' ,  col + '_predict_lon']
        predict_answer[col] = best

    return pd.merge(predict_answer['work'] , predict_answer['home'] , on='customer_id')

In [3]:
# The training extract spells the POS coordinate columns "adress"; rename them to the
# "pos_address_*" names that the rest of the notebook uses.
train = pd.read_csv("data/train_set.csv.gz", compression="gzip")
train = train.rename(
    columns={
        "pos_adress_lat": "pos_address_lat",
        "pos_adress_lon": "pos_address_lon",
    }
)

# Test transactions are loaded as-is (no column rename is applied to them).
test = pd.read_csv("data/test_set.csv.gz", compression="gzip")

# Submission template; its columns define the layout of the final output file.
sample = pd.read_csv('data/sample.csv')

In [4]:
def clean_mcc(mcc):
    """Normalise an MCC value to a plain Python ``int``.

    Parameters
    ----------
    mcc : int, numpy integer, or str
        Either an integer code, or its string form, optionally written with
        comma-separated groups (e.g. ``'5,411'``).

    Returns
    -------
    int
        The integer code. Comma-separated groups are folded left to right as
        ``1000 * previous + group``, so ``'5,411'`` gives 5411 and
        ``'1,234,567'`` gives 1234567.

    Raises
    ------
    ValueError
        If a group is not parseable by ``int()``.
    AttributeError
        If ``mcc`` is neither an integer nor a string-like object with ``.split``.
    """
    # isinstance (rather than an exact type check) also accepts NumPy integer
    # scalars, which would otherwise fall through to .split() and crash.
    if isinstance(mcc, (int, np.integer)):
        return int(mcc)

    # Fold every group, not just the first two, so longer values are not truncated.
    value = 0
    for group in mcc.split(','):
        value = 1000 * value + int(group)
    return value
# Replace the test-set MCC column with its element-wise cleaned integer form.
test['mcc'] = test['mcc'].map(clean_mcc)

In [5]:
### Target
# A transaction is a positive example for a place (home / work) when its POS point lies
# within 0.02 of that place's coordinates, measured as plain Euclidean distance on the
# raw lat/lon values. Rows with a missing coordinate compare False and are labelled 0.
for place in ('home', 'work'):
    lat_gap = train[place + '_add_lat'] - train['pos_address_lat']
    lon_gap = train[place + '_add_lon'] - train['pos_address_lon']
    train['target_' + place] = (np.sqrt(lat_gap ** 2 + lon_gap ** 2) < 0.02).astype('int8')

In [6]:
# Transaction columns used as model features, both for fitting and for prediction.
good_col = [
    'amount',
    'atm_address_lat',
    'atm_address_lon',
    'currency',
    'mcc',
    'pos_address_lat',
    'pos_address_lon',
]

In [7]:
%%time
# Fit one LightGBM classifier per target on the same feature columns, home first and
# then work. fit() returns the estimator itself, so construction and fitting are chained.
model_home = LGBMClassifier(n_jobs=-1).fit(train[good_col], train['target_home'])
model_work = LGBMClassifier(n_jobs=-1).fit(train[good_col], train['target_work'])

# Lookup by place name; predict() indexes this mapping with 'work' and 'home'.
model = dict(home=model_home, work=model_work)

CPU times: user 50.2 s, sys: 237 ms, total: 50.4 s
Wall time: 6.78 s


In [8]:
# Left-join the predictions onto the submission template so every template row is kept,
# then drop the template's own non-key columns, leaving the key plus the predicted
# coordinates. The join key is whatever column name the two frames share
# (presumably customer_id -- confirm against data/sample.csv).
predict_test = sample.merge(predict(model , test , good_col) , how='left').drop(sample.columns[1:] , axis=1)

# Template rows with no prediction get the column-wise median coordinate.
# numeric_only=True keeps the id column out of the median: on pandas >= 2.0 the default
# is False, and a non-numeric id column would make median() raise TypeError.
predict_test = predict_test.fillna(predict_test.median(numeric_only=True))

# Restore the template's column names and write the submission file.
predict_test.columns = sample.columns
predict_test.to_csv('first.csv' , index=False)